Project

General

Profile

Bug #15121 » preg_t3lib_parsehtml.diff

Administrator Admin, 2005-10-27 00:46

View differences:

TYPO3core.pregapl/t3lib/class.t3lib_parsehtml.php 2005-10-24 21:37:41.223514816 +0200
* @package TYPO3
* @subpackage t3lib
*/
class t3lib_parsehtml {
class t3lib_parsehtml {
var $caseShift_cache=array();
......
* @return string
*/
function getSubpart($content, $marker) {
    // Returns the text enclosed by the first two occurrences of $marker in $content.
    // If a marker sits inside an HTML comment (<!-- ###MARKER### begin -->), the
    // remainder of the opening comment and the start of the closing comment are
    // stripped from the result as well.
    //
    // Returns an empty string when $marker is empty, when it does not occur, or
    // when no second (closing) occurrence exists.

    // An empty needle is a warning on old PHP and matches at offset 0 on PHP 8.
    if ((string)$marker === '') {
        return '';
    }

    $start = strpos($content, $marker);
    if ($start === false) {
        return '';
    }
    $start += strlen($marker);

    $stop = strpos($content, $marker, $start);
    if ($stop === false) {
        return '';
    }

    $content = (string)substr($content, $start, $stop - $start);

    // Group 1: tail of the comment holding the start marker ("... -->").
    // Group 2: the actual subpart.
    // Group 3: head of the comment holding the stop marker ("<!-- ...").
    // The D modifier makes "$" match only at the very end; without it the lazy
    // group 2 stops in front of a trailing newline and silently drops it.
    $matches = array();
    if (preg_match('/^([^\<\>]*\-\-\>)?(.*?)(\<\!\-\-[^\<\>]*)?$/sD', $content, $matches) === 1) {
        return $matches[2];
    }
    return $content;
}
/**
......
*/
function substituteSubpart($content,$marker,$subpartContent,$recursive=1,$keepMarker=0) {
    // Replaces the subpart delimited by two occurrences of $marker in $content.
    //
    // $subpartContent: a string replaces the subpart; an array wraps the existing
    //                  subpart as $subpartContent[0] . subpart . $subpartContent[1].
    // $recursive:      if set, further marker pairs after the first one are
    //                  processed the same way.
    // $keepMarker:     if set, the markers (and the HTML comments around them)
    //                  are kept in the output; otherwise they are removed.
    //
    // Returns $content unchanged when $marker is empty or a complete marker pair
    // is not found.

    // An empty needle matches at offset 0 on PHP 8, which would recurse forever.
    if ((string)$marker === '') {
        return $content;
    }

    $start = strpos($content, $marker);
    if ($start === false) {
        return $content;
    }
    $startAM = $start + strlen($marker);    // offset just after the start marker

    $stop = strpos($content, $marker, $startAM);
    if ($stop === false) {
        return $content;
    }
    $stopAM = $stop + strlen($marker);      // offset just after the stop marker

    $before = (string)substr($content, 0, $start);
    $after = (string)substr($content, $stopAM);
    $between = (string)substr($content, $startAM, $stop - $startAM);

    if ($recursive) {
        $after = $this->substituteSubpart($after, $marker, $subpartContent, $recursive, $keepMarker);
    }

    // Group 1: tail of the opening comment, group 2: subpart, group 3: head of the
    // closing comment. D anchors "$" at the very end so a trailing newline in the
    // subpart is not dropped by the lazy group 2.
    $subpartRegex = '/^([^\<\>]*\-\-\>)?(.*?)(\<\!\-\-[^\<\>]*)?$/sD';
    $matches = array();

    if ($keepMarker) {
        if (preg_match($subpartRegex, $between, $matches) === 1) {
            $before .= $marker.$matches[1];
            $between = $matches[2];
            // PCRE omits trailing groups that did not take part in the match.
            $after = (isset($matches[3]) ? $matches[3] : '').$marker.$after;
        } else {
            // Keep both markers around the subpart (the stop marker belongs in front of $after).
            $before .= $marker;
            $after = $marker.$after;
        }
    } else {
        // Drop the opening comment in front of the start marker.
        if (preg_match('/^(.*)\<\!\-\-[^\<\>]*$/s', $before, $matches) === 1) {
            $before = $matches[1];
        }
        // When wrapping, strip the comment remains from the subpart itself.
        if (is_array($subpartContent)) {
            if (preg_match($subpartRegex, $between, $matches) === 1) {
                $between = $matches[2];
            }
        }
        // Drop the rest of the closing comment behind the stop marker.
        if (preg_match('/^[^\<\>]*\-\-\>(.*)$/s', $after, $matches) === 1) {
            $after = $matches[1];
        }
    }

    if (is_array($subpartContent)) {
        $between = $subpartContent[0].$between.$subpartContent[1];
    } else {
        $between = $subpartContent;
    }

    return $before.$between.$after;
}
// *******************************************'
// COPY FROM class.tslib_content.php: / END
// *******************************************'
......
*/
function splitIntoBlock($tag,$content,$eliminateExtraEndTags=0) {
$tags=array_unique(t3lib_div::trimExplode(',',$tag,1));
$regexStr = '</?('.implode('|',$tags).')(>|[[:space:]][^>]*>)';
$regexStr = '/\<\/?('.implode('|', $tags).')(\s*\>|\s[^\>]*\>)/si';
$parts = spliti($regexStr,$content);
$parts = preg_split($regexStr, $content);
$newParts=array();
$pointer=strlen($parts[0]);
......
*/
function splitTags($tag,$content) {
$tags = t3lib_div::trimExplode(',',$tag,1);
$regexStr = '<('.implode('|',$tags).')(>|\/>|[[:space:]][^>]*>)';
$parts = spliti($regexStr,$content);
$regexStr = '/\<('.implode('|', $tags).')(\s[^>]*)?\/?>/si';
$parts = preg_split($regexStr, $content);
$pointer = strlen($parts[0]);
$newParts = array();
......
* @see splitIntoBlock(), splitTags()
*/
function getAllParts($parts,$tag_parts=1,$include_tag=1) {
reset($parts);
$newParts=array();
while(list($k,$v)=each($parts)) {
foreach ($parts as $k => $v) {
if (($k+($tag_parts?0:1))%2) {
if (!$include_tag) $v=$this->removeFirstAndLastTag($v);
$newParts[]=$v;
......
/**
 * Removes the first and last tag in the string
 * Anything before the first and after the last tags respectively is also removed
 *
 * @param string String to process
 * @return string The content between the end of the first tag and the start of the last tag; empty string if there is no such span
 */
function removeFirstAndLastTag($str) {
    $str = (string)$str;
    // End of first tag:
    $start = strpos($str, '>');
    // Begin of last tag:
    $end = strrpos($str, '<');
    // Without a closing ">" followed later by an opening "<" there is nothing in between.
    if ($start === false || $end === false || $end <= $start) {
        return '';
    }
    return (string)substr($str, $start + 1, $end - $start - 1);
}
/**
......
*/
function getFirstTag($str) {
    // Returns everything from the beginning of $str up to and including the
    // first ">". Leading text in front of the tag is therefore part of the result.
    // When $str contains no ">" at all, the whole string is returned.
    $str = (string)$str;
    $endPos = strpos($str, '>');
    if ($endPos === false) {
        // strpos()+1 would evaluate to 1 here and return just the first character.
        return $str;
    }
    return substr($str, 0, $endPos + 1);
}
/**
......
* @see getFirstTag()
*/
function getFirstTagName($str,$preserveCase=FALSE) {
    // Returns the name of the tag that $str starts with (leading whitespace is
    // allowed). The name is uppercased unless $preserveCase is set.
    // Returns an empty string when $str does not start with a tag.
    $matches = array();
    // The name is everything after "<" up to the first whitespace or ">".
    if (preg_match('/^\s*\<([^\s\>]+)(\s|\>)/', $str, $matches) !== 1) {
        return '';
    }
    return $preserveCase ? $matches[1] : strtoupper($matches[1]);
}
/**
......
function get_tag_attributes($tag,$deHSC=0) {
list($components,$metaC) = $this->split_tag_attributes($tag);
$name = ''; // attribute name is stored here
$valuemode = '';
$valuemode = false;
$attributes = array();
$attributesMeta = array();
if (is_array($components)) {
while (list($key,$val) = each ($components)) {
foreach ($components as $key => $val) {
if ($val != '=') { // Only if $name is set (if there is an attribute, that waits for a value), that valuemode is enabled. This ensures that the attribute is assigned it's value
if ($valuemode) {
if ($name) {
......
$name = '';
}
} else {
if ($namekey = ereg_replace('[^a-zA-Z0-9_:-]','',$val)) {
if ($namekey = preg_replace('/[^[:alnum:]_\:\-]/','',$val)) {
$name = strtolower($namekey);
$attributesMeta[$name]=array();
$attributesMeta[$name]['origTag']=$namekey;
$attributes[$name] = '';
}
}
$valuemode = '';
$valuemode = false;
} else {
$valuemode = 'on';
$valuemode = true;
}
}
if (is_array($attributes)) reset($attributes);
return array($attributes,$attributesMeta);
}
}
......
* @see t3lib_div::split_tag_attributes()
*/
function split_tag_attributes($tag) {
    // Splits the attribute part of a tag into its components.
    //
    // $tag may be a whole tag ('<a href="x">') or just its attribute string.
    // Returns array($value, $metaValue):
    //   $value     - list of tokens: attribute names, "=" signs and values
    //                (quotes around values removed)
    //   $metaValue - parallel list holding the quote character a token was
    //                wrapped in, or '' for unquoted tokens

    // Remove the leading "<tagname" (everything up to the first whitespace) ...
    $tag_tmp = trim((string)preg_replace('/^\<[^\s]*/', '', trim((string)$tag)));
    // ... and any ">" at the end of the string. Anchoring at the start keeps an
    // attribute-less tag like "<br>" from being reported as a value.
    $tag_tmp = trim((string)preg_replace('/\>$/D', '', $tag_tmp));

    $value = array();
    $metaValue = array();
    $matches = array();

    // Tokens are: double-quoted string, single-quoted string, bare word, or "=".
    if (preg_match_all('/("[^"]*"|\'[^\']*\'|[^\s"\'\=]+|\=)/s', $tag_tmp, $matches) > 0) {
        foreach ($matches[1] as $part) {
            $firstChar = substr($part, 0, 1);
            if ($firstChar == '"' || $firstChar == "'") {
                $metaValue[] = $firstChar;
                // Cast: for an empty quoted value ('""') substr() returns FALSE before PHP 8.
                $value[] = (string)substr($part, 1, -1);
            } else {
                $metaValue[] = '';
                $value[] = $part;
            }
        }
    }

    return array($value, $metaValue);
}
......
// Block tags, must have endings...
$blockTags = explode(',',$blockTags);
foreach($blockTags as $tagName) {
$countBegin = count(split('<'.$tagName.'[^[:alnum:]]',$content))-1;
$countEnd = count(split('<\/'.$tagName.'[^[:alnum:]]',$content))-1;
$countBegin = count(preg_split('/\<'.$tagName.'(\s|\>)/s',$content))-1;
$countEnd = count(preg_split('/\<\/'.$tagName.'(\s|\>)/s',$content))-1;
$analyzedOutput['blocks'][$tagName]=array($countBegin,$countEnd,$countBegin-$countEnd);
if ($countBegin) $analyzedOutput['counts'][$tagName]=$countBegin;
if ($countBegin-$countEnd) {
......
// Solo tags, must NOT have endings...
$soloTags = explode(',',$soloTags);
foreach($soloTags as $tagName) {
$countBegin = count(split('<'.$tagName.'[^[:alnum:]]',$content))-1;
$countEnd = count(split('<\/'.$tagName.'[^[:alnum:]]',$content))-1;
$countBegin = count(preg_split('/\<'.$tagName.'(\s|\>)/s',$content))-1;
$countEnd = count(preg_split('/\<\/'.$tagName.'(\s|\>)/s',$content))-1;
$analyzedOutput['solo'][$tagName]=array($countBegin,$countEnd);
if ($countBegin) $analyzedOutput['counts'][$tagName]=$countBegin;
if ($countEnd) {
......
while(list(,$tok)=each($tokArr)) {
$firstChar = substr($tok,0,1);
# if (strcmp(trim($firstChar),'')) { // It is a tag...
if (ereg('[[:alnum:]\/]',$firstChar)) { // It is a tag... (first char is a-z0-9 or /) (fixed 19/01 2004). This also avoids triggering on <?xml..> and <!DOCTYPE..>
$tagEnd = strcspn($tok,'>');
if (strlen($tok)!=$tagEnd) { // If there is and end-bracket...
if (preg_match('/[[:alnum:]\/]/',$firstChar)==1) { // It is a tag... (first char is a-z0-9 or /) (fixed 19/01 2004). This also avoids triggering on <?xml..> and <!DOCTYPE..>
$tagEnd = strpos($tok,'>');
if ($tagEnd) { // If there is and end-bracket... tagEnd can't be 0 as the first character can't be a >
$endTag = $firstChar=='/' ? 1 : 0;
$tagContent = substr($tok,$endTag,$tagEnd-$endTag);
$tagParts = split('[[:space:]]',$tagContent,2);
$tagParts = preg_split('/\s+/s',$tagContent,2);
$tagName = strtolower($tagParts[0]);
if (isset($tags[$tagName])) {
if (is_array($tags[$tagName])) { // If there is processing to do for the tag:
......
$tagAttrib = $this->get_tag_attributes($tagParts[1]);
$tagParts[1]='';
$newTagAttrib = array();
$tList = t3lib_div::trimExplode(',',strtolower($tags[$tagName]['allowedAttribs']),1);
while(list(,$allowTag)=each($tList)) {
if (!($tList = $tags[$tagName]['_allowedAttribs'])) {
// Just explode attribts for tag once
$tList = $tags[$tagName]['_allowedAttribs'] = t3lib_div::trimExplode(',',strtolower($tags[$tagName]['allowedAttribs']),1);
}
foreach ($tList as $allowTag) {
if (isset($tagAttrib[0][$allowTag])) $newTagAttrib[$allowTag]=$tagAttrib[0][$allowTag];
}
$tagParts[1]=$this->compileTagAttribs($newTagAttrib,$tagAttrib[1]);
......
}
// Unsetting tags:
reset($tagRegister);
while(list($tag,$positions)=each($tagRegister)) {
reset($positions);
while(list(,$pKey)=each($positions)) {
foreach ($tagRegister as $tag => $positions) {
foreach ($positions as $pKey) {
unset($newContent[$pKey]);
}
}
......
function prefixResourcePath($main_prefix,$content,$alternatives=array(),$suffix='') {
$parts = $this->splitTags('embed,td,table,body,img,input,form,link,script,a',$content);
foreach($parts as $k => $v) {
foreach ($parts as $k => $v) {
if ($k%2) {
$params = $this->get_tag_attributes($v,1);
$tagEnd = substr($v,-2)=='/>' ? ' />' : '>'; // Detect tag-ending so that it is re-applied correctly.
......
break;
}
if ($somethingDone) {
$tagParts = split('[[:space:]]',$v,2);
$tagParts = preg_split('/\s+/s',$v,2);
$tagParts[1]=$this->compileTagAttribs($params[0],$params[1]);
$parts[$k] = '<'.trim(strtolower($firstTagName).' '.$tagParts[1]).
$tagEnd;
$parts[$k] = '<'.trim(strtolower($firstTagName).' '.$tagParts[1]).$tagEnd;
}
}
}
......
*/
function cleanFontTags($value,$keepFace=0,$keepSize=0,$keepColor=0) {
$fontSplit = $this->splitIntoBlock('font',$value); // ,1 ?? - could probably be more stable if splitTags() was used since this depends on end-tags being properly set!
reset($fontSplit);
while(list($k,$v)=each($fontSplit)) {
foreach ($fontSplit as $k => $v) {
if ($k%2) { // font:
$attribArray=$this->get_tag_attributes_classic($this->getFirstTag($v));
$newAttribs=array();
......
function mapTags($value,$tags=array(),$ltChar='<',$ltChar2='<') {
    // Renames tags in $value. $tags maps source tag name => target tag name;
    // matching of the source name is case-insensitive.
    //
    // $ltChar is the character(s) that open a tag in the input, $ltChar2 the
    // character(s) written in its place in the output.
    // Attributes of start tags are preserved; anything inside an end tag after
    // the name is dropped.
    //
    // Tag names and $ltChar are matched literally (they are quoted for PCRE).

    $open = preg_quote($ltChar, '/');
    // "$" and "\" are special in preg_replace() replacement strings.
    $openNew = addcslashes($ltChar2, '\\$');

    foreach ($tags as $from => $to) {
        $name = preg_quote((string)$from, '/');
        $newName = addcslashes((string)$to, '\\$');

        // <from>
        $value = preg_replace('/'.$open.$name.'>/i', $openNew.$newName.'>', $value);
        // <from attr="...">  (the backreference follows a space, so a target
        // name starting with a digit cannot be mistaken for part of it)
        $value = preg_replace('/'.$open.$name.'\s([^>]*)>/i', $openNew.$newName.' $1>', $value);
        // </from>
        $value = preg_replace('/'.$open.'\/'.$name.'[^>]*>/i', $openNew.'/'.$newName.'>', $value);
    }
    return $value;
}
......
if (strlen($tok)!=$tagEnd) {
$endTag = $firstChar=='/' ? 1 : 0;
$tagContent = substr($tok,$endTag,$tagEnd-$endTag);
$tagParts = split('[[:space:]]',$tagContent,2);
$tagParts = preg_split('/\s+/s',$tagContent,2);
$tagName = strtolower($tagParts[0]);
if (!strcmp($tagList,'') || in_array($tagName,$tagsArray)) {
$contentParts[$k] = '<'.$subparts[0].'>'.$subparts[1];
......
$tags=t3lib_div::trimExplode(',',$tagList,1);
$forthArr=array();
$backArr=array();
while(list(,$theTag)=each($tags)) {
foreach ($tags as $theTag) {
$forthArr[$theTag]=md5($theTag);
$backArr[md5($theTag)]=$theTag;
}
$value = $this->mapTags($value,$forthArr,'<','_');
$value=strip_tags($value);
$value = $this->mapTags($value,$backArr,'_','<');
$value = $this->mapTags($value,$forthArr,'<','_');
$value=strip_tags($value);
$value = $this->mapTags($value,$backArr,'_','<');
return $value;
}
......
* Internal function for case shifting of a string or whole array
*
* @param mixed Input string/array
* @param boolean If $str is a string AND this boolean is true, the string is returned in uppercase
* @param boolean If $str is a string AND this boolean(caseSensitive) is false, the string is returned in uppercase
* @param string Key string used for internal caching of the results. Could be an MD5 hash of the serialized version of the input $str if that is an array.
* @return string Output string, processed
* @access private
*/
function caseShift($str,$flag,$cacheKey='') {
    // Case shifting of a string or of all values of an array.
    //
    // $str:      input string or array
    // $flag:     "case sensitive" switch - if false, the string (or every array
    //            value) is converted to uppercase; if true, the input is
    //            returned as is
    // $cacheKey: optional key under which an array result is stored in
    //            $this->caseShift_cache and fetched from on later calls
    //
    // Returns the processed string/array.

    // Decide on caching BEFORE the flag digit is appended; afterwards the key is
    // never empty and an empty key with a true flag would be cached under "1".
    $useCache = ((string)$cacheKey !== '');
    $cacheKey .= $flag?1:0;

    if (is_array($str)) {
        if ($useCache && isset($this->caseShift_cache[$cacheKey])) {
            return $this->caseShift_cache[$cacheKey];
        }
        if (!$flag) {
            foreach ($str as $k => $v) {
                $str[$k] = strtoupper($v);
            }
        }
        if ($useCache) {
            $this->caseShift_cache[$cacheKey] = $str;
        }
    } elseif (!$flag) {
        $str = strtoupper($str);
    }
    return $str;
}
......
*/
function compileTagAttribs($tagAttrib,$meta=array(), $xhtmlClean=0) {
$accu=array();
reset($tagAttrib);
while(list($k,$v)=each($tagAttrib)) {
foreach ($tagAttrib as $k =>$v) {
if ($xhtmlClean) {
$attr=strtolower($k);
if (strcmp($v,'') || isset($meta[$k]['dashType'])) {
......
function indentLines($content, $number=1, $indentChar="\t") {
$preTab = str_pad('', $number*strlen($indentChar), $indentChar);
$lines = explode(chr(10),str_replace(chr(13),'',$content));
while(list($k,$v) = each($lines)) {
foreach ($lines as $k => $v) {
$lines[$k] = $preTab.$v;
}
return implode(chr(10), $lines);
......
}
reset($TSconfig['tags.']);
while(list($key,$tagC)=each($TSconfig['tags.'])) {
foreach ($TSconfig['tags.'] as $key => $tagC) {
if (is_array($tagC) && $key==strtolower($key)) {
$key=substr($key,0,-1);
if (!is_array($keepTags[$key])) $keepTags[$key]=array();
......
if ($conf['xhtml']) {
if ($endTag) { // Endtags are just set lowercase right away
$value = strtolower($value);
} elseif (substr($value,0,2)!='<!') { // ... and comments are ignored.
} elseif (substr($value,0,4)!='<!--') { // ... and comments are ignored.
$inValue = substr($value,1,(substr($value,-2)=='/>'?-2:-1)); // Finding inner value with out < >
list($tagName,$tagP)=split('[[:space:]]',$inValue,2); // Separate attributes and tagname
list($tagName,$tagP)=preg_split('/\s+/s',$inValue,2); // Separate attributes and tagname
$tagName = strtolower($tagName);
// Process attributes
(1-1/2)