|
<?php
|
|
/***************************************************************
|
|
* Copyright notice
|
|
*
|
|
* (c) 1999-2006 Kasper Skaarhoj (kasperYYYY@typo3.com)
|
|
* All rights reserved
|
|
*
|
|
* This script is part of the TYPO3 project. The TYPO3 project is
|
|
* free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* The GNU General Public License can be found at
|
|
* http://www.gnu.org/copyleft/gpl.html.
|
|
* A copy is found in the textfile GPL.txt and important notices to the license
|
|
* from the author is found in LICENSE.txt distributed with these scripts.
|
|
*
|
|
*
|
|
* This script is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* This copyright notice MUST APPEAR in all copies of the script!
|
|
***************************************************************/
|
|
/**
|
|
* Contains class with functions for parsing HTML code.
|
|
*
|
|
* $Id: class.t3lib_parsehtml.php,v 1.25.2.4 2006/03/22 01:11:00 typo3 Exp $
|
|
* Revised for TYPO3 3.6 July/2003 by Kasper Skaarhoj
|
|
*
|
|
* @author Kasper Skaarhoj <kasperYYYY@typo3.com>
|
|
*/
|
|
/**
|
|
* [CLASS/FUNCTION INDEX of SCRIPT]
|
|
*
|
|
*
|
|
*
|
|
* 106: class t3lib_parsehtml
|
|
* 123: function getSubpart($content, $marker)
|
|
* 156: function substituteSubpart($content,$marker,$subpartContent,$recursive=1,$keepMarker=0)
|
|
*
|
|
* SECTION: Parsing HTML code
|
|
* 247: function splitIntoBlock($tag,$content,$eliminateExtraEndTags=0)
|
|
* 308: function splitIntoBlockRecursiveProc($tag,$content,&$procObj,$callBackContent,$callBackTags,$level=0)
|
|
* 344: function splitTags($tag,$content)
|
|
* 378: function getAllParts($parts,$tag_parts=1,$include_tag=1)
|
|
* 396: function removeFirstAndLastTag($str)
|
|
* 412: function getFirstTag($str)
|
|
* 426: function getFirstTagName($str,$preserveCase=FALSE)
|
|
* 445: function get_tag_attributes($tag,$deHSC=0)
|
|
* 486: function split_tag_attributes($tag)
|
|
* 524: function checkTagTypeCounts($content,$blockTags='a,b,blockquote,body,div,em,font,form,h1,h2,h3,h4,h5,h6,i,li,map,ol,option,p,pre,select,span,strong,table,td,textarea,tr,u,ul', $soloTags='br,hr,img,input,area')
|
|
*
|
|
* SECTION: Clean HTML code
|
|
* 617: function HTMLcleaner($content, $tags=array(),$keepAll=0,$hSC=0,$addConfig=array())
|
|
* 814: function bidir_htmlspecialchars($value,$dir)
|
|
* 837: function prefixResourcePath($main_prefix,$content,$alternatives=array(),$suffix='')
|
|
* 919: function prefixRelPath($prefix,$srcVal,$suffix='')
|
|
* 937: function cleanFontTags($value,$keepFace=0,$keepSize=0,$keepColor=0)
|
|
* 967: function mapTags($value,$tags=array(),$ltChar='<',$ltChar2='<')
|
|
* 982: function unprotectTags($content,$tagList='')
|
|
* 1015: function stripTagsExcept($value,$tagList)
|
|
* 1038: function caseShift($str,$flag,$cacheKey='')
|
|
* 1065: function compileTagAttribs($tagAttrib,$meta=array(), $xhtmlClean=0)
|
|
* 1093: function get_tag_attributes_classic($tag,$deHSC=0)
|
|
* 1106: function indentLines($content, $number=1, $indentChar="\t")
|
|
* 1123: function HTMLparserConfig($TSconfig,$keepTags=array())
|
|
* 1247: function XHTML_clean($content)
|
|
* 1269: function processTag($value,$conf,$endTag,$protected=0)
|
|
* 1315: function processContent($value,$dir,$conf)
|
|
*
|
|
* TOTAL FUNCTIONS: 28
|
|
* (This index is automatically created/updated by the extension "extdeveval")
|
|
*
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
* Functions for parsing HTML.
|
|
* You are encouraged to use this class in your own applications
|
|
*
|
|
* @author Kasper Skaarhoj <kasperYYYY@typo3.com>
|
|
* @package TYPO3
|
|
* @subpackage t3lib
|
|
*/
|
|
class t3lib_parsehtml {
|
|
var $caseShift_cache=array();
|
|
|
|
|
|
// *******************************************'
|
|
// COPY FROM class.tslib_content.php: / BEGIN
|
|
// substituteSubpart
|
|
// Cleaned locally 2/2003 !!!! (so different from tslib_content version)
|
|
// *******************************************'
|
|
|
|
/**
|
|
* Returns the first subpart encapsulated in the marker, $marker (possibly present in $content as a HTML comment)
|
|
*
|
|
* @param string Content with subpart wrapped in fx. "###CONTENT_PART###" inside.
|
|
* @param string Marker string, eg. "###CONTENT_PART###"
|
|
* @return string
|
|
*/
|
|
function getSubpart($content, $marker) {
|
|
$start = strpos($content, $marker);
|
|
if ($start===false) { return ''; }
|
|
$start += strlen($marker);
|
|
$stop = strpos($content, $marker, $start);
|
|
// Q: What shall get returned if no stop marker is given /*everything till the end*/ or nothing
|
|
if ($stop===false) { return /*substr($content, $start)*/ ''; }
|
|
$content = substr($content, $start, $stop-$start);
|
|
$matches = array();
|
|
if (preg_match('/^([^\<]*\-\-\>)(.*)(\<\!\-\-[^\>]*)$/s', $content, $matches)===1) {
|
|
return $matches[2];
|
|
}
|
|
$matches = array();
|
|
if (preg_match('/(.*)(\<\!\-\-[^\>]*)$/s', $content, $matches)===1) {
|
|
return $matches[1];
|
|
}
|
|
$matches = array();
|
|
if (preg_match('/^([^\<]*\-\-\>)(.*)$/s', $content, $matches)===1) {
|
|
return $matches[2];
|
|
}
|
|
return $content;
|
|
}
|
|
|
|
/**
|
|
* Substitutes a subpart in $content with the content of $subpartContent.
|
|
*
|
|
* @param string Content with subpart wrapped in fx. "###CONTENT_PART###" inside.
|
|
* @param string Marker string, eg. "###CONTENT_PART###"
|
|
* @param array If $subpartContent happens to be an array, it's [0] and [1] elements are wrapped around the content of the subpart (fetched by getSubpart())
|
|
* @param boolean If $recursive is set, the function calls itself with the content set to the remaining part of the content after the second marker. This means that proceding subparts are ALSO substituted!
|
|
* @param boolean If set, the marker around the subpart is not removed, but kept in the output
|
|
* @return string Processed input content
|
|
*/
|
|
function substituteSubpart($content,$marker,$subpartContent,$recursive=1,$keepMarker=0) {
|
|
$start = strpos($content, $marker);
|
|
if ($start===false) { return $content; }
|
|
$startAM = $start+strlen($marker);
|
|
$stop = strpos($content, $marker, $startAM);
|
|
if ($stop===false) { return $content; }
|
|
$stopAM = $stop+strlen($marker);
|
|
$before = substr($content, 0, $start);
|
|
$after = substr($content, $stopAM);
|
|
$between = substr($content, $startAM, $stop-$startAM);
|
|
|
|
if ($recursive) {
|
|
$after = t3lib_parsehtml::substituteSubpart($after, $marker, $subpartContent, $recursive, $keepMarker);
|
|
}
|
|
|
|
if ($keepMarker) {
|
|
$matches = array();
|
|
if (preg_match('/^([^\<]*\-\-\>)(.*)(\<\!\-\-[^\>]*)$/s', $between, $matches)===1) {
|
|
$before .= $marker.$matches[1];
|
|
$between = $matches[2];
|
|
$after = $matches[3].$marker.$after;
|
|
} elseif (preg_match('/^(.*)(\<\!\-\-[^\>]*)$/s', $between, $matches)===1) {
|
|
$before .= $marker;
|
|
$between = $matches[1];
|
|
$after = $matches[2].$marker.$after;
|
|
} elseif (preg_match('/^([^\<]*\-\-\>)(.*)$/s', $between, $matches)===1) {
|
|
$before .= $marker.$matches[1];
|
|
$between = $matches[2];
|
|
$after = $marker.$after;
|
|
} else {
|
|
$before .= $marker;
|
|
$after = $marker.$after;
|
|
}
|
|
} else {
|
|
$matches = array();
|
|
if (preg_match('/^(.*)\<\!\-\-[^\>]*$/s', $before, $matches)===1) {
|
|
$before = $matches[1];
|
|
}
|
|
if (is_array($subpartContent)) {
|
|
$matches = array();
|
|
if (preg_match('/^([^\<]*\-\-\>)(.*)(\<\!\-\-[^\>]*)$/s', $between, $matches)===1) {
|
|
$between = $matches[2];
|
|
} elseif (preg_match('/^(.*)(\<\!\-\-[^\>]*)$/s', $between, $matches)===1) {
|
|
$between = $matches[1];
|
|
} elseif (preg_match('/^([^\<]*\-\-\>)(.*)$/s', $between, $matches)===1) {
|
|
$between = $matches[2];
|
|
}
|
|
}
|
|
$matches = array();
|
|
if (preg_match('/^[^\<]*\-\-\>(.*)$/s', $after, $matches)===1) {
|
|
$after = $matches[1];
|
|
}
|
|
}
|
|
|
|
if (is_array($subpartContent)) {
|
|
$between = $subpartContent[0].$between.$subpartContent[1];
|
|
} else {
|
|
$between = $subpartContent;
|
|
}
|
|
|
|
return $before.$between.$after;
|
|
}
|
|
|
|
|
|
// *******************************************'
|
|
// COPY FROM class.tslib_content.php: / END
|
|
// *******************************************'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/************************************
|
|
*
|
|
* Parsing HTML code
|
|
*
|
|
************************************/
|
|
|
|
/**
|
|
* Returns an array with the $content divided by tag-blocks specified with the list of tags, $tag
|
|
* Even numbers in the array are outside the blocks, Odd numbers are block-content.
|
|
* Use ->getAllParts() and ->removeFirstAndLastTag() to process the content if needed.
|
|
*
|
|
* @param string List of tags, comma separated.
|
|
* @param string HTML-content
|
|
* @param boolean If set, excessive end tags are ignored - you should probably set this in most cases.
|
|
* @return array Even numbers in the array are outside the blocks, Odd numbers are block-content.
|
|
* @see splitTags(), getAllParts(), removeFirstAndLastTag()
|
|
*/
|
|
function splitIntoBlock($tag,$content,$eliminateExtraEndTags=0) {
|
|
$tags=array_unique(t3lib_div::trimExplode(',',$tag,1));
|
|
$regexStr = '/\<\/?('.implode('|', $tags).')(\s*\>|\s[^\>]*\>)/si';
|
|
|
|
$parts = preg_split($regexStr, $content);
|
|
|
|
$newParts=array();
|
|
$pointer=strlen($parts[0]);
|
|
$buffer=$parts[0];
|
|
$nested=0;
|
|
reset($parts);
|
|
next($parts);
|
|
while(list($k,$v)=each($parts)) {
|
|
$isEndTag= substr($content,$pointer,2)=='</' ? 1 : 0;
|
|
$tagLen = strcspn(substr($content,$pointer),'>')+1;
|
|
|
|
if (!$isEndTag) { // We meet a start-tag:
|
|
if (!$nested) { // Ground level:
|
|
$newParts[]=$buffer; // previous buffer stored
|
|
$buffer='';
|
|
}
|
|
$nested++; // We are inside now!
|
|
$mbuffer=substr($content,$pointer,strlen($v)+$tagLen); // New buffer set and pointer increased
|
|
$pointer+=strlen($mbuffer);
|
|
$buffer.=$mbuffer;
|
|
} else { // If we meet an endtag:
|
|
$nested--; // decrease nested-level
|
|
$eliminated=0;
|
|
if ($eliminateExtraEndTags && $nested<0) {
|
|
$nested=0;
|
|
$eliminated=1;
|
|
} else {
|
|
$buffer.=substr($content,$pointer,$tagLen); // In any case, add the endtag to current buffer and increase pointer
|
|
}
|
|
$pointer+=$tagLen;
|
|
if (!$nested && !$eliminated) { // if we're back on ground level, (and not by eliminating tags...
|
|
$newParts[]=$buffer;
|
|
$buffer='';
|
|
}
|
|
$mbuffer=substr($content,$pointer,strlen($v)); // New buffer set and pointer increased
|
|
$pointer+=strlen($mbuffer);
|
|
$buffer.=$mbuffer;
|
|
}
|
|
|
|
}
|
|
$newParts[]=$buffer;
|
|
return $newParts;
|
|
}
|
|
|
|
/**
|
|
* Splitting content into blocks *recursively* and processing tags/content with call back functions.
|
|
*
|
|
* @param string Tag list, see splitIntoBlock()
|
|
* @param string Content, see splitIntoBlock()
|
|
* @param object Object where call back methods are.
|
|
* @param string Name of call back method for content; "function callBackContent($str,$level)"
|
|
* @param string Name of call back method for tags; "function callBackTags($tags,$level)"
|
|
* @param integer Indent level
|
|
* @return string Processed content
|
|
* @see splitIntoBlock()
|
|
*/
|
|
function splitIntoBlockRecursiveProc($tag,$content,&$procObj,$callBackContent,$callBackTags,$level=0) {
|
|
$parts = $this->splitIntoBlock($tag,$content,TRUE);
|
|
foreach($parts as $k => $v) {
|
|
if ($k%2) {
|
|
$firstTagName = $this->getFirstTagName($v, TRUE);
|
|
$tagsArray = array();
|
|
$tagsArray['tag_start'] = $this->getFirstTag($v);
|
|
$tagsArray['tag_end'] = '</'.$firstTagName.'>';
|
|
$tagsArray['tag_name'] = strtolower($firstTagName);
|
|
$tagsArray['add_level'] = 1;
|
|
$tagsArray['content'] = $this->splitIntoBlockRecursiveProc($tag,$this->removeFirstAndLastTag($v),$procObj,$callBackContent,$callBackTags,$level+$tagsArray['add_level']);
|
|
|
|
if ($callBackTags) $tagsArray = $procObj->$callBackTags($tagsArray,$level);
|
|
|
|
$parts[$k] =
|
|
$tagsArray['tag_start'].
|
|
$tagsArray['content'].
|
|
$tagsArray['tag_end'];
|
|
} else {
|
|
if ($callBackContent) $parts[$k] = $procObj->$callBackContent($parts[$k],$level);
|
|
}
|
|
}
|
|
|
|
return implode('',$parts);
|
|
}
|
|
|
|
/**
|
|
* Returns an array with the $content divided by tag-blocks specified with the list of tags, $tag
|
|
* Even numbers in the array are outside the blocks, Odd numbers are block-content.
|
|
* Use ->getAllParts() and ->removeFirstAndLastTag() to process the content if needed.
|
|
*
|
|
* @param string List of tags
|
|
* @param string HTML-content
|
|
* @return array Even numbers in the array are outside the blocks, Odd numbers are block-content.
|
|
* @see splitIntoBlock(), getAllParts(), removeFirstAndLastTag()
|
|
*/
|
|
function splitTags($tag,$content) {
|
|
$tags = t3lib_div::trimExplode(',',$tag,1);
|
|
$regexStr = '/\<('.implode('|', $tags).')(\s[^>]*)?\/?>/si';
|
|
$parts = preg_split($regexStr, $content);
|
|
|
|
$pointer = strlen($parts[0]);
|
|
$newParts = array();
|
|
$newParts[] = $parts[0];
|
|
reset($parts);
|
|
next($parts);
|
|
while(list($k,$v)=each($parts)) {
|
|
$tagLen = strcspn(substr($content,$pointer),'>')+1;
|
|
|
|
// Set tag:
|
|
$tag = substr($content,$pointer,$tagLen); // New buffer set and pointer increased
|
|
$newParts[] = $tag;
|
|
$pointer+= strlen($tag);
|
|
|
|
// Set content:
|
|
$newParts[] = $v;
|
|
$pointer+= strlen($v);
|
|
}
|
|
return $newParts;
|
|
}
|
|
|
|
/**
|
|
* Returns an array with either tag or non-tag content of the result from ->splitIntoBlock()/->splitTags()
|
|
*
|
|
* @param array Parts generated by ->splitIntoBlock() or >splitTags()
|
|
* @param boolean Whether to return the tag-parts (default,true) or what was outside the tags.
|
|
* @param boolean Whether to include the tags in the tag-parts (most useful for input made by ->splitIntoBlock())
|
|
* @return array Tag-parts/Non-tag-parts depending on input argument settings
|
|
* @see splitIntoBlock(), splitTags()
|
|
*/
|
|
function getAllParts($parts,$tag_parts=1,$include_tag=1) {
|
|
$newParts=array();
|
|
foreach ($parts as $k => $v) {
|
|
if (($k+($tag_parts?0:1))%2) {
|
|
if (!$include_tag) $v=$this->removeFirstAndLastTag($v);
|
|
$newParts[]=$v;
|
|
}
|
|
}
|
|
return $newParts;
|
|
}
|
|
|
|
/**
|
|
* Removes the first and last tag in the string
|
|
* Anything before the first and after the last tags respectively is also removed
|
|
*
|
|
* @param string String to process
|
|
* @return string
|
|
*/
|
|
function removeFirstAndLastTag($str) {
|
|
// End of first tag:
|
|
$start = strpos($str,'>');
|
|
// Begin of last tag:
|
|
$end = strrpos($str,'<');
|
|
// return
|
|
return substr($str, $start+1, $end-$start-1);
|
|
}
|
|
|
|
/**
|
|
* Returns the first tag in $str
|
|
* Actually everything from the begining of the $str is returned, so you better make sure the tag is the first thing...
|
|
*
|
|
* @param string HTML string with tags
|
|
* @return string
|
|
*/
|
|
function getFirstTag($str) {
|
|
// First:
|
|
$endLen = strpos($str,'>')+1;
|
|
return substr($str,0,$endLen);
|
|
}
|
|
|
|
/**
|
|
* Returns the NAME of the first tag in $str
|
|
*
|
|
* @param string HTML tag (The element name MUST be separated from the attributes by a space character! Just *whitespace* will not do)
|
|
* @param boolean If set, then the tag is NOT converted to uppercase by case is preserved.
|
|
* @return string Tag name in upper case
|
|
* @see getFirstTag()
|
|
*/
|
|
function getFirstTagName($str,$preserveCase=FALSE) {
|
|
$matches = array();
|
|
if (preg_match('/^\s*\<([^\s\>]+)(\s|\>)/', $str, $matches)===1) {
|
|
if (!$preserveCase) {
|
|
return strtoupper($matches[1]);
|
|
}
|
|
return $matches[1];
|
|
}
|
|
return '';
|
|
}
|
|
|
|
/**
|
|
* Returns an array with all attributes as keys. Attributes are only lowercase a-z
|
|
* If a attribute is empty (shorthand), then the value for the key is empty. You can check if it existed with isset()
|
|
*
|
|
* @param string Tag: $tag is either a whole tag (eg '<TAG OPTION ATTRIB=VALUE>') or the parameterlist (ex ' OPTION ATTRIB=VALUE>')
|
|
* @param boolean If set, the attribute values are de-htmlspecialchar'ed. Should actually always be set!
|
|
* @return array array(Tag attributes,Attribute meta-data)
|
|
*/
|
|
function get_tag_attributes($tag,$deHSC=0) {
|
|
list($components,$metaC) = $this->split_tag_attributes($tag);
|
|
$name = ''; // attribute name is stored here
|
|
$valuemode = false;
|
|
$attributes = array();
|
|
$attributesMeta = array();
|
|
if (is_array($components)) {
|
|
foreach ($components as $key => $val) {
|
|
if ($val != '=') { // Only if $name is set (if there is an attribute, that waits for a value), that valuemode is enabled. This ensures that the attribute is assigned it's value
|
|
if ($valuemode) {
|
|
if ($name) {
|
|
$attributes[$name] = $deHSC?t3lib_div::htmlspecialchars_decode($val):$val;
|
|
$attributesMeta[$name]['dashType']=$metaC[$key];
|
|
$name = '';
|
|
}
|
|
} else {
|
|
if ($namekey = preg_replace('/[^[:alnum:]_\:\-]/','',$val)) {
|
|
$name = strtolower($namekey);
|
|
$attributesMeta[$name]=array();
|
|
$attributesMeta[$name]['origTag']=$namekey;
|
|
$attributes[$name] = '';
|
|
}
|
|
}
|
|
$valuemode = false;
|
|
} else {
|
|
$valuemode = true;
|
|
}
|
|
}
|
|
return array($attributes,$attributesMeta);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Returns an array with the 'components' from an attribute list. The result is normally analyzed by get_tag_attributes
|
|
* Removes tag-name if found
|
|
*
|
|
* @param string The tag or attributes
|
|
* @return array
|
|
* @access private
|
|
* @see t3lib_div::split_tag_attributes()
|
|
*/
|
|
function split_tag_attributes($tag) {
|
|
$matches = array();
|
|
if (preg_match('/(\<[^\s]+\s+)?(.*?)\s*(\>)?$/s', $tag, $matches)!==1) {
|
|
return array(array(), array());
|
|
}
|
|
$tag_tmp = $matches[2];
|
|
|
|
$metaValue = array();
|
|
$value = array();
|
|
$matches = array();
|
|
if (preg_match_all('/("[^"]*"|\'[^\']*\'|[^\s"\'\=]+|\=)/s', $tag_tmp, $matches)>0) {
|
|
foreach ($matches[1] as $part) {
|
|
$firstChar = substr($part, 0, 1);
|
|
if ($firstChar=='"' || $firstChar=="'") {
|
|
$metaValue[] = $firstChar;
|
|
$value[] = substr($part, 1, -1);
|
|
} else {
|
|
$metaValue[] = '';
|
|
$value[] = $part;
|
|
}
|
|
}
|
|
}
|
|
return array($value,$metaValue);
|
|
}
|
|
|
|
/**
|
|
* Checks whether block/solo tags are found in the correct amounts in HTML content
|
|
* Block tags are tags which are required to have an equal amount of start and end tags, eg. "<table>...</table>"
|
|
* Solo tags are tags which are required to have ONLY start tags (possibly with an XHTML ending like ".../>")
|
|
* NOTICE: Correct XHTML might actually fail since "<br></br>" is allowed as well as "<br/>". However only the LATTER is accepted by this function (with "br" in the "solo-tag" list), the first example will result in a warning.
|
|
* NOTICE: Correct XHTML might actually fail since "<p/>" is allowed as well as "<p></p>". However only the LATTER is accepted by this function (with "p" in the "block-tag" list), the first example will result in an ERROR!
|
|
* NOTICE: Correct HTML version "something" allows eg. <p> and <li> to be NON-ended (implicitly ended by other tags). However this is NOT accepted by this function (with "p" and "li" in the block-tag list) and it will result in an ERROR!
|
|
*
|
|
* @param string HTML content to analyze
|
|
* @param string Tag names for block tags (eg. table or div or p) in lowercase, commalist (eg. "table,div,p")
|
|
* @param string Tag names for solo tags (eg. img, br or input) in lowercase, commalist ("img,br,input")
|
|
* @return array Analyse data.
|
|
*/
|
|
function checkTagTypeCounts($content,$blockTags='a,b,blockquote,body,div,em,font,form,h1,h2,h3,h4,h5,h6,i,li,map,ol,option,p,pre,select,span,strong,table,td,textarea,tr,u,ul', $soloTags='br,hr,img,input,area') {
|
|
$content = strtolower($content);
|
|
$analyzedOutput=array();
|
|
$analyzedOutput['counts']=array(); // Counts appearances of start-tags
|
|
$analyzedOutput['errors']=array(); // Lists ERRORS
|
|
$analyzedOutput['warnings']=array(); // Lists warnings.
|
|
$analyzedOutput['blocks']=array(); // Lists stats for block-tags
|
|
$analyzedOutput['solo']=array(); // Lists stats for solo-tags
|
|
|
|
// Block tags, must have endings...
|
|
$blockTags = explode(',',$blockTags);
|
|
foreach($blockTags as $tagName) {
|
|
$countBegin = count(preg_split('/\<'.$tagName.'(\s|\>)/s',$content))-1;
|
|
$countEnd = count(preg_split('/\<\/'.$tagName.'(\s|\>)/s',$content))-1;
|
|
$analyzedOutput['blocks'][$tagName]=array($countBegin,$countEnd,$countBegin-$countEnd);
|
|
if ($countBegin) $analyzedOutput['counts'][$tagName]=$countBegin;
|
|
if ($countBegin-$countEnd) {
|
|
if ($countBegin-$countEnd > 0) {
|
|
$analyzedOutput['errors'][$tagName]='There were more start-tags ('.$countBegin.') than end-tags ('.$countEnd.') for the element "'.$tagName.'". There should be an equal amount!';
|
|
} else {
|
|
$analyzedOutput['warnings'][$tagName]='There were more end-tags ('.$countEnd.') than start-tags ('.$countBegin.') for the element "'.$tagName.'". There should be an equal amount! However the problem is not fatal.';
|
|
}
|
|
}
|
|
}
|
|
|
|
// Solo tags, must NOT have endings...
|
|
$soloTags = explode(',',$soloTags);
|
|
foreach($soloTags as $tagName) {
|
|
$countBegin = count(preg_split('/\<'.$tagName.'(\s|\>)/s',$content))-1;
|
|
$countEnd = count(preg_split('/\<\/'.$tagName.'(\s|\>)/s',$content))-1;
|
|
$analyzedOutput['solo'][$tagName]=array($countBegin,$countEnd);
|
|
if ($countBegin) $analyzedOutput['counts'][$tagName]=$countBegin;
|
|
if ($countEnd) {
|
|
$analyzedOutput['warnings'][$tagName]='There were end-tags found ('.$countEnd.') for the element "'.$tagName.'". This was not expected (although XHTML technically allows it).';
|
|
}
|
|
}
|
|
|
|
return $analyzedOutput;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*********************************
|
|
*
|
|
* Clean HTML code
|
|
*
|
|
*********************************/
|
|
|
|
/**
|
|
* Function that can clean up HTML content according to configuration given in the $tags array.
|
|
*
|
|
* Initializing the $tags array to allow a list of tags (in this case <B>,<I>,<U> and <A>), set it like this: $tags = array_flip(explode(',','b,a,i,u'))
|
|
* If the value of the $tags[$tagname] entry is an array, advanced processing of the tags is initialized. These are the options:
|
|
*
|
|
* $tags[$tagname] = Array(
|
|
* 'overrideAttribs' => '' If set, this string is preset as the attributes of the tag
|
|
* 'allowedAttribs' => '0' (zero) = no attributes allowed, '[commalist of attributes]' = only allowed attributes. If blank, all attributes are allowed.
|
|
* 'fixAttrib' => Array(
|
|
* '[attribute name]' => Array (
|
|
* 'set' => Force the attribute value to this value.
|
|
* 'unset' => Boolean: If set, the attribute is unset.
|
|
* 'default' => If no attribute exists by this name, this value is set as default value (if this value is not blank)
|
|
* 'always' => Boolean. If set, the attribute is always processed. Normally an attribute is processed only if it exists
|
|
* 'trim,intval,lower,upper' => All booleans. If any of these keys are set, the value is passed through the respective PHP-functions.
|
|
* 'range' => Array ('[low limit]','[high limit, optional]') Setting integer range.
|
|
* 'list' => Array ('[value1/default]','[value2]','[value3]') Attribute must be in this list. If not, the value is set to the first element.
|
|
* 'removeIfFalse' => Boolean/'blank'. If set, then the attribute is removed if it is 'false'. If this value is set to 'blank' then the value must be a blank string (that means a 'zero' value will not be removed)
|
|
* 'removeIfEquals' => [value] If the attribute value matches the value set here, then it is removed.
|
|
* 'casesensitiveComp' => 1 If set, then the removeIfEquals and list comparisons will be case sensitive. Otherwise not.
|
|
* )
|
|
* ),
|
|
* 'protect' => '', Boolean. If set, the tag <> is converted to < and >
|
|
* 'remap' => '', String. If set, the tagname is remapped to this tagname
|
|
* 'rmTagIfNoAttrib' => '', Boolean. If set, then the tag is removed if no attributes happend to be there.
|
|
* 'nesting' => '', Boolean/'global'. If set true, then this tag must have starting and ending tags in the correct order. Any tags not in this order will be discarded. Thus '</B><B><I></B></I></B>' will be converted to '<B><I></B></I>'. Is the value 'global' then true nesting in relation to other tags marked for 'global' nesting control is preserved. This means that if <B> and <I> are set for global nesting then this string '</B><B><I></B></I></B>' is converted to '<B></B>'
|
|
* )
|
|
*
|
|
* @param string $content; is the HTML-content being processed. This is also the result being returned.
|
|
* @param array $tags; is an array where each key is a tagname in lowercase. Only tags present as keys in this array are preserved. The value of the key can be an array with a vast number of options to configure.
|
|
* @param string $keepAll; boolean/'protect', if set, then all tags are kept regardless of tags present as keys in $tags-array. If 'protect' then the preserved tags have their <> converted to < and >
|
|
* @param integer $hSC; Values -1,0,1,2: Set to zero= disabled, set to 1 then the content BETWEEN tags is htmlspecialchar()'ed, set to -1 its the opposite and set to 2 the content will be HSC'ed BUT with preservation for real entities (eg. "&" or "ê")
|
|
* @param array Configuration array send along as $conf to the internal functions ->processContent() and ->processTag()
|
|
* @return string Processed HTML content
|
|
*/
|
|
function HTMLcleaner($content, $tags=array(),$keepAll=0,$hSC=0,$addConfig=array()) {
|
|
$newContent = array();
|
|
$tokArr = explode('<',$content);
|
|
$newContent[] = $this->processContent(current($tokArr),$hSC,$addConfig);
|
|
next($tokArr);
|
|
|
|
$c = 1;
|
|
$tagRegister = array();
|
|
$tagStack = array();
|
|
while(list(,$tok)=each($tokArr)) {
|
|
/** BEGIN BUG FIX 28.03.2006 **
|
|
$firstChar = substr($tok,0,1);
|
|
# if (strcmp(trim($firstChar),'')) { // It is a tag...
|
|
if (preg_match('/[[:alnum:]\/]/',$firstChar)==1) { // It is a tag... (first char is a-z0-9 or /) (fixed 19/01 2004). This also avoids triggering on <?xml..> and <!DOCTYPE..>
|
|
$tagEnd = strpos($tok,'>');
|
|
if ($tagEnd) { // If there is and end-bracket... tagEnd can't be 0 as the first character can't be a >
|
|
$endTag = $firstChar=='/' ? 1 : 0;
|
|
*/
|
|
preg_match('/([^ >]+)/',$tok,$a_match); // get tag for checking: next char after tag name must be > or whitespace
|
|
$tagName = $a_match[1];
|
|
if (preg_match('/^[[:alnum:]\/]+$/',$tagName)==1) { // It is a tag... (contains only chars a-z0-9 or /) (fixed 28/03 2006). This also avoids triggering on <?xml..> and <!DOCTYPE..>
|
|
$tagEnd = strpos($tok,'>');
|
|
if ($tagEnd) { // If there is and end-bracket... tagEnd can't be 0 as the first character can't be a >
|
|
$endTag = (substr($tok,0,1)=='/') ? 1 : 0;
|
|
/** END BUG FIX 28.03.2006 **/
|
|
$tagContent = substr($tok,$endTag,$tagEnd-$endTag);
|
|
$tagParts = preg_split('/\s+/s',$tagContent,2);
|
|
$tagName = strtolower($tagParts[0]);
|
|
if (isset($tags[$tagName])) {
|
|
if (is_array($tags[$tagName])) { // If there is processing to do for the tag:
|
|
|
|
if (!$endTag) { // If NOT an endtag, do attribute processing (added dec. 2003)
|
|
// Override attributes
|
|
if (strcmp($tags[$tagName]['overrideAttribs'],'')) {
|
|
$tagParts[1]=$tags[$tagName]['overrideAttribs'];
|
|
}
|
|
|
|
// Allowed tags
|
|
if (strcmp($tags[$tagName]['allowedAttribs'],'')) {
|
|
if (!strcmp($tags[$tagName]['allowedAttribs'],'0')) { // No attribs allowed
|
|
$tagParts[1]='';
|
|
} elseif (trim($tagParts[1])) {
|
|
$tagAttrib = $this->get_tag_attributes($tagParts[1]);
|
|
$tagParts[1]='';
|
|
$newTagAttrib = array();
|
|
if (!($tList = $tags[$tagName]['_allowedAttribs'])) {
|
|
// Just explode attribts for tag once
|
|
$tList = $tags[$tagName]['_allowedAttribs'] = t3lib_div::trimExplode(',',strtolower($tags[$tagName]['allowedAttribs']),1);
|
|
}
|
|
foreach ($tList as $allowTag) {
|
|
if (isset($tagAttrib[0][$allowTag])) $newTagAttrib[$allowTag]=$tagAttrib[0][$allowTag];
|
|
}
|
|
$tagParts[1]=$this->compileTagAttribs($newTagAttrib,$tagAttrib[1]);
|
|
}
|
|
}
|
|
|
|
// Fixed attrib values
|
|
if (is_array($tags[$tagName]['fixAttrib'])) {
|
|
$tagAttrib = $this->get_tag_attributes($tagParts[1]);
|
|
$tagParts[1]='';
|
|
reset($tags[$tagName]['fixAttrib']);
|
|
while(list($attr,$params)=each($tags[$tagName]['fixAttrib'])) {
|
|
if (strlen($params['set'])) $tagAttrib[0][$attr] = $params['set'];
|
|
if (strlen($params['unset'])) unset($tagAttrib[0][$attr]);
|
|
if (strcmp($params['default'],'') && !isset($tagAttrib[0][$attr])) $tagAttrib[0][$attr]=$params['default'];
|
|
if ($params['always'] || isset($tagAttrib[0][$attr])) {
|
|
if ($params['trim']) {$tagAttrib[0][$attr]=trim($tagAttrib[0][$attr]);}
|
|
if ($params['intval']) {$tagAttrib[0][$attr]=intval($tagAttrib[0][$attr]);}
|
|
if ($params['lower']) {$tagAttrib[0][$attr]=strtolower($tagAttrib[0][$attr]);}
|
|
if ($params['upper']) {$tagAttrib[0][$attr]=strtoupper($tagAttrib[0][$attr]);}
|
|
if ($params['range']) {
|
|
if (isset($params['range'][1])) {
|
|
$tagAttrib[0][$attr]=t3lib_div::intInRange($tagAttrib[0][$attr],intval($params['range'][0]),intval($params['range'][1]));
|
|
} else {
|
|
$tagAttrib[0][$attr]=t3lib_div::intInRange($tagAttrib[0][$attr],intval($params['range'][0]));
|
|
}
|
|
}
|
|
if (is_array($params['list'])) {
|
|
if (!in_array($this->caseShift($tagAttrib[0][$attr],$params['casesensitiveComp']),$this->caseShift($params['list'],$params['casesensitiveComp'],$tagName))) $tagAttrib[0][$attr]=$params['list'][0];
|
|
}
|
|
if (($params['removeIfFalse'] && $params['removeIfFalse']!='blank' && !$tagAttrib[0][$attr]) || ($params['removeIfFalse']=='blank' && !strcmp($tagAttrib[0][$attr],''))) {
|
|
unset($tagAttrib[0][$attr]);
|
|
}
|
|
if (strcmp($params['removeIfEquals'],'') && !strcmp($this->caseShift($tagAttrib[0][$attr],$params['casesensitiveComp']),$this->caseShift($params['removeIfEquals'],$params['casesensitiveComp']))) {
|
|
unset($tagAttrib[0][$attr]);
|
|
}
|
|
if ($params['prefixLocalAnchors']) {
|
|
if (substr($tagAttrib[0][$attr],0,1)=='#') {
|
|
$prefix = t3lib_div::getIndpEnv('TYPO3_REQUEST_URL');
|
|
$tagAttrib[0][$attr] = $prefix.$tagAttrib[0][$attr];
|
|
if ($params['prefixLocalAnchors']==2 && t3lib_div::isFirstPartOfStr($prefix,t3lib_div::getIndpEnv('TYPO3_SITE_URL'))) {
|
|
$tagAttrib[0][$attr] = substr($tagAttrib[0][$attr],strlen(t3lib_div::getIndpEnv('TYPO3_SITE_URL')));
|
|
}
|
|
}
|
|
}
|
|
if ($params['prefixRelPathWith']) {
|
|
$urlParts = parse_url($tagAttrib[0][$attr]);
|
|
if (!$urlParts['scheme'] && substr($urlParts['path'],0,1)!='/') { // If it is NOT an absolute URL (by http: or starting "/")
|
|
$tagAttrib[0][$attr] = $params['prefixRelPathWith'].$tagAttrib[0][$attr];
|
|
}
|
|
}
|
|
if ($params['userFunc']) {
|
|
$tagAttrib[0][$attr] = t3lib_div::callUserFunction($params['userFunc'],$tagAttrib[0][$attr],$this);
|
|
}
|
|
}
|
|
}
|
|
$tagParts[1]=$this->compileTagAttribs($tagAttrib[0],$tagAttrib[1]);
|
|
}
|
|
} else { // If endTag, remove any possible attributes:
|
|
$tagParts[1]='';
|
|
}
|
|
|
|
// Protecting the tag by converting < and > to < and > ??
|
|
if ($tags[$tagName]['protect']) {
|
|
$lt = '<'; $gt = '>';
|
|
} else {
|
|
$lt = '<'; $gt = '>';
|
|
}
|
|
// Remapping tag name?
|
|
if ($tags[$tagName]['remap']) $tagParts[0] = $tags[$tagName]['remap'];
|
|
|
|
// rmTagIfNoAttrib
|
|
if ($endTag || trim($tagParts[1]) || !$tags[$tagName]['rmTagIfNoAttrib']) {
|
|
$setTag=1;
|
|
|
|
if ($tags[$tagName]['nesting']) {
|
|
if (!is_array($tagRegister[$tagName])) $tagRegister[$tagName]=array();
|
|
|
|
if ($endTag) {
|
|
/* if ($tags[$tagName]['nesting']=='global') {
|
|
$lastEl = end($tagStack);
|
|
$correctTag = !strcmp($tagName,$lastEl);
|
|
} else $correctTag=1;
|
|
*/
|
|
$correctTag=1;
|
|
if ($tags[$tagName]['nesting']=='global') {
|
|
$lastEl = end($tagStack);
|
|
if (strcmp($tagName,$lastEl)) {
|
|
if (in_array($tagName,$tagStack)) {
|
|
while(count($tagStack) && strcmp($tagName,$lastEl)) {
|
|
$elPos = end($tagRegister[$lastEl]);
|
|
unset($newContent[$elPos]);
|
|
|
|
array_pop($tagRegister[$lastEl]);
|
|
array_pop($tagStack);
|
|
$lastEl = end($tagStack);
|
|
}
|
|
} else {
|
|
$correctTag=0; // In this case the
|
|
}
|
|
}
|
|
}
|
|
if (!count($tagRegister[$tagName]) || !$correctTag) {
|
|
$setTag=0;
|
|
} else {
|
|
array_pop($tagRegister[$tagName]);
|
|
if ($tags[$tagName]['nesting']=='global') {array_pop($tagStack);}
|
|
}
|
|
} else {
|
|
array_push($tagRegister[$tagName],$c);
|
|
if ($tags[$tagName]['nesting']=='global') {array_push($tagStack,$tagName);}
|
|
}
|
|
}
|
|
|
|
if ($setTag) {
|
|
// Setting the tag
|
|
$newContent[$c++]=$this->processTag($lt.($endTag?'/':'').trim($tagParts[0].' '.$tagParts[1]).$gt,$addConfig,$endTag,$lt=='<');
|
|
}
|
|
}
|
|
} else {
|
|
$newContent[$c++]=$this->processTag('<'.($endTag?'/':'').$tagContent.'>',$addConfig,$endTag);
|
|
}
|
|
} elseif ($keepAll) { // This is if the tag was not defined in the array for processing:
|
|
if (!strcmp($keepAll,'protect')) {
|
|
$lt = '<'; $gt = '>';
|
|
} else {
|
|
$lt = '<'; $gt = '>';
|
|
}
|
|
$newContent[$c++]=$this->processTag($lt.($endTag?'/':'').$tagContent.$gt,$addConfig,$endTag,$lt=='<');
|
|
}
|
|
$newContent[$c++]=$this->processContent(substr($tok,$tagEnd+1),$hSC,$addConfig);
|
|
} else {
|
|
$newContent[$c++]=$this->processContent('<'.$tok,$hSC,$addConfig); // There were not end-bracket, so no tag...
|
|
}
|
|
} else {
|
|
$newContent[$c++]=$this->processContent('<'.$tok,$hSC,$addConfig); // It was not a tag anyways
|
|
}
|
|
}
|
|
|
|
// Unsetting tags:
|
|
foreach ($tagRegister as $tag => $positions) {
|
|
foreach ($positions as $pKey) {
|
|
unset($newContent[$pKey]);
|
|
}
|
|
}
|
|
|
|
return implode('',$newContent);
|
|
}
|
|
|
|
/**
|
|
* Converts htmlspecialchars forth ($dir=1) AND back ($dir=-1)
|
|
*
|
|
* @param string Input value
|
|
* @param integer Direction: forth ($dir=1, dir=2 for preserving entities) AND back ($dir=-1)
|
|
* @return string Output value
|
|
*/
|
|
function bidir_htmlspecialchars($value,$dir) {
|
|
if ($dir==1) {
|
|
$value = htmlspecialchars($value);
|
|
} elseif ($dir==2) {
|
|
$value = t3lib_div::deHSCentities(htmlspecialchars($value));
|
|
} elseif ($dir==-1) {
|
|
$value = str_replace('>','>',$value);
|
|
$value = str_replace('<','<',$value);
|
|
$value = str_replace('"','"',$value);
|
|
$value = str_replace('&','&',$value);
|
|
}
|
|
return $value;
|
|
}
|
|
|
|
/**
|
|
* Prefixes the relative paths of hrefs/src/action in the tags [td,table,body,img,input,form,link,script,a] in the $content with the $main_prefix or and alternative given by $alternatives
|
|
*
|
|
* @param string Prefix string
|
|
* @param string HTML content
|
|
* @param array Array with alternative prefixes for certain of the tags. key=>value pairs where the keys are the tag element names in uppercase
|
|
* @param string Suffix string (put after the resource).
|
|
* @return string Processed HTML content
|
|
*/
|
|
function prefixResourcePath($main_prefix,$content,$alternatives=array(),$suffix='') {
|
|
|
|
$parts = $this->splitTags('embed,td,table,body,img,input,form,link,script,a',$content);
|
|
foreach ($parts as $k => $v) {
|
|
if ($k%2) {
|
|
$params = $this->get_tag_attributes($v,1);
|
|
$tagEnd = substr($v,-2)=='/>' ? ' />' : '>'; // Detect tag-ending so that it is re-applied correctly.
|
|
$firstTagName = $this->getFirstTagName($v); // The 'name' of the first tag
|
|
$somethingDone=0;
|
|
$prefix = isset($alternatives[strtoupper($firstTagName)]) ? $alternatives[strtoupper($firstTagName)] : $main_prefix;
|
|
switch(strtolower($firstTagName)) {
|
|
// background - attribute:
|
|
case 'td':
|
|
case 'body':
|
|
case 'table':
|
|
$src = $params[0]['background'];
|
|
if ($src) {
|
|
$params[0]['background'] = $this->prefixRelPath($prefix,$params[0]['background'],$suffix);
|
|
$somethingDone=1;
|
|
}
|
|
break;
|
|
// src attribute
|
|
case 'img':
|
|
case 'input':
|
|
case 'script':
|
|
case 'embed':
|
|
$src = $params[0]['src'];
|
|
if ($src) {
|
|
$params[0]['src'] = $this->prefixRelPath($prefix,$params[0]['src'],$suffix);
|
|
$somethingDone=1;
|
|
}
|
|
break;
|
|
case 'link':
|
|
case 'a':
|
|
$src = $params[0]['href'];
|
|
if ($src) {
|
|
$params[0]['href'] = $this->prefixRelPath($prefix,$params[0]['href'],$suffix);
|
|
$somethingDone=1;
|
|
}
|
|
break;
|
|
// action attribute
|
|
case 'form':
|
|
$src = $params[0]['action'];
|
|
if ($src) {
|
|
$params[0]['action'] = $this->prefixRelPath($prefix,$params[0]['action'],$suffix);
|
|
$somethingDone=1;
|
|
}
|
|
break;
|
|
}
|
|
if ($somethingDone) {
|
|
$tagParts = preg_split('/\s+/s',$v,2);
|
|
$tagParts[1]=$this->compileTagAttribs($params[0],$params[1]);
|
|
$parts[$k] = '<'.trim(strtolower($firstTagName).' '.$tagParts[1]).$tagEnd;
|
|
}
|
|
}
|
|
}
|
|
$content = implode('',$parts);
|
|
|
|
// Fix <style> section:
|
|
$prefix = isset($alternatives['style']) ? $alternatives['style'] : $main_prefix;
|
|
if (strlen($prefix)) {
|
|
$parts = $this->splitIntoBlock('style',$content);
|
|
foreach($parts as $k => $v) {
|
|
if ($k%2) {
|
|
$parts[$k] = eregi_replace('(url[[:space:]]*\([[:space:]]*["\']?)([^"\')]*)(["\']?[[:space:]]*\))','\1'.$prefix.'\2'.$suffix.'\3',$parts[$k]);
|
|
}
|
|
}
|
|
$content = implode('',$parts);
|
|
}
|
|
|
|
return $content;
|
|
}
|
|
|
|
/**
|
|
* Internal sub-function for ->prefixResourcePath()
|
|
*
|
|
* @param string Prefix string
|
|
* @param string Relative path/URL
|
|
* @param string Suffix string
|
|
* @return string Output path, prefixed if no scheme in input string
|
|
* @access private
|
|
*/
|
|
function prefixRelPath($prefix,$srcVal,$suffix='') {
|
|
$pU = parse_url($srcVal);
|
|
if (!$pU['scheme'] && substr($srcVal, 0, 1)!='/') { // If not an absolute URL.
|
|
$srcVal = $prefix.$srcVal.$suffix;
|
|
}
|
|
return $srcVal;
|
|
}
|
|
|
|
/**
|
|
* Cleans up the input $value for fonttags.
|
|
* If keepFace,-Size and -Color is set then font-tags with an allowed property is kept. Else deleted.
|
|
*
|
|
* @param string HTML content with font-tags inside to clean up.
|
|
* @param boolean If set, keep "face" attribute
|
|
* @param boolean If set, keep "size" attribute
|
|
* @param boolean If set, keep "color" attribute
|
|
* @return string Processed HTML content
|
|
*/
|
|
function cleanFontTags($value,$keepFace=0,$keepSize=0,$keepColor=0) {
|
|
$fontSplit = $this->splitIntoBlock('font',$value); // ,1 ?? - could probably be more stable if splitTags() was used since this depends on end-tags being properly set!
|
|
foreach ($fontSplit as $k => $v) {
|
|
if ($k%2) { // font:
|
|
$attribArray=$this->get_tag_attributes_classic($this->getFirstTag($v));
|
|
$newAttribs=array();
|
|
if ($keepFace && $attribArray['face']) $newAttribs[]='face="'.$attribArray['face'].'"';
|
|
if ($keepSize && $attribArray['size']) $newAttribs[]='size="'.$attribArray['size'].'"';
|
|
if ($keepColor && $attribArray['color']) $newAttribs[]='color="'.$attribArray['color'].'"';
|
|
|
|
$innerContent = $this->cleanFontTags($this->removeFirstAndLastTag($v),$keepFace,$keepSize,$keepColor);
|
|
if (count($newAttribs)) {
|
|
$fontSplit[$k]='<font '.implode(' ',$newAttribs).'>'.$innerContent.'</font>';
|
|
} else {
|
|
$fontSplit[$k]=$innerContent;
|
|
}
|
|
}
|
|
}
|
|
return implode('',$fontSplit);
|
|
}
|
|
|
|
/**
|
|
* This is used to map certain tag-names into other names.
|
|
*
|
|
* @param string HTML content
|
|
* @param array Array with tag key=>value pairs where key is from-tag and value is to-tag
|
|
* @param string Alternative less-than char to search for (search regex string)
|
|
* @param string Alternative less-than char to replace with (replace regex string)
|
|
* @return string Processed HTML content
|
|
*/
|
|
function mapTags($value,$tags=array(),$ltChar='<',$ltChar2='<') {
|
|
|
|
foreach($tags as $from => $to) {
|
|
$value = preg_replace('/'.preg_quote($ltChar).'(\/)?'.$from.'\s([^\>])*(\/)?\>/', $ltChar2.'$1'.$to.' $2$3>', $value);
|
|
}
|
|
return $value;
|
|
}
|
|
|
|
/**
|
|
* This converts htmlspecialchar()'ed tags (from $tagList) back to real tags. Eg. '<strong>' would be converted back to '<strong>' if found in $tagList
|
|
*
|
|
* @param string HTML content
|
|
* @param string Tag list, separated by comma. Lowercase!
|
|
* @return string Processed HTML content
|
|
*/
|
|
function unprotectTags($content,$tagList='') {
|
|
$tagsArray = t3lib_div::trimExplode(',',$tagList,1);
|
|
$contentParts = explode('<',$content);
|
|
next($contentParts); // bypass the first
|
|
while(list($k,$tok)=each($contentParts)) {
|
|
$firstChar = substr($tok,0,1);
|
|
if (strcmp(trim($firstChar),'')) {
|
|
$subparts = explode('>',$tok,2);
|
|
$tagEnd = strlen($subparts[0]);
|
|
if (strlen($tok)!=$tagEnd) {
|
|
$endTag = $firstChar=='/' ? 1 : 0;
|
|
$tagContent = substr($tok,$endTag,$tagEnd-$endTag);
|
|
$tagParts = preg_split('/\s+/s',$tagContent,2);
|
|
$tagName = strtolower($tagParts[0]);
|
|
if (!strcmp($tagList,'') || in_array($tagName,$tagsArray)) {
|
|
$contentParts[$k] = '<'.$subparts[0].'>'.$subparts[1];
|
|
} else $contentParts[$k] = '<'.$tok;
|
|
} else $contentParts[$k] = '<'.$tok;
|
|
} else $contentParts[$k] = '<'.$tok;
|
|
}
|
|
|
|
return implode('',$contentParts);
|
|
}
|
|
|
|
/**
|
|
* Strips tags except the tags in the list, $tagList
|
|
* OBSOLETE - use PHP function strip_tags()
|
|
*
|
|
* @param string Value to process
|
|
* @param string List of tags
|
|
* @return string Output value
|
|
* @ignore
|
|
*/
|
|
function stripTagsExcept($value,$tagList) {
|
|
$tags=t3lib_div::trimExplode(',',$tagList,1);
|
|
$forthArr=array();
|
|
$backArr=array();
|
|
foreach ($tags as $theTag) {
|
|
$forthArr[$theTag]=md5($theTag);
|
|
$backArr[md5($theTag)]=$theTag;
|
|
}
|
|
$value = $this->mapTags($value,$forthArr,'<','_');
|
|
$value=strip_tags($value);
|
|
$value = $this->mapTags($value,$backArr,'_','<');
|
|
return $value;
|
|
}
|
|
|
|
/**
|
|
* Internal function for case shifting of a string or whole array
|
|
*
|
|
* @param mixed Input string/array
|
|
* @param boolean If $str is a string AND this boolean(caseSensitive) is false, the string is returned in uppercase
|
|
* @param string Key string used for internal caching of the results. Could be an MD5 hash of the serialized version of the input $str if that is an array.
|
|
* @return string Output string, processed
|
|
* @access private
|
|
*/
|
|
function caseShift($str,$flag,$cacheKey='') {
|
|
$cacheKey .= $flag?1:0;
|
|
if (is_array($str)) {
|
|
if (!$cacheKey || !isset($this->caseShift_cache[$cacheKey])) {
|
|
reset($str);
|
|
foreach ($str as $k => $v) {
|
|
if (!$flag) {
|
|
$str[$k] = strtoupper($v);
|
|
}
|
|
}
|
|
if ($cacheKey) $this->caseShift_cache[$cacheKey]=$str;
|
|
} else {
|
|
$str = $this->caseShift_cache[$cacheKey];
|
|
}
|
|
} elseif (!$flag) { $str = strtoupper($str); }
|
|
return $str;
|
|
}
|
|
|
|
/**
|
|
* Compiling an array with tag attributes into a string
|
|
*
|
|
* @param array Tag attributes
|
|
* @param array Meta information about these attributes (like if they were quoted)
|
|
* @param boolean If set, then the attribute names will be set in lower case, value quotes in double-quotes and the value will be htmlspecialchar()'ed
|
|
* @return string Imploded attributes, eg: 'attribute="value" attrib2="value2"'
|
|
* @access private
|
|
*/
|
|
function compileTagAttribs($tagAttrib,$meta=array(), $xhtmlClean=0) {
|
|
$accu=array();
|
|
foreach ($tagAttrib as $k =>$v) {
|
|
if ($xhtmlClean) {
|
|
$attr=strtolower($k);
|
|
if (strcmp($v,'') || isset($meta[$k]['dashType'])) {
|
|
$attr.='="'.htmlspecialchars($v).'"';
|
|
}
|
|
} else {
|
|
$attr=$meta[$k]['origTag']?$meta[$k]['origTag']:$k;
|
|
if (strcmp($v,'') || isset($meta[$k]['dashType'])) {
|
|
$dash=$meta[$k]['dashType']?$meta[$k]['dashType']:(t3lib_div::testInt($v)?'':'"');
|
|
$attr.='='.$dash.$v.$dash;
|
|
}
|
|
}
|
|
$accu[]=$attr;
|
|
}
|
|
return implode(' ',$accu);
|
|
}
|
|
|
|
/**
|
|
* Get tag attributes, the classic version (which had some limitations?)
|
|
*
|
|
* @param string The tag
|
|
* @param boolean De-htmlspecialchar flag.
|
|
* @return array
|
|
* @access private
|
|
*/
|
|
function get_tag_attributes_classic($tag,$deHSC=0) {
|
|
$attr=$this->get_tag_attributes($tag,$deHSC);
|
|
return is_array($attr[0])?$attr[0]:array();
|
|
}
|
|
|
|
/**
|
|
* Indents input content with $number instances of $indentChar
|
|
*
|
|
* @param string Content string, multiple lines.
|
|
* @param integer Number of indents
|
|
* @param string Indent character/string
|
|
* @return string Indented code (typ. HTML)
|
|
*/
|
|
function indentLines($content, $number=1, $indentChar="\t") {
|
|
$preTab = str_pad('', $number*strlen($indentChar), $indentChar);
|
|
$lines = explode(chr(10),str_replace(chr(13),'',$content));
|
|
foreach ($lines as $k => $v) {
|
|
$lines[$k] = $preTab.$v;
|
|
}
|
|
return implode(chr(10), $lines);
|
|
}
|
|
|
|
/**
|
|
* Converts TSconfig into an array for the HTMLcleaner function.
|
|
*
|
|
* @param array TSconfig for HTMLcleaner
|
|
* @param array Array of tags to keep (?)
|
|
* @return array
|
|
* @access private
|
|
*/
|
|
function HTMLparserConfig($TSconfig,$keepTags=array()) {
|
|
// Allow tags (base list, merged with incoming array)
|
|
$alTags = array_flip(t3lib_div::trimExplode(',',strtolower($TSconfig['allowTags']),1));
|
|
$keepTags = array_merge($alTags,$keepTags);
|
|
|
|
// Set config properties.
|
|
if (is_array($TSconfig['tags.'])) {
|
|
reset($TSconfig['tags.']);
|
|
while(list($key,$tagC)=each($TSconfig['tags.'])) {
|
|
if (!is_array($tagC) && $key==strtolower($key)) {
|
|
if (!strcmp($tagC,'0')) unset($keepTags[$key]);
|
|
if (!strcmp($tagC,'1') && !isset($keepTags[$key])) $keepTags[$key]=1;
|
|
}
|
|
}
|
|
|
|
reset($TSconfig['tags.']);
|
|
foreach ($TSconfig['tags.'] as $key => $tagC) {
|
|
if (is_array($tagC) && $key==strtolower($key)) {
|
|
$key=substr($key,0,-1);
|
|
if (!is_array($keepTags[$key])) $keepTags[$key]=array();
|
|
if (is_array($tagC['fixAttrib.'])) {
|
|
reset($tagC['fixAttrib.']);
|
|
while(list($atName,$atConfig)=each($tagC['fixAttrib.'])) {
|
|
if (is_array($atConfig)) {
|
|
$atName=substr($atName,0,-1);
|
|
if (!is_array($keepTags[$key]['fixAttrib'][$atName])) {
|
|
$keepTags[$key]['fixAttrib'][$atName]=array();
|
|
}
|
|
$keepTags[$key]['fixAttrib'][$atName] = array_merge($keepTags[$key]['fixAttrib'][$atName],$atConfig); // Candidate for t3lib_div::array_merge() if integer-keys will some day make trouble...
|
|
if (strcmp($keepTags[$key]['fixAttrib'][$atName]['range'],'')) $keepTags[$key]['fixAttrib'][$atName]['range'] = t3lib_div::trimExplode(',',$keepTags[$key]['fixAttrib'][$atName]['range']);
|
|
if (strcmp($keepTags[$key]['fixAttrib'][$atName]['list'],'')) $keepTags[$key]['fixAttrib'][$atName]['list'] = t3lib_div::trimExplode(',',$keepTags[$key]['fixAttrib'][$atName]['list']);
|
|
}
|
|
}
|
|
}
|
|
unset($tagC['fixAttrib.']);
|
|
unset($tagC['fixAttrib']);
|
|
$keepTags[$key] = array_merge($keepTags[$key],$tagC); // Candidate for t3lib_div::array_merge() if integer-keys will some day make trouble...
|
|
}
|
|
}
|
|
}
|
|
// localNesting
|
|
if ($TSconfig['localNesting']) {
|
|
$lN = t3lib_div::trimExplode(',',strtolower($TSconfig['localNesting']),1);
|
|
while(list(,$tn)=each($lN)) {
|
|
if (isset($keepTags[$tn])) {
|
|
$keepTags[$tn]['nesting']=1;
|
|
}
|
|
}
|
|
}
|
|
if ($TSconfig['globalNesting']) {
|
|
$lN = t3lib_div::trimExplode(',',strtolower($TSconfig['globalNesting']),1);
|
|
while(list(,$tn)=each($lN)) {
|
|
if (isset($keepTags[$tn])) {
|
|
if (!is_array($keepTags[$tn])) $keepTags[$tn]=array();
|
|
$keepTags[$tn]['nesting']='global';
|
|
}
|
|
}
|
|
}
|
|
if ($TSconfig['rmTagIfNoAttrib']) {
|
|
$lN = t3lib_div::trimExplode(',',strtolower($TSconfig['rmTagIfNoAttrib']),1);
|
|
while(list(,$tn)=each($lN)) {
|
|
if (isset($keepTags[$tn])) {
|
|
if (!is_array($keepTags[$tn])) $keepTags[$tn]=array();
|
|
$keepTags[$tn]['rmTagIfNoAttrib']=1;
|
|
}
|
|
}
|
|
}
|
|
if ($TSconfig['noAttrib']) {
|
|
$lN = t3lib_div::trimExplode(',',strtolower($TSconfig['noAttrib']),1);
|
|
while(list(,$tn)=each($lN)) {
|
|
if (isset($keepTags[$tn])) {
|
|
if (!is_array($keepTags[$tn])) $keepTags[$tn]=array();
|
|
$keepTags[$tn]['allowedAttribs']=0;
|
|
}
|
|
}
|
|
}
|
|
if ($TSconfig['removeTags']) {
|
|
$lN = t3lib_div::trimExplode(',',strtolower($TSconfig['removeTags']),1);
|
|
while(list(,$tn)=each($lN)) {
|
|
$keepTags[$tn]=array();
|
|
$keepTags[$tn]['allowedAttribs']=0;
|
|
$keepTags[$tn]['rmTagIfNoAttrib']=1;
|
|
}
|
|
}
|
|
|
|
// Create additional configuration:
|
|
$addConfig=array();
|
|
if ($TSconfig['xhtml_cleaning']) {
|
|
$addConfig['xhtml']=1;
|
|
}
|
|
|
|
return array(
|
|
$keepTags,
|
|
''.$TSconfig['keepNonMatchedTags'],
|
|
intval($TSconfig['htmlSpecialChars']),
|
|
$addConfig
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Tries to convert the content to be XHTML compliant and other stuff like that.
|
|
* STILL EXPERIMENTAL. See comments below.
|
|
*
|
|
* What it does NOT do (yet) according to XHTML specs.:
|
|
* - Wellformedness: Nesting is NOT checked
|
|
* - name/id attribute issue is not observed at this point.
|
|
* - Certain nesting of elements not allowed. Most interesting, <PRE> cannot contain img, big,small,sub,sup ...
|
|
* - Wrapping scripts and style element contents in CDATA - or alternatively they should have entitites converted.
|
|
* - Setting charsets may put some special requirements on both XML declaration/ meta-http-equiv. (C.9)
|
|
* - UTF-8 encoding is in fact expected by XML!!
|
|
* - stylesheet element and attribute names are NOT converted to lowercase
|
|
* - ampersands (and entities in general I think) MUST be converted to an entity reference! (&s;). This may mean further conversion of non-tag content before output to page. May be related to the charset issue as a whole.
|
|
* - Minimized values not allowed: Must do this: selected="selected"
|
|
*
|
|
* What it does at this point:
|
|
* - All tags (frame,base,meta,link + img,br,hr,area,input) is ended with "/>" - others?
|
|
* - Lowercase for elements and attributes
|
|
* - All attributes in quotes
|
|
* - Add "alt" attribute to img-tags if it's not there already.
|
|
*
|
|
* @param string Content to clean up
|
|
* @return string Cleaned up content returned.
|
|
* @access private
|
|
*/
|
|
function XHTML_clean($content) {
|
|
$content = $this->HTMLcleaner(
|
|
$content,
|
|
array(), // No tags treated specially
|
|
1, // Keep ALL tags.
|
|
0, // All content is htmlspecialchar()'ed (or ??) - if we do, <script> content will break...
|
|
array('xhtml' => 1)
|
|
);
|
|
return $content;
|
|
}
|
|
|
|
/**
|
|
* Processing all tags themselves
|
|
* (Some additions by Sacha Vorbeck)
|
|
*
|
|
* @param string Tag to process
|
|
* @param array Configuration array passing instructions for processing. If count()==0, function will return value unprocessed. See source code for details
|
|
* @param boolean Is endtag, then set this.
|
|
* @param boolean If set, just return value straight away
|
|
* @return string Processed value.
|
|
* @access private
|
|
*/
|
|
function processTag($value,$conf,$endTag,$protected=0) {
|
|
// Return immediately if protected or no parameters
|
|
if ($protected || !count($conf)) return $value;
|
|
// OK then, begin processing for XHTML output:
|
|
// STILL VERY EXPERIMENTAL!!
|
|
if ($conf['xhtml']) {
|
|
if ($endTag) { // Endtags are just set lowercase right away
|
|
$value = strtolower($value);
|
|
} elseif (substr($value,0,4)!='<!--') { // ... and comments are ignored.
|
|
$inValue = substr($value,1,(substr($value,-2)=='/>'?-2:-1)); // Finding inner value with out < >
|
|
list($tagName,$tagP)=preg_split('/\s+/s',$inValue,2); // Separate attributes and tagname
|
|
$tagName = strtolower($tagName);
|
|
|
|
// Process attributes
|
|
$tagAttrib = $this->get_tag_attributes($tagP);
|
|
if (!strcmp($tagName,'img') && !isset($tagAttrib[0]['alt'])) $tagAttrib[0]['alt']=''; // Set alt attribute for all images (not XHTML though...)
|
|
if (!strcmp($tagName,'script') && !isset($tagAttrib[0]['type'])) $tagAttrib[0]['type']='text/javascript'; // Set type attribute for all script-tags
|
|
$outA=array();
|
|
reset($tagAttrib[0]);
|
|
while(list($attrib_name,$attrib_value)=each($tagAttrib[0])) {
|
|
// Set attributes: lowercase, always in quotes, with htmlspecialchars converted.
|
|
$outA[]=$attrib_name.'="'.$this->bidir_htmlspecialchars($attrib_value,2).'"';
|
|
}
|
|
$newTag='<'.trim($tagName.' '.implode(' ',$outA));
|
|
// All tags that are standalone (not wrapping, not having endtags) should be ended with '/>'
|
|
if (t3lib_div::inList('img,br,hr,meta,link,base,area,input,param,col',$tagName) || substr($value,-2)=='/>') {
|
|
$newTag.=' />';
|
|
} else {
|
|
$newTag.='>';
|
|
}
|
|
$value = $newTag;
|
|
}
|
|
}
|
|
|
|
return $value;
|
|
}
|
|
|
|
/**
|
|
* Processing content between tags for HTML_cleaner
|
|
*
|
|
* @param string The value
|
|
* @param integer Direction, either -1 or +1. 0 (zero) means no change to input value.
|
|
* @param mixed Not used, ignore.
|
|
* @return string The processed value.
|
|
* @access private
|
|
*/
|
|
function processContent($value,$dir,$conf) {
|
|
if ($dir!=0) $value = $this->bidir_htmlspecialchars($value,$dir);
|
|
return $value;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_parsehtml.php']) {
|
|
include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_parsehtml.php']);
|
|
}
|
|
?>
|