Index: typo3/sysext/indexed_search/class.indexer.php
===================================================================
--- typo3/sysext/indexed_search/class.indexer.php	(revision 7383)
+++ typo3/sysext/indexed_search/class.indexer.php	(working copy)
@@ -788,7 +788,7 @@
 				$qParts = parse_url($linkSource); // parse again due to new linkSource!
 			}
 
-			if ($qParts['scheme']) {
+			if (!$linkInfo['localPath'] && $qParts['scheme']) {
 				if ($this->indexerConfig['indexExternalURLs']) {
 					// Index external URL (http or otherwise)
 					$this->indexExternalUrl($linkSource);
@@ -839,46 +839,34 @@
 	}
 
 	/**
-	 * Extracts all links to external documents from content string.
+	 * Extracts all links to external documents from the HTML content string
 	 *
-	 * @param string Content to analyse
-	 * @return array Array of hyperlinks
+	 * @param string $html HTML content to analyse
+	 * @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
 	 * @see extractLinks()
 	 */
-	function extractHyperLinks($string) {
-		if (!is_object($this->htmlParser)) {
-			$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
-		}
+	public function extractHyperLinks($html) {
+		$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
+		$htmlParts = $htmlParser->splitTags('a', $html);
+		$hyperLinksData = array();
+		foreach ($htmlParts as $index => $tagData) {
+			if (($index % 2) !== 0) {
+				$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
+				$firstTagName = $htmlParser->getFirstTagName($tagData);
 
-		$parts = $this->htmlParser->splitTags('a',$string);
-		$list = array();
-		foreach ($parts as $k => $v) {
-			if ($k%2) {
-				$params = $this->htmlParser->get_tag_attributes($v,1);
-				$firstTagName = $this->htmlParser->getFirstTagName($v); // The 'name' of the first tag
-
-				switch (strtolower($firstTagName)) {
-					case 'a':
-						$src = $params[0]['href'];
-						if ($src) {
-							// Check if a local path to that file has been set - useful if you are using a download script.
-							$md5 = t3lib_div::shortMD5($src);
-							if (is_array($indexLocalFiles=$GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'])) {
-								$localPath = isset($indexLocalFiles[$md5]) ? $indexLocalFiles[$md5] : '';
-							} else $localPath=false;
-
-							$list[] = array(
-								'tag' => $v,
-								'href' => $params[0]['href'],
-								'localPath' => $localPath
+				if (strtolower($firstTagName) == 'a') {
+					if ($tagAttributes[0]['href'] && substr($tagAttributes[0]['href'], 0, 1) != '#') {
+						$hyperLinksData[] = array(
+							'tag' => $tagData,
+							'href' => $tagAttributes[0]['href'],
+							'localPath' => $this->createLocalPath($tagAttributes[0]['href'])
 							);
 						}
-						break;
 				}
 			}
 		}
 
-		return $list;
+		return $hyperLinksData;
 	}
 
 	/**
@@ -887,37 +875,26 @@
 	 * @param string Content to analyze
 	 * @return string The base href or an empty string if not found
 	 */
-	public function extractBaseHref($string) {
-		if (!is_object($this->htmlParser)) {
-			$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
-		}
-
-		$parts = $this->htmlParser->splitTags('base', $string);
-		foreach ($parts as $key => $value) {
-			if ($key % 2) {
-				$params = $this->htmlParser->get_tag_attributes($value, 1);
-				$firstTagName = $this->htmlParser->getFirstTagName($value); // The 'name' of the first tag
-
-				switch (strtolower($firstTagName)) {
-					case 'base':
-						$href = $params[0]['href'];
+	public function extractBaseHref($html) {
+		$href = '';
+		$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
+		$htmlParts = $htmlParser->splitTags('base', $html);
+		foreach ($htmlParts as $index => $tagData) {
+			if (($index % 2) !== 0) {
+				$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
+				$firstTagName = $htmlParser->getFirstTagName($tagData);
+				if (strtolower($firstTagName) == 'base' && !empty($tagAttributes[0]['href'])) {
+					$href = $tagAttributes[0]['href'];
 						if ($href) {
-							// Return the first "base href" found (a single one should be present anyway)
-							return $href;
+							break;
 						}
 				}
 			}
 		}
 
-		return '';
+		return $href;
 	}
 
-
-
-
-
-
-
 	/******************************************
 	 *
 	 * Indexing; external URL
@@ -985,16 +962,160 @@
 
 
 
+	/**
+	 * Checks if the file is local
+	 *
+	 * @param string $sourcePath
+	 * @return string Absolute path to file if file is local, else empty string
+	 */
+	protected function createLocalPath($sourcePath) {
+		static $pathFunctions = array(
+			'createLocalPathFromT3vars',
+			'createLocalPathUsingAbsRefPrefix',
+			'createLocalPathUsingDomainURL',
+			'createLocalPathFromAbsoluteURL',
+			'createLocalPathFromRelativeURL'
+		);
+		foreach ($pathFunctions as $functionName) {
+			$localPath = $this->$functionName($sourcePath);
+			if ($localPath != '') {
+				break;
+			}
+		}
+		return $localPath;
+	}
 
+	/**
+	 * Attempts to create a local file path from T3VARs. This is useful for
+	 * various download extensions that hide actual file name but still want the
+	 * file to be indexed.
+	 *
+	 * @param string $sourcePath
+	 * @return string
+	 */
+	protected function createLocalPathFromT3vars($sourcePath) {
+		$localPath = '';
+		$indexLocalFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
+		if (is_array($indexLocalFiles)) {
+			$md5 = t3lib_div::shortMD5($sourcePath);
+				// Note: not using self::isAllowedLocalFile here because this method
+				// is allowed to index files outside of the web site (for example,
+				// protected downloads)
+			if (isset($indexLocalFiles[$md5]) && is_file($indexLocalFiles[$md5])) {
+				$localPath = $indexLocalFiles[$md5];
+			}
+		}
+		return $localPath;
+	}
 
+	/**
+	 * Attempts to create a local file path by matching a current request URL.
+	 *
+	 * @param string $sourcePath
+	 * @return string Local path or empty string if the URL does not match
+	 */
+	protected function createLocalPathUsingDomainURL($sourcePath) {
+		$localPath = '';
+		$baseURL = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
+		$baseURLLength = strlen($baseURL);
+		if (substr($sourcePath, 0, $baseURLLength) == $baseURL) {
+			$sourcePath = substr($sourcePath, $baseURLLength);
+			$localPath = PATH_site . $sourcePath;
+			if (!self::isAllowedLocalFile($localPath)) {
+				$localPath = '';
+			}
+		}
+		return $localPath;
+	}
 
+	/**
+	 * Attempts to create a local file path by matching absRefPrefix. This
+	 * requires TSFE. If TSFE is missing, this function does nothing.
+	 *
+	 * @param string $sourcePath
+	 * @return string
+	 */
+	protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
+		$localPath = '';
+		if ($GLOBALS['TSFE'] instanceof tslib_fe) {
+			$absRefPrefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
+			$absRefPrefixLength = strlen($absRefPrefix);
+			if ($absRefPrefixLength > 0 && substr($sourcePath, 0, $absRefPrefixLength) == $absRefPrefix) {
+				$sourcePath = substr($sourcePath, $absRefPrefixLength);
+				$localPath = PATH_site . $sourcePath;
+				if (!self::isAllowedLocalFile($localPath)) {
+					$localPath = '';
+				}
+			}
+		}
+		return $localPath;
+	}
 
+	/**
+	 * Attempts to create a local file path from the absolute URL without
+	 * schema.
+	 *
+	 * @param string $sourcePath
+	 * @return string Local path or empty string if the URL does not match
+	 */
+	protected function createLocalPathFromAbsoluteURL($sourcePath) {
+		$localPath = '';
+		if (substr($sourcePath, 0, 1) == '/') {
+			$sourcePath = substr($sourcePath, 1);
+			$localPath = PATH_site . $sourcePath;
+			if (!self::isAllowedLocalFile($localPath)) {
+				$localPath = '';
+			}
+		}
+		return $localPath;
+	}
 
+	/**
+	 * Attempts to create a local file path from the relative URL.
+	 *
+	 * @param string $sourcePath
+	 * @return string Local path or empty string if the URL does not match
+	 */
+	protected function createLocalPathFromRelativeURL($sourcePath) {
+		$localPath = '';
+		if (self::isRelativeURL($sourcePath)) {
+			$localPath = PATH_site . $sourcePath;
+			if (!self::isAllowedLocalFile($localPath)) {
+				$localPath = '';
+			}
+		}
+		return $localPath;
+	}
 
+	/**
+	 * Checks if URL is relative.
+	 *
+	 * @param string $url
+	 * @return boolean FALSE also for URLs that parse_url() cannot parse
+	 */
+	static protected function isRelativeURL($url) {
+		$urlParts = @parse_url($url);
+		if (!is_array($urlParts)) {
+			return FALSE;
+		}
+		$scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
+		$path = isset($urlParts['path']) ? $urlParts['path'] : '';
+		return ($scheme == '' && substr($path, 0, 1) != '/');
+	}
 
+	/**
+	 * Checks if the path points to the file inside the web site
+	 *
+	 * @param string $filePath
+	 * @return boolean
+	 */
+	static protected function isAllowedLocalFile($filePath) {
+		$filePath = t3lib_div::resolveBackPath($filePath);
+		$insideWebPath = (substr($filePath, 0, strlen(PATH_site)) == PATH_site);
+		$isFile = is_file($filePath);
+		return $insideWebPath && $isFile;
+	}
 
-
-
 	/******************************************
 	 *
 	 * Indexing; external files (PDF, DOC, etc)
Index: typo3/sysext/indexed_search/tests/tx_indexedsearch_indexer_testcase.php
===================================================================
--- typo3/sysext/indexed_search/tests/tx_indexedsearch_indexer_testcase.php	(revision 0)
+++ typo3/sysext/indexed_search/tests/tx_indexedsearch_indexer_testcase.php	(revision 0)
@@ -0,0 +1,158 @@
+<?php
+/**
+ * Test case for the tx_indexedsearch_indexer class
+ *
+ * @author Christian Kuhn
+ * @package TYPO3
+ * @subpackage tx_indexedsearch
+ */
+class tx_indexedsearch_indexer_testcase extends tx_phpunit_testcase {
+
+	/**
+	 * Indexer instance
+	 *
+	 * @var tx_indexedsearch_indexer
+	 */
+	protected $indexer;
+
+	/**
+	 * A name of the temporary file
+	 *
+	 * @var string
+	 */
+	protected $temporaryFileName = '';
+
+	/**
+	 * Sets up the test
+	 *
+	 * @return void
+	 */
+	public function setUp() {
+		$this->indexer = t3lib_div::makeInstance('tx_indexedsearch_indexer');
+	}
+
+	/**
+	 * Explicitly cleans up the indexer object to prevent any memory leaks
+	 *
+	 * @return void
+	 */
+	public function tearDown() {
+		unset($this->indexer);
+		if ($this->temporaryFileName) {
+			@unlink($this->temporaryFileName);
+			$this->temporaryFileName = '';
+		}
+	}
+
+	/**
+	 * Checks that non-existing files are not returned
+	 *
+	 * @return void
+	 */
+	public function testNonExistingLocalPath() {
+		$html = 'test <a href="' . md5(uniqid('')) . '">test</a> test';
+		$result = $this->indexer->extractHyperLinks($html);
+
+		$this->assertEquals(1, count($result), 'Wrong number of parsed links');
+		$this->assertEquals('', $result[0]['localPath'], 'Local path is incorrect');
+	}
+
+	/**
+	 * Checks that using t3vars returns correct file
+	 *
+	 * @return void
+	 */
+	public function testLocalPathWithT3Vars() {
+		$this->temporaryFileName = tempnam(sys_get_temp_dir(), 't3unit-');
+		$html = 'test <a href="testfile">test</a> test';
+		$savedValue = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
+		$GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] = array(
+			t3lib_div::shortMD5('testfile') => $this->temporaryFileName
+		);
+		$result = $this->indexer->extractHyperLinks($html);
+		$GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] = $savedValue;
+
+		$this->assertEquals(1, count($result), 'Wrong number of parsed links');
+		$this->assertEquals($this->temporaryFileName, $result[0]['localPath'], 'Local path is incorrect');
+	}
+
+	/**
+	 * Tests that a path starting with the site URL resolves to a local file
+	 *
+	 * @return void
+	 */
+	public function testLocalPathWithSiteURL() {
+		$baseURL = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
+		$html = 'test <a href="' . $baseURL . 'index.php">test</a> test';
+		$result = $this->indexer->extractHyperLinks($html);
+
+		$this->assertEquals(1, count($result), 'Wrong number of parsed links');
+		$this->assertEquals(PATH_site . 'index.php', $result[0]['localPath'], 'Local path is incorrect');
+	}
+
+	/**
+	 * Tests relative path
+	 *
+	 * @return void
+	 */
+	public function testRelativeLocalPath() {
+		$html = 'test <a href="index.php">test</a> test';
+		$result = $this->indexer->extractHyperLinks($html);
+
+		$this->assertEquals(1, count($result), 'Wrong number of parsed links');
+		$this->assertEquals(PATH_site . 'index.php', $result[0]['localPath'], 'Local path is incorrect');
+	}
+
+	/**
+	 * Tests absolute path
+	 *
+	 * @return void
+	 */
+	public function testAbsoluteLocalPath() {
+		$path = substr(PATH_typo3, strlen(PATH_site) - 1);
+		$html = 'test <a href="' . $path . 'index.php">test</a> test';
+		$result = $this->indexer->extractHyperLinks($html);
+
+		$this->assertEquals(1, count($result), 'Wrong number of parsed links');
+		$this->assertEquals(PATH_typo3 . 'index.php', $result[0]['localPath'], 'Local path is incorrect');
+	}
+
+	/**
+	 * Tests that a path with the absRefPrefix returns correct result
+	 *
+	 * @return void
+	 */
+	public function testLocalPathWithAbsRefPrefix() {
+		if (!($GLOBALS['TSFE'] instanceof tslib_fe)) {
+			$this->markTestSkipped('absRefPrefix matching requires TSFE');
+		}
+		$absRefPrefix = '/' . md5(uniqid(''));
+		$html = 'test <a href="' . $absRefPrefix . 'index.php">test</a> test';
+		$savedPrefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
+		$GLOBALS['TSFE']->config['config']['absRefPrefix'] = $absRefPrefix;
+		$result = $this->indexer->extractHyperLinks($html);
+		$GLOBALS['TSFE']->config['config']['absRefPrefix'] = $savedPrefix;
+
+		$this->assertEquals(1, count($result), 'Wrong number of parsed links');
+		$this->assertEquals(PATH_site . 'index.php', $result[0]['localPath'], 'Local path is incorrect');
+	}
+
+	/**
+	 * Checks that base HREF is extracted correctly
+	 *
+	 * @return void
+	 */
+	public function testExtractBaseHref() {
+		$baseHref = 'http://example.com/';
+		$html = '<html><head><base href="' . $baseHref . '" /></head></html>';
+		$result = $this->indexer->extractBaseHref($html);
+
+		$this->assertEquals($baseHref, $result, 'Incorrect base href was extracted');
+	}
+}
+
+if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/tests/class.tx_indexedsearch_indexer_testcase.php']) {
+	include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/tests/class.tx_indexedsearch_indexer_testcase.php']);
+}
+
+?>
\ No newline at end of file