Bug #22296 » 13858_v6.diff
typo3/sysext/indexed_search/class.indexer.php (working copy) | ||
---|---|---|
$qParts = parse_url($linkSource); // parse again due to new linkSource!
|
||
}
|
||
if ($qParts['scheme']) {
|
||
if (!$linkInfo['localPath'] && $qParts['scheme']) {
|
||
if ($this->indexerConfig['indexExternalURLs']) {
|
||
// Index external URL (http or otherwise)
|
||
$this->indexExternalUrl($linkSource);
|
||
... | ... | |
}
|
||
/**
|
||
* Extracts all links to external documents from content string.
|
||
* Extracts all links to external documents from the HTML content string
|
||
*
|
||
* @param string Content to analyse
|
||
* @return array Array of hyperlinks
|
||
* @param string $html
|
||
* @return array Array of hyperlinks (keys: tag, href, localPath (empty if not local))
|
||
* @see extractLinks()
|
||
*/
|
||
function extractHyperLinks($string) {
|
||
if (!is_object($this->htmlParser)) {
|
||
$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
|
||
}
|
||
function extractHyperLinks($html) {
|
||
$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
|
||
$htmlParts = $htmlParser->splitTags('a', $html);
|
||
$hyperLinksData = array();
|
||
foreach ($htmlParts as $index => $tagData) {
|
||
if (($index % 2) !== 0) {
|
||
$tagAttributes = $htmlParser->get_tag_attributes($tagData, TRUE);
|
||
$firstTagName = $htmlParser->getFirstTagName($tagData);
|
||
$parts = $this->htmlParser->splitTags('a',$string);
|
||
$list = array();
|
||
foreach ($parts as $k => $v) {
|
||
if ($k%2) {
|
||
$params = $this->htmlParser->get_tag_attributes($v,1);
|
||
$firstTagName = $this->htmlParser->getFirstTagName($v); // The 'name' of the first tag
|
||
switch (strtolower($firstTagName)) {
|
||
case 'a':
|
||
$src = $params[0]['href'];
|
||
if ($src) {
|
||
// Check if a local path to that file has been set - useful if you are using a download script.
|
||
$md5 = t3lib_div::shortMD5($src);
|
||
if (is_array($indexLocalFiles=$GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'])) {
|
||
$localPath = isset($indexLocalFiles[$md5]) ? $indexLocalFiles[$md5] : '';
|
||
} else $localPath=false;
|
||
$list[] = array(
|
||
'tag' => $v,
|
||
'href' => $params[0]['href'],
|
||
'localPath' => $localPath
|
||
if (strtolower($firstTagName) == 'a') {
|
||
if ($tagAttributes[0]['href'] && $tagAttributes[0]['href']{0} != '#') {
|
||
$hyperLinksData[] = array(
|
||
'tag' => $tagData,
|
||
'href' => $tagAttributes[0]['href'],
|
||
'localPath' => $this->createLocalPath($tagAttributes[0]['href'])
|
||
);
|
||
}
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
return $list;
|
||
return $hyperLinksData;
|
||
}
|
||
/**
|
||
... | ... | |
* @param string Content to analyze
|
||
* @return string The base href or an empty string if not found
|
||
*/
|
||
public function extractBaseHref($string) {
|
||
if (!is_object($this->htmlParser)) {
|
||
$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
|
||
}
|
||
$parts = $this->htmlParser->splitTags('base', $string);
|
||
foreach ($parts as $key => $value) {
|
||
if ($key % 2) {
|
||
$params = $this->htmlParser->get_tag_attributes($value, 1);
|
||
$firstTagName = $this->htmlParser->getFirstTagName($value); // The 'name' of the first tag
|
||
switch (strtolower($firstTagName)) {
|
||
case 'base':
|
||
$href = $params[0]['href'];
|
||
public function extractBaseHref($html) {
|
||
$href = '';
|
||
$htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
|
||
$htmlParts = $htmlParser->splitTags('base', $html);
|
||
foreach ($htmlParts as $index => $tagData) {
|
||
if (($index % 2) !== 0) {
|
||
$tagAttributes = $htmlParser->get_tag_attributes($tagData, true);
|
||
$firstTagName = $htmlParser->getFirstTagName($tagData);
|
||
if (strtolower($firstTagName) == 'base') {
|
||
$href = $tagAttributes[0]['href'];
|
||
if ($href) {
|
||
// Return the first "base href" found (a single one should be present anyway)
|
||
return $href;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
return '';
|
||
return $href;
|
||
}
|
||
/******************************************
|
||
*
|
||
* Indexing; external URL
|
||
... | ... | |
/**
 * Resolves a link target to the path of a local file.
 *
 * The resolver methods listed in $pathFunctions are tried in order and the
 * result of the first one that yields a non-empty path is returned.
 *
 * @param string $sourcePath Link target (href) to resolve
 * @return string Path to the local file, or an empty string if no resolver matched
 */
protected function createLocalPath($sourcePath) {
	static $pathFunctions = array(
		'createLocalPathFromT3vars',
		'createLocalPathUsingAbsRefPrefix',
		'createLocalPathUsingDomainURL',
		'createLocalPathFromAbsoluteURL',
		'createLocalPathFromRelativeURL'
	);

	foreach ($pathFunctions as $functionName) {
		$localPath = $this->$functionName($sourcePath);
		// Only a non-empty string counts as a match; anything else (including
		// NULL from a resolver that produced no result) means "try the next one".
		if (is_string($localPath) && $localPath !== '') {
			return $localPath;
		}
	}

	// No resolver matched: always hand back a string, as documented.
	return '';
}
|
||
/**
 * Looks up the link target in the T3_VAR registry of local files.
 *
 * Download extensions that hide the real file name can register a mapping in
 * $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'], keyed by
 * t3lib_div::shortMD5() of the link target, so that the file is still indexed.
 *
 * @param string $sourcePath Link target (href) to look up
 * @return string Registered path if it points to an existing file, otherwise an empty string
 */
protected function createLocalPathFromT3vars($sourcePath) {
	$registeredFiles = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
	if (!is_array($registeredFiles)) {
		return '';
	}

	$hash = t3lib_div::shortMD5($sourcePath);
	if (!isset($registeredFiles[$hash])) {
		return '';
	}

	// Note: self::isAllowedLocalFile is deliberately not used here because
	// this registry may point to files outside of the web site (for example,
	// protected downloads). Only the existence of the file is checked.
	$candidate = $registeredFiles[$hash];
	return is_file($candidate) ? $candidate : '';
}
|
||
/**
 * Attempts to create a local file path by stripping the site URL of the
 * current request (TYPO3_SITE_URL) from the beginning of the link target.
 *
 * @param string $sourcePath Link target (href) to resolve
 * @return string Path below PATH_site if the target starts with the site URL and points to an allowed file, otherwise an empty string
 */
protected function createLocalPathUsingDomainURL($sourcePath) {
	// Must be initialised: the method has to return a string even when the
	// link target does not start with the site URL.
	$localPath = '';
	$baseURL = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
	$baseURLLength = strlen($baseURL);
	// An empty site URL would be a prefix of every link target, so it is
	// never treated as a match.
	if ($baseURLLength > 0 && substr($sourcePath, 0, $baseURLLength) === $baseURL) {
		$localPath = PATH_site . substr($sourcePath, $baseURLLength);
		if (!self::isAllowedLocalFile($localPath)) {
			$localPath = '';
		}
	}
	return $localPath;
}
|
||
/**
 * Attempts to create a local file path by stripping config.absRefPrefix from
 * the beginning of the link target.
 *
 * This needs a tslib_fe instance in $GLOBALS['TSFE']; without one, or with an
 * empty absRefPrefix, nothing is resolved.
 *
 * @param string $sourcePath Link target (href) to resolve
 * @return string Path below PATH_site if the prefix matched and the file is allowed, otherwise an empty string
 */
protected function createLocalPathUsingAbsRefPrefix($sourcePath) {
	if (!($GLOBALS['TSFE'] instanceof tslib_fe)) {
		return '';
	}

	$prefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
	$prefixLength = strlen($prefix);
	if (!($prefixLength > 0)) {
		return '';
	}
	if (substr($sourcePath, 0, $prefixLength) != $prefix) {
		return '';
	}

	// Whatever follows the prefix is taken as a path relative to the site root.
	$candidate = PATH_site . substr($sourcePath, $prefixLength);
	return self::isAllowedLocalFile($candidate) ? $candidate : '';
}
|
||
/**
 * Attempts to create a local file path from an absolute URL without scheme,
 * i.e. a link target that starts with a slash (for example "/typo3/index.php").
 *
 * @param string $sourcePath Link target (href) to resolve
 * @return string Path below PATH_site if the target starts with "/" and points to an allowed file, otherwise an empty string
 */
protected function createLocalPathFromAbsoluteURL($sourcePath) {
	// Must be initialised: the method has to return a string even when the
	// link target does not start with a slash.
	$localPath = '';
	// substr() instead of $sourcePath{0}: safe for an empty string and valid
	// on every PHP version (the curly brace offset syntax was removed in PHP 8).
	if (substr($sourcePath, 0, 1) === '/') {
		$localPath = PATH_site . substr($sourcePath, 1);
		if (!self::isAllowedLocalFile($localPath)) {
			$localPath = '';
		}
	}
	return $localPath;
}
|
||
/**
 * Attempts to create a local file path from a relative URL by appending the
 * link target to PATH_site.
 *
 * @param string $sourcePath Link target (href) to resolve
 * @return string Path below PATH_site if the target is a relative URL and points to an allowed file, otherwise an empty string
 */
protected function createLocalPathFromRelativeURL($sourcePath) {
	// Must be initialised: the method has to return a string even when the
	// link target is not a relative URL.
	$localPath = '';
	if (self::isRelativeURL($sourcePath)) {
		$localPath = PATH_site . $sourcePath;
		if (!self::isAllowedLocalFile($localPath)) {
			$localPath = '';
		}
	}
	return $localPath;
}
|
||
/**
 * Checks if URL is relative, i.e. it has no scheme and its path does not
 * start with a slash.
 *
 * @param string $url URL to check
 * @return boolean TRUE if the URL is relative; FALSE otherwise, including when the URL cannot be parsed
 */
static protected function isRelativeURL($url) {
	// parse_url() returns FALSE for seriously malformed URLs; the warning it
	// may emit on old PHP versions is suppressed on purpose.
	$urlParts = @parse_url($url);
	if (!is_array($urlParts)) {
		return false;
	}

	// Both keys are optional in the parse_url() result.
	$scheme = isset($urlParts['scheme']) ? $urlParts['scheme'] : '';
	$path = isset($urlParts['path']) ? $urlParts['path'] : '';

	// substr() instead of $path{0}: safe for an empty path and valid on every
	// PHP version (the curly brace offset syntax was removed in PHP 8).
	return ($scheme === '' && substr($path, 0, 1) !== '/');
}
|
||
/**
 * Checks if the path points to an existing file inside the web site.
 *
 * The path is first passed through t3lib_div::resolveBackPath() (presumably
 * to collapse "../" segments - confirm against t3lib_div) and must then both
 * start with PATH_site and be a regular file.
 *
 * @param string $filePath Path to check
 * @return boolean TRUE if the resolved path starts with PATH_site and is a file
 */
static protected function isAllowedLocalFile($filePath) {
	$resolvedPath = t3lib_div::resolveBackPath($filePath);

	$existsAsFile = is_file($resolvedPath);

	$sitePathLength = strlen(PATH_site);
	$startsWithSitePath = (substr($resolvedPath, 0, $sitePathLength) == PATH_site);

	return $startsWithSitePath && $existsAsFile;
}
|
||
/******************************************
|
||
*
|
||
* Indexing; external files (PDF, DOC, etc)
|
typo3/sysext/indexed_search/tests/tx_indexedsearch_indexer_testcase.php (revision 0) | ||
---|---|---|
<?php
|
||
/***************************************************************
|
||
* Copyright notice
|
||
*
|
||
* (c) 2010 Dmitry Dulepov (dmitry.dulepov@gmail.com)
|
||
* All rights reserved
|
||
*
|
||
* This script is part of the TYPO3 project. The TYPO3 project is
|
||
* free software; you can redistribute it and/or modify
|
||
* it under the terms of the GNU General Public License as published by
|
||
* the Free Software Foundation; either version 2 of the License, or
|
||
* (at your option) any later version.
|
||
*
|
||
* The GNU General Public License can be found at
|
||
* http://www.gnu.org/copyleft/gpl.html.
|
||
*
|
||
* This script is distributed in the hope that it will be useful,
|
||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
* GNU General Public License for more details.
|
||
*
|
||
* This copyright notice MUST APPEAR in all copies of the script!
|
||
***************************************************************/
|
||
/**
|
||
* $Id$
|
||
*
|
||
*/
|
||
require_once(t3lib_extMgm::extPath('indexed_search', 'class.indexer.php'));
|
||
/**
|
||
* This class contains unit tests for the indexer
|
||
*
|
||
* @author Dmitry Dulepov <dmitry.dulepov@gmail.com>
|
||
* @author Christian Kuhn <lolli@schwarzbu.ch>
|
||
* @package TYPO3
|
||
* @subpackage tx_indexedsearch
|
||
*/
|
||
class tx_indexedsearch_indexer_testcase extends tx_phpunit_testcase {

	/**
	 * Indexer instance
	 *
	 * @var tx_indexedsearch_indexer
	 */
	protected $indexer;

	/**
	 * Name of the temporary file created by a test (empty if none was created)
	 *
	 * @var string
	 */
	protected $temporaryFileName = '';

	/**
	 * Sets up the test
	 *
	 * @return void
	 */
	public function setUp() {
		$this->indexer = t3lib_div::makeInstance('tx_indexedsearch_indexer');
	}

	/**
	 * Explicitly cleans up the indexer object to prevent any memory leaks and
	 * removes the temporary file if a test created one
	 *
	 * @return void
	 */
	public function tearDown() {
		unset($this->indexer);
		if ($this->temporaryFileName) {
			// Best effort: a file that is already gone must not fail the test
			@unlink($this->temporaryFileName);
		}
	}

	/**
	 * Checks that non-existing files are not returned
	 *
	 * @return void
	 */
	public function testNonExistingLocalPath() {
		$html = 'test <a href="' . md5(uniqid('')) . '">test</a> test';
		$result = $this->indexer->extractHyperLinks($html);

		$this->assertEquals(1, count($result), 'Wrong number of parsed links');
		$this->assertEquals('', $result[0]['localPath'], 'Local path is incorrect');
	}

	/**
	 * Checks that a file registered in T3_VAR is returned as local path
	 *
	 * @return void
	 */
	public function testLocalPathWithT3Vars() {
		$this->temporaryFileName = tempnam(sys_get_temp_dir(), 't3unit-');
		$html = 'test <a href="testfile">test</a> test';

		// Register the temporary file for the link target, run the indexer and
		// restore the previous registry before asserting.
		$savedValue = $GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'];
		$GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] = array(
			t3lib_div::shortMD5('testfile') => $this->temporaryFileName
		);
		$result = $this->indexer->extractHyperLinks($html);
		$GLOBALS['T3_VAR']['ext']['indexed_search']['indexLocalFiles'] = $savedValue;

		$this->assertEquals(1, count($result), 'Wrong number of parsed links');
		$this->assertEquals($this->temporaryFileName, $result[0]['localPath'], 'Local path is incorrect');
	}

	/**
	 * Tests that a link prefixed with the site URL is resolved to a local path
	 *
	 * @return void
	 */
	public function testLocalPathWithSiteURL() {
		$baseURL = t3lib_div::getIndpEnv('TYPO3_SITE_URL');
		$html = 'test <a href="' . $baseURL . 'index.php">test</a> test';
		$result = $this->indexer->extractHyperLinks($html);

		$this->assertEquals(1, count($result), 'Wrong number of parsed links');
		$this->assertEquals(PATH_site . 'index.php', $result[0]['localPath'], 'Local path is incorrect');
	}

	/**
	 * Tests relative path
	 *
	 * @return void
	 */
	public function testRelativeLocalPath() {
		$html = 'test <a href="index.php">test</a> test';
		$result = $this->indexer->extractHyperLinks($html);

		$this->assertEquals(1, count($result), 'Wrong number of parsed links');
		$this->assertEquals(PATH_site . 'index.php', $result[0]['localPath'], 'Local path is incorrect');
	}

	/**
	 * Tests absolute path (link target starting with a slash)
	 *
	 * @return void
	 */
	public function testAbsoluteLocalPath() {
		// Part of PATH_typo3 below PATH_site, including the leading slash
		$path = substr(PATH_typo3, strlen(PATH_site) - 1);
		$html = 'test <a href="' . $path . 'index.php">test</a> test';
		$result = $this->indexer->extractHyperLinks($html);

		$this->assertEquals(1, count($result), 'Wrong number of parsed links');
		$this->assertEquals(PATH_typo3 . 'index.php', $result[0]['localPath'], 'Local path is incorrect');
	}

	/**
	 * Tests that a path with the absRefPrefix returns correct result
	 *
	 * @return void
	 */
	public function testLocalPathWithAbsRefPrefix() {
		// The absRefPrefix resolver only works with a tslib_fe instance in
		// $GLOBALS['TSFE']; without it this test cannot produce a meaningful result.
		if (!($GLOBALS['TSFE'] instanceof tslib_fe)) {
			$this->markTestSkipped('This test requires a tslib_fe instance in $GLOBALS[\'TSFE\'].');
		}

		$absRefPrefix = '/' . md5(uniqid(''));
		$html = 'test <a href="' . $absRefPrefix . 'index.php">test</a> test';

		$savedPrefix = $GLOBALS['TSFE']->config['config']['absRefPrefix'];
		$GLOBALS['TSFE']->config['config']['absRefPrefix'] = $absRefPrefix;
		$result = $this->indexer->extractHyperLinks($html);
		$GLOBALS['TSFE']->config['config']['absRefPrefix'] = $savedPrefix;

		$this->assertEquals(1, count($result), 'Wrong number of parsed links');
		$this->assertEquals(PATH_site . 'index.php', $result[0]['localPath'], 'Local path is incorrect');
	}

	/**
	 * Checks that base HREF is extracted correctly
	 *
	 * @return void
	 */
	public function testExtractBaseHref() {
		$baseHref = 'http://example.com/';
		$html = '<html><head><Base Href="' . $baseHref . '" /></head></html>';
		$result = $this->indexer->extractBaseHref($html);

		$this->assertEquals($baseHref, $result, 'Incorrect base href was extracted');
	}
}
|
||
// XCLASS hook: lets an extension register a file that extends this class.
// The registry is read through $GLOBALS because this file may be included
// from inside a function, where a plain $TYPO3_CONF_VARS would be undefined.
// The key mirrors the actual path of this file below the extension directory.
if (defined('TYPO3_MODE') && isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/tests/tx_indexedsearch_indexer_testcase.php']) && $GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/tests/tx_indexedsearch_indexer_testcase.php']) {
	include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['ext/indexed_search/tests/tx_indexedsearch_indexer_testcase.php']);
}
|
||
?>
|