Bug #22229 » 13732.diff
class.indexer.php (working copy) | ||
---|---|---|
return $list;
|
||
}
|
||
/**
 * Extracts the "base href" from a content string.
 *
 * The content is split on <base> tags and the href attribute of the first
 * <base> tag carrying a non-empty href is returned. <base> tags without a
 * usable href (for instance ones that only set a target) are skipped
 * without triggering an "undefined index" notice.
 *
 * @param string $string Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($string) {
    // Create the HTML parser lazily on first use and keep it on the instance
    if (!is_object($this->htmlParser)) {
        $this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
    }

    $parts = $this->htmlParser->splitTags('base', $string);

    foreach ($parts as $k => $v) {
        // Only the odd-indexed parts are examined; the even-indexed ones
        // are presumably the content between the tags (see splitTags())
        if (!($k % 2)) {
            continue;
        }

        // The 'name' of the first tag, compared case-insensitively
        $firstTagName = $this->htmlParser->getFirstTagName($v);
        if (strtolower($firstTagName) !== 'base') {
            continue;
        }

        $params = $this->htmlParser->get_tag_attributes($v, 1);

        // A <base> tag is not required to have an href attribute, so the
        // key must be checked before it is read
        if (!empty($params[0]['href'])) {
            // Return the first "base href" found (a single one should be present anyway)
            return $params[0]['href'];
        }
    }

    return '';
}
|
||
class.crawler.php (working copy) | ||
---|---|---|
$indexerObj->indexExternalUrl($url);
|
||
$url_qParts = parse_url($url);
|
||
$baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
|
||
if (!$baseHref) {
|
||
// Extract base href from current URL
|
||
$baseHref = $url_qParts['scheme'] . '://' . $url_qParts['host'];
|
||
$baseHref .= substr($url_qParts['path'], 0, strrpos($url_qParts['path'], '/'));
|
||
}
|
||
$baseHref = rtrim($baseHref, '/');
|
||
// Get URLs on this page:
|
||
$subUrls = array();
|
||
$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
|
||
... | ... | |
$qParts = parse_url($subUrl);
|
||
if (!$qParts['scheme']) {
|
||
$subUrl = $url_qParts['scheme'].'://'.$url_qParts['host'].'/'.t3lib_div::resolveBackPath($subUrl);
|
||
$subUrl = $baseHref . '/' . t3lib_div::resolveBackPath($subUrl);
|
||
}
|
||
$subUrls[] = $subUrl;
|