Project

General

Profile

Bug #22229 » 13732.diff

Administrator Admin, 2010-03-08 15:32

View differences:

class.indexer.php (working copy)
return $list;
}
/**
 * Extracts the "base href" from a content string.
 *
 * The content is split on "base" tags via the HTML parser and the href
 * attribute of the first such tag that carries a non-empty href is returned.
 *
 * @param string $string Content to analyze
 * @return string The base href or an empty string if not found
 */
public function extractBaseHref($string) {
	// Create the HTML parser lazily, only when it has not been set up yet
	if (!is_object($this->htmlParser)) {
		$this->htmlParser = t3lib_div::makeInstance('t3lib_parseHtml');
	}

	$parts = $this->htmlParser->splitTags('base', $string);
	foreach ($parts as $k => $v) {
		// Only odd-indexed parts are inspected; splitTags() is expected to
		// put the matched tags there and the surrounding content at even indexes
		if (!($k % 2)) {
			continue;
		}

		$params = $this->htmlParser->get_tag_attributes($v, 1);
		// The 'name' of the first tag
		$firstTagName = $this->htmlParser->getFirstTagName($v);
		if (strtolower($firstTagName) !== 'base') {
			continue;
		}

		// A base tag may lack an href attribute (e.g. only "target" is set);
		// empty() avoids an undefined index notice in that case while keeping
		// the same truthiness check as before
		if (!empty($params[0]['href'])) {
			// Return the first "base href" found (a single one should be present anyway)
			return $params[0]['href'];
		}
	}

	return '';
}
class.crawler.php (working copy)
$indexerObj->indexExternalUrl($url);
$url_qParts = parse_url($url);
$baseHref = $indexerObj->extractBaseHref($indexerObj->indexExternalUrl_content);
if (!$baseHref) {
// Extract base href from current URL
$baseHref = $url_qParts['scheme'] . '://' . $url_qParts['host'];
$baseHref .= substr($url_qParts['path'], 0, strrpos($url_qParts['path'], '/'));
}
$baseHref = rtrim($baseHref, '/');
// Get URLs on this page:
$subUrls = array();
$list = $indexerObj->extractHyperLinks($indexerObj->indexExternalUrl_content);
......
$qParts = parse_url($subUrl);
if (!$qParts['scheme']) {
$subUrl = $url_qParts['scheme'].'://'.$url_qParts['host'].'/'.t3lib_div::resolveBackPath($subUrl);
$subUrl = $baseHref . '/' . t3lib_div::resolveBackPath($subUrl);
}
$subUrls[] = $subUrl;
(1-1/2)