Project

General

Profile

Feature #19899 » class.external_parser.php_new.patch

Administrator Admin, 2009-01-26 12:30

View differences:

class.external_parser.php (.../trunk/indexed_search/class.external_parser.php) (revision 30)
* @return boolean Returns true if extension is supported/enabled, otherwise false.
*/
function initParser($extension) {
// Then read indexer-config and set if appropriate:
$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
......
} else $this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
} else $this->pObj->log_setTSlogMessage('PDF tools disabled',1);
break;
case 'dot':	// MS Word
case 'doc':	// MS Word
		// Catdoc: only set up when a tool directory has been configured
	if ($indexerConfig['catdoc']) {
			// Normalise the configured directory to exactly one trailing slash.
			// preg_replace() is used here, as in the zip/tar branches, because ereg_replace() is deprecated.
		$catdocPath = preg_replace('/\/$/', '', $indexerConfig['catdoc']) . '/';
if (ini_get('safe_mode') || @is_file($catdocPath.'catdoc'.$exe)) {
......
} else $this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in path '".$catdocPath."catdoc'",3);
} else $this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1);
break;
case 'pot': // Powerpoint...
case 'pps': // MS PowerPoint(?)
case 'ppt': // MS PowerPoint
// ppthtml
......
} else $this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1);
break;
case 'xls':	// MS Excel
case 'xlt':	// MS Excel Template
		// Xlhtml: only set up when a tool directory has been configured
	if ($indexerConfig['xlhtml']) {
			// Normalise the configured directory to exactly one trailing slash.
			// preg_replace() is used here, as in the zip/tar branches, because ereg_replace() is deprecated.
		$xlhtmlPath = preg_replace('/\/$/', '', $indexerConfig['xlhtml']) . '/';
......
} else $this->pObj->log_setTSlogMessage("'unzip' tool for reading OpenOffice.org-files was not found in path '".$unzipPath."unzip'",3);
} else $this->pObj->log_setTSlogMessage('unzip tool (OpenOffice.org-files) disabled',1);
break;
case 'zip':	// ZIP files
		// Without a configured unzip directory the extension stays unsupported.
	if (!$indexerConfig['unzip']) {
		$this->pObj->log_setTSlogMessage('unzip tool (ZIP-files) disabled',1);
		break;
	}
		// Configured directory with exactly one trailing slash
	$unzipPath = preg_replace('/\/$/','',$indexerConfig['unzip']).'/';
	$unzipBinary = $unzipPath.'unzip'.$exe;
		// Under safe_mode the file check is bypassed and the binary is accepted as configured.
	if (ini_get('safe_mode') || @is_file($unzipBinary)) {
		$this->app['unzip'] = $unzipBinary;
		$extOK = TRUE;
	} else {
		$this->pObj->log_setTSlogMessage("'unzip' tool for reading ZIP-files was not found in path '".$unzipPath."unzip'",3);
	}
break;
case 'tar':	// tarballs
case 'tgz':	// gzip'ed TAR
case 'tbz':	// bzip'ed TAR
		// Without a configured tar directory the extension stays unsupported.
	if (!$indexerConfig['tar']) {
		$this->pObj->log_setTSlogMessage('tar tool (tarballs) disabled', 1);
		break;
	}
		// Configured directory with exactly one trailing slash
	$tarPath = preg_replace('/\/$/', '', $indexerConfig['tar']).'/';
	$tarBinary = $tarPath.'tar'.$exe;
		// Under safe_mode the file check is bypassed and the binary is accepted as configured.
	if (ini_get('safe_mode') || @is_file($tarBinary)) {
		$this->app['tar'] = $tarBinary;
		$extOK = TRUE;
	} else {
		$this->pObj->log_setTSlogMessage("'tar' tool for reading tarballs was not found in path '".$tarPath."tar'", 3);
	}
break;
case 'docx':	// Microsoft Word 2007/2008
case 'dotx':
case 'xlsx':	// Microsoft Excel 2007/2008
case 'xltx':
case 'pptx':	// Microsoft PowerPoint
case 'ppsx':
case 'potx':
		// These formats are read through the unzip tool; without a configured path they stay unsupported.
	if (!$indexerConfig['unzip']) {
		$this->pObj->log_setTSlogMessage('unzip tool (docx/pptx/xlsx) disabled', 1);
		break;
	}
		// Configured directory with exactly one trailing slash
	$unzipPath = preg_replace('/\/$/', '', $indexerConfig['unzip']).'/';
	$unzipBinary = $unzipPath.'unzip'.$exe;
		// Under safe_mode the file check is bypassed and the binary is accepted as configured.
	if (ini_get('safe_mode') || @is_file($unzipBinary)) {
		$this->app['unzip'] = $unzipBinary;
		$extOK = TRUE;
	} else {
		$this->pObj->log_setTSlogMessage("'unzip' tool for reading DOCX/XLSX/PPTX files was not found in path '".$unzipPath."unzip'",3);
	}
break;
case 'rtf':
// Catdoc
if ($indexerConfig['unrtf']) {
......
switch($extension) {
case 'pdf': // PDF
case 'doc': // MS Word files
case 'dot': // MS Word files
case 'pps': // MS PowerPoint
case 'ppt': // MS PowerPoint
case 'pot': // MS PowerPoint
case 'xls': // MS Excel
case 'xlt': // MS Excel
case 'sxc': // Open Office Calc.
case 'sxi': // Open Office Impress
case 'sxw': // Open Office Writer
case 'ods': // Oasis OpenDocument Spreadsheet
case 'odp': // Oasis OpenDocument Presentation
case 'odt': // Oasis OpenDocument Text
case 'zip': // ZIP
case 'tar': // TAR (TapeARchiver)
case 'tbz': // BZip'ed tarball
case 'tgz': // gzip'ed tarball
case 'docx':	// Microsoft Office 2007/2008: Word document
case 'dotx':	// Microsoft Office 2007/2008: Word template
case 'pptx':	// Microsoft Office 2007/2008: PowerPoint presentation
case 'ppsx':	// Microsoft Office 2007/2008: PowerPoint slide show
case 'potx':	// Microsoft Office 2007/2008: PowerPoint template (label was the literal ':potx' and never matched)
case 'xlsx':	// Microsoft Office 2007/2008: Excel workbook
case 'xltx':	// Microsoft Office 2007/2008: Excel template
case 'rtf': // RTF documents
case 'txt': // ASCII Text documents
case 'html': // HTML
......
case 'csv':	// Comma Separated Values
case 'xml':	// Generic XML
case 'jpg':	// Jpeg images (EXIF comment)
case 'jpeg':	// Jpeg images (EXIF comment)
case 'tif':	// TIF images (EXIF comment)
		// Every extension listed in this switch is reported as supported.
	return TRUE;
break;
......
}
break;
case 'docx':
case 'dotx':
		// Office 2007 Word files are read through unzip (not catdoc), so their title depends on the unzip setting.
	if ($indexerConfig['unzip']) {
		return 'MS Word';
	}
break;
case 'doc':
case 'dot':
		// Catdoc
	if ($indexerConfig['catdoc']) {
		return 'MS Word';
......
break;
case 'pps':	// MS PowerPoint(?)
case 'ppt':	// MS PowerPoint
case 'pot':	// MS PowerPoint
		// ppthtml handles the binary PowerPoint formats
	if ($indexerConfig['ppthtml']) {
		return 'MS Powerpoint';
	}
break;
case 'pptx':	// MS PowerPoint 2007/2008
case 'ppsx':
case 'potx':
		// Office 2007 PowerPoint files are read through unzip (not ppthtml), so their title depends on the unzip setting.
	if ($indexerConfig['unzip']) {
		return 'MS Powerpoint';
	}
break;
case 'xlsx':	// MS Excel 2007/2008
case 'xltx':
		// Office 2007 Excel files are read through unzip (not xlhtml), so their title depends on the unzip setting.
	if ($indexerConfig['unzip']) {
		return 'MS Excel';
	}
break;
case 'xls':	// MS Excel
case 'xlt':	// MS Excel
		// Xlhtml
	if ($indexerConfig['xlhtml']) {
		return 'MS Excel';
......
return 'Open Office';
}
break;
case 'zip':	// ZIP
		// A title is returned only when an unzip path is configured.
	if ($indexerConfig['unzip']) return 'ZIP';
break;
case 'tar':	// TAR
case 'tgz':	// gzip'ed TAR
case 'tbz':	// bzip'ed TAR
		// A title is returned only when a tar path is configured.
	if ($indexerConfig['tar']) return 'tarball';
break;
case 'rtf':
// Catdoc
if ($indexerConfig['unrtf']) {
......
unset($contentArr);
// Return immediately if initialization didn't set support up:
if (!$this->supportedExtensions[$ext]) return FALSE;
if (!$this->supportedExtensions[$ext]) {
return FALSE;
}
// Switch by file extension
switch ($ext) {
......
// Get pdf content:
$tempFileName = t3lib_div::tempnam('Typo3_indexer'); // Create temporary name
@unlink ($tempFileName); // Delete if exists, just to be safe.
$cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . $tempFileName;
$cmd = $this->app['pdftotext'].' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . $tempFileName;
exec($cmd);
if (@is_file($tempFileName)) {
$content = t3lib_div::getUrl($tempFileName);
......
}
break;
case 'doc':
case 'dot':
if ($this->app['catdoc']) {
$cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
exec($cmd,$res);
......
break;
case 'pps':
case 'ppt':
case 'pot':
if ($this->app['ppthtml']) {
$cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
exec($cmd,$res);
......
}
break;
case 'xls':
case 'xlt':
if ($this->app['xlhtml']) {
$cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
exec($cmd,$res);
......
}
}
break;
case 'docx':
case 'dotx':
case 'xlsx':
case 'xltx':
case 'pptx':
case 'ppsx':
case 'potx':
		// Office 2007/2008 files: individual members of the file are printed to stdout
		// with "unzip -p" and the XML markup is stripped to get the text.
	if ($this->app['unzip']) {
			// Select the member that holds the main content.
			// Only the first worksheet / first slide is read for Excel / PowerPoint files.
		switch ($ext) {
			case 'docx':
			case 'dotx':
				$contentMember = 'word/document.xml';
			break;
			case 'xlsx':
			case 'xltx':
				$contentMember = 'xl/worksheets/sheet1.xml';
			break;
			case 'ppsx':
			case 'pptx':
			case 'potx':
				$contentMember = 'ppt/slides/slide1.xml';
			break;
		}

			// Content. exec() appends to an existing array, so start from an empty one.
		$res = array();
		$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ' . $contentMember;
		exec($cmd, $res);
		$content_xml = implode(chr(10), $res);
		unset($res);

			// Put a space in front of every tag so that words from adjacent elements do not run together once the tags are stripped.
		$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
		$contentArr = $this->pObj->splitRegularContent($utf8_content);

			// Metainformations:
		$res = array();
		$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' docProps/core.xml';
		exec($cmd, $res);
		$core_xml = implode(chr(10), $res);
		unset($res);

		$coreContent = t3lib_div::xml2tree($core_xml);
		$contentArr['title'] = basename($absFile);

			// NOTE(review): xml2tree() is expected to return an error string instead of an array when parsing fails - confirm; in that case only the file name is used.
		if (is_array($coreContent)) {
			$coreProps = $coreContent['cp:coreProperties'][0]['ch'];

			$contentArr['title'] .= ' ' . $coreProps['dc:title'][0]['values'][0];
				// Subject, description and creator, each separated by a space.
			$contentArr['description'] = (string) $coreProps['dc:subject'][0]['values'][0];
			$contentArr['description'] .= ' ' . $coreProps['dc:description'][0]['values'][0];
			$contentArr['description'] .= ' ' . $coreProps['dc:creator'][0]['values'][0];
			$contentArr['keywords'] .= (string) $coreProps['cp:keywords'][0]['values'][0];
		}
	}
break;
case 'zip':
		// ZIP archive: unpack into typo3temp/, index every contained file through
		// readFileContent() and merge the results into one content array.
	if ($this->app['unzip']) {
		$extractDir = PATH_site . 'typo3temp/';

			// Extract files to temp-dir
		$res = array();
		$cmd = $this->app['unzip'] . ' -q -d ' . escapeshellarg($extractDir) . ' ' . escapeshellarg($absFile);
		exec($cmd, $res);

			// Get the filenames, we know the path.
			// exec() appends to an existing array, so start from an empty one.
		$res = array();
		$cmd = $this->app['unzip'] . ' -Z -1 ' . escapeshellarg($absFile);
		exec($cmd, $res);

		$contentArr['title'] = preg_replace("/.*\/(.[^\/]*)/", "\\1", $absFile);
		foreach (array('description', 'keywords', 'body') as $part) {
			if (!isset($contentArr[$part])) {
				$contentArr[$part] = '';
			}
		}

			// NOTE(review): files are unpacked directly into typo3temp/ and removed afterwards; a private
			// sub-directory per archive would avoid clashes with existing temp files and leftover directories.
		foreach ($res as $filename) {
				// Skip directory entries and names that would point outside the extraction directory.
			if ($filename === '' || substr($filename, -1) === '/' || substr($filename, 0, 1) === '/' || preg_match('/(^|\/)\.\.(\/|$)/', $filename)) {
				continue;
			}
				// Read the file from where it was extracted (previously '/tmp/' was read, where nothing had been unpacked).
				// NOTE(review): the listed name is used as-is, the same way unlink() always used it; the utf8_encode() call was dropped - confirm for non-ASCII names.
			$extractedFile = $extractDir . $filename;
			if (!@is_file($extractedFile)) {
				continue;
			}

				// Separate variables, so the $ext of the archive itself is not overwritten.
			$fileInfo = pathinfo($filename);
			$fileExt = isset($fileInfo['extension']) ? $fileInfo['extension'] : '';

			$fileContent = $this->readFileContent($fileExt, $extractedFile, 0);
				// readFileContent() returns FALSE for extensions that are not set up.
			if (is_array($fileContent)) {
				$contentArr['title'] .= ' ' . $fileContent['title'];
				$contentArr['description'] .= ' ' . $fileContent['description'];
				$contentArr['keywords'] .= ' ' . $fileContent['keywords'];
				$contentArr['body'] .= ' ' . $fileContent['body'];
			}
			unlink($extractedFile);
		}
		unset($res);
	}
break;
case 'tar':
case 'tgz':
case 'tbz':
		// Tarballs (plain, gzip'ed, bzip'ed): unpack into typo3temp/, index every contained file
		// through readFileContent() and merge the results into one content array.
		// The three formats differ only in the option string passed to tar.
	if ($this->app['tar']) {
		$tarOptions = array(
			'tar' => 'xvf',
			'tgz' => 'xzvf',
			'tbz' => 'xjvf',
		);
		$extractDir = PATH_site . 'typo3temp/';

			// Extract files to temp-dir and get the extracted filenames from the verbose output.
			// exec() appends to an existing array, so start from an empty one.
		$res = array();
		$cmd = $this->app['tar'] . ' ' . $tarOptions[$ext] . ' ' . escapeshellarg($absFile) . ' -C ' . escapeshellarg($extractDir);
		exec($cmd, $res);

		$contentArr['title'] = preg_replace("/.*\/(.[^\/]*)/", "\\1", $absFile);
		foreach (array('description', 'keywords', 'body') as $part) {
			if (!isset($contentArr[$part])) {
				$contentArr[$part] = '';
			}
		}

			// NOTE(review): files are unpacked directly into typo3temp/ and removed afterwards; a private
			// sub-directory per archive would avoid clashes with existing temp files and leftover directories.
		foreach ($res as $filename) {
				// Skip directory entries and names that would point outside the extraction directory.
			if ($filename === '' || substr($filename, -1) === '/' || substr($filename, 0, 1) === '/' || preg_match('/(^|\/)\.\.(\/|$)/', $filename)) {
				continue;
			}
				// Read the file from where it was extracted (previously '/tmp/' was read, where nothing had been unpacked).
				// NOTE(review): the listed name is used as-is, the same way unlink() always used it; the utf8_encode() call was dropped - confirm for non-ASCII names.
			$extractedFile = $extractDir . $filename;
			if (!@is_file($extractedFile)) {
				continue;
			}

				// Separate variables, so the $ext of the archive itself is not overwritten.
			$fileInfo = pathinfo($filename);
			$fileExt = isset($fileInfo['extension']) ? $fileInfo['extension'] : '';

			$fileContent = $this->readFileContent($fileExt, $extractedFile, 0);
				// readFileContent() returns FALSE for extensions that are not set up.
			if (is_array($fileContent)) {
				$contentArr['title'] .= ' ' . $fileContent['title'];
				$contentArr['description'] .= ' ' . $fileContent['description'];
				$contentArr['keywords'] .= ' ' . $fileContent['keywords'];
				$contentArr['body'] .= ' ' . $fileContent['body'];
			}
			unlink($extractedFile);
		}
		unset($res);
	}
break;
case 'rtf':
if ($this->app['unrtf']) {
$cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
......
switch ($ext) {
case 'pdf':
		// Getting pdf-info: run pdfinfo on the file and split its output lines.
		// exec() appends to an existing array, so start from an empty one.
	$res = array();
	$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
	exec($cmd, $res);
	$pdfInfo = $this->splitPdfInfo($res);
	unset($res);
......
if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
}
?>
?>
(4-4/5)