Feature #19899 » class.external_parser.php_new.patch
class.external_parser.php (.../trunk/indexed_search/class.external_parser.php) (revision 30) | ||
---|---|---|
* @return boolean Returns true if extension is supported/enabled, otherwise false.
|
||
*/
|
||
function initParser($extension) {
|
||
// Then read indexer-config and set if appropriate:
|
||
$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
|
||
... | ... | |
} else $this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
|
||
} else $this->pObj->log_setTSlogMessage('PDF tools disabled',1);
|
||
break;
|
||
case 'doc':
|
||
// Catdoc
|
||
case 'dot': // MS Word
|
||
case 'doc': // MS Word
|
||
// Catdoc
|
||
if ($indexerConfig['catdoc']) {
|
||
// Normalise the configured catdoc directory to exactly one trailing slash.
// preg_replace() is used (as for the unzip/tar paths in this method) because ereg_replace() no longer exists in PHP 7+.
$catdocPath = preg_replace('/\/$/', '', $indexerConfig['catdoc']).'/';
|
||
if (ini_get('safe_mode') || @is_file($catdocPath.'catdoc'.$exe)) {
|
||
... | ... | |
} else $this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in path '".$catdocPath."catdoc'",3);
|
||
} else $this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1);
|
||
break;
|
||
case 'pot': // Powerpoint...
|
||
case 'pps': // MS PowerPoint(?)
|
||
case 'ppt': // MS PowerPoint
|
||
// ppthtml
|
||
... | ... | |
} else $this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1);
|
||
break;
|
||
case 'xls': // MS Excel
|
||
case 'xlt': // MS Excel Template
|
||
// Xlhtml
|
||
if ($indexerConfig['xlhtml']) {
|
||
// Normalise the configured xlhtml directory to exactly one trailing slash.
// preg_replace() is used (as for the unzip/tar paths in this method) because ereg_replace() no longer exists in PHP 7+.
$xlhtmlPath = preg_replace('/\/$/', '', $indexerConfig['xlhtml']).'/';
|
||
... | ... | |
} else $this->pObj->log_setTSlogMessage("'unzip' tool for reading OpenOffice.org-files was not found in path '".$unzipPath."unzip'",3);
|
||
} else $this->pObj->log_setTSlogMessage('unzip tool (OpenOffice.org-files) disabled',1);
|
||
break;
|
||
case 'zip': //zip files
|
||
if ($indexerConfig['unzip']) {
|
||
$unzipPath = preg_replace('/\/$/','',$indexerConfig['unzip']).'/';
|
||
if (ini_get('safe_mode') || @is_file($unzipPath.'unzip'.$exe)) {
|
||
$this->app['unzip'] = $unzipPath.'unzip'.$exe;
|
||
$extOK = TRUE;
|
||
} else $this->pObj->log_setTSlogMessage("'unzip' tool for reading ZIP-files was not found in path '".$unzipPath."unzip'",3);
|
||
} else $this->pObj->log_setTSlogMessage('unzip tool (ZIP-files) disabled',1);
|
||
break;
|
||
case 'tar': // tarballs
|
||
case 'tgz': // gzip'ed TAR
|
||
case 'tbz': // bzip'ed TAR
|
||
if ($indexerConfig['tar']) {
|
||
$tarPath = preg_replace('/\/$/', '', $indexerConfig['tar']).'/';
|
||
if (ini_get('safe_mode') || @is_file($tarPath.'tar'.$exe)) {
|
||
$this->app['tar'] = $tarPath.'tar'.$exe;
|
||
$extOK = TRUE;
|
||
} else $this->pObj->log_setTSlogMessage("'tar' tool for reading tarballs was not found in path '".$tarPath."tar'", 3);
|
||
} else $this->pObj->log_setTSlogMessage('tar tool (tarballs) disabled', 1);
|
||
break;
|
||
case 'docx': // Microsoft Word 2007/2008
|
||
case 'dotx':
|
||
case 'xlsx': // Microsoft Excel 2007/2008
|
||
case 'xltx':
|
||
case 'pptx': // Microsoft PowerPoint
|
||
case 'ppsx':
|
||
case 'potx':
|
||
if ($indexerConfig['unzip']) {
|
||
$unzipPath = preg_replace('/\/$/', '', $indexerConfig['unzip']).'/';
|
||
if (ini_get('safe_mode') || @is_file($unzipPath.'unzip'.$exe)) {
|
||
$this->app['unzip'] = $unzipPath.'unzip'.$exe;
|
||
$extOK = TRUE;
|
||
} else $this->pObj->log_setTSlogMessage("'unzip' tool for reading DOCX/XLSX/PPTX files was not found in path '".$unzipPath."unzip'",3);
|
||
} else $this->pObj->log_setTSlogMessage('unzip tool (docx/pptx/xlsx) disabled', 1);
|
||
break;
|
||
case 'rtf':
|
||
// Catdoc
|
||
if ($indexerConfig['unrtf']) {
|
||
... | ... | |
switch($extension) {
|
||
case 'pdf': // PDF
|
||
case 'doc': // MS Word files
|
||
case 'dot': // MS Word files
|
||
case 'pps': // MS PowerPoint
|
||
case 'ppt': // MS PowerPoint
|
||
case 'pot': // MS PowerPoint
|
||
case 'xls': // MS Excel
|
||
case 'xlt': // MS Excel
|
||
case 'sxc': // Open Office Calc.
|
||
case 'sxi': // Open Office Impress
|
||
case 'sxw': // Open Office Writer
|
||
case 'ods': // Oasis OpenDocument Spreadsheet
|
||
case 'odp': // Oasis OpenDocument Presentation
|
||
case 'odt': // Oasis OpenDocument Text
|
||
case 'zip': // ZIP
|
||
case 'tar': // TAR (TapeARchiver)
|
||
case 'tbz': // BZip'ed tarball
|
||
case 'tgz': // gzip'ed tarball
|
||
case 'docx': // \
|
||
case 'dotx': // \
|
||
case 'pptx': // -- Microsoft Office 2007/2008
|
||
case 'ppsx': // /
|
||
case 'potx': // /
|
||
case 'xlsx': // /
|
||
case 'xltx': ///
|
||
case 'rtf': // RTF documents
|
||
case 'txt': // ASCII Text documents
|
||
case 'html': // HTML
|
||
... | ... | |
case 'csv': // Comma Separated Values
|
||
case 'xml': // Generic XML
|
||
case 'jpg': // Jpeg images (EXIF comment)
|
||
case 'jpeg': // Jpeg images (EXIF comment)
|
||
case 'jpeg': // Jpeg images (EXIF comment)
|
||
case 'tif': // TIF images (EXIF comment)
|
||
return TRUE;
|
||
break;
|
||
... | ... | |
}
|
||
break;
|
||
case 'doc':
|
||
case 'dot':
|
||
case 'docx':
|
||
case 'dotx':
|
||
// Catdoc
|
||
if ($indexerConfig['catdoc']) {
|
||
return 'MS Word';
|
||
... | ... | |
break;
|
||
case 'pps': // MS PowerPoint(?)
|
||
case 'ppt': // MS PowerPoint
|
||
// ppthtml
|
||
case 'pot': // MS PowerPoint
|
||
case 'pptx': // MS PowerPoint
|
||
case 'ppsx': // ...
|
||
case 'potx': //
|
||
// ppthtml
|
||
if ($indexerConfig['ppthtml']) {
|
||
return 'MS Powerpoint';
|
||
}
|
||
break;
|
||
case 'xls': // MS Excel
|
||
case 'xlt': // MS Excel
|
||
case 'xlsx': // MS Excel
|
||
case 'xltx': // ...
|
||
// Xlhtml
|
||
if ($indexerConfig['xlhtml']) {
|
||
return 'MS Excel';
|
||
... | ... | |
return 'Open Office';
|
||
}
|
||
break;
|
||
case 'zip': // ZIP
|
||
if ($indexerConfig['unzip']) {
|
||
return 'ZIP';
|
||
}
|
||
break;
|
||
case 'tar': // TAR
|
||
case 'tgz': // gzip'ed TAR
|
||
case 'tbz': // bzip'ed TAR
|
||
if ($indexerConfig['tar']) {
|
||
return 'tarball';
|
||
}
|
||
break;
|
||
case 'rtf':
|
||
// Catdoc
|
||
if ($indexerConfig['unrtf']) {
|
||
... | ... | |
unset($contentArr);
|
||
// Return immediately if initialization didn't set support up:
|
||
if (!$this->supportedExtensions[$ext]) return FALSE;
|
||
if (!$this->supportedExtensions[$ext]) {
|
||
return FALSE;
|
||
}
|
||
// Switch by file extension
|
||
switch ($ext) {
|
||
... | ... | |
// Get pdf content:
|
||
$tempFileName = t3lib_div::tempnam('Typo3_indexer'); // Create temporary name
|
||
@unlink ($tempFileName); // Delete if exists, just to be safe.
|
||
$cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . $tempFileName;
|
||
$cmd = $this->app['pdftotext'].' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . $tempFileName;
|
||
exec($cmd);
|
||
if (@is_file($tempFileName)) {
|
||
$content = t3lib_div::getUrl($tempFileName);
|
||
... | ... | |
}
|
||
break;
|
||
case 'doc':
|
||
case 'dot':
|
||
if ($this->app['catdoc']) {
|
||
$cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile);
|
||
exec($cmd,$res);
|
||
... | ... | |
break;
|
||
case 'pps':
|
||
case 'ppt':
|
||
case 'pot':
|
||
if ($this->app['ppthtml']) {
|
||
$cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile);
|
||
exec($cmd,$res);
|
||
... | ... | |
}
|
||
break;
|
||
case 'xls':
|
||
case 'xlt':
|
||
if ($this->app['xlhtml']) {
|
||
$cmd = $this->app['xlhtml'] . ' -nc -te ' . escapeshellarg($absFile);
|
||
exec($cmd,$res);
|
||
... | ... | |
}
|
||
}
|
||
break;
|
||
case 'docx':
|
||
case 'dotx':
|
||
case 'xlsx':
|
||
case 'xltx':
|
||
case 'pptx':
|
||
case 'ppsx':
|
||
case 'potx':
|
||
if ($this->app['unzip']) {
	// Office Open XML documents are ZIP containers. Select the archive member(s)
	// that carry the readable text for the given document type.
	$members = array();
	switch ($ext) {
		case 'docx':
		case 'dotx':
			$members = array('word/document.xml');
		break;
		case 'xlsx':
		case 'xltx':
				// Cell text is kept in the shared string table; the worksheet parts mostly
				// reference it by index. Read both, and all sheets rather than only the first.
			$members = array('xl/sharedStrings.xml', 'xl/worksheets/sheet*.xml');
		break;
		case 'ppsx':
		case 'pptx':
		case 'potx':
				// All slides, not only the first one ("unzip" expands the wildcard itself).
			$members = array('ppt/slides/slide*.xml');
		break;
	}

		// Content: stream the selected members to stdout and strip the markup.
	$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile);
	foreach ($members as $member) {
		$cmd .= ' ' . escapeshellarg($member);
	}
	$res = array();	// exec() appends to an existing array, so always start empty
	exec($cmd, $res);
	$content_xml = implode(chr(10), $res);

		// A blank is put in front of every tag so that words from adjacent elements do not run together.
	$utf8_content = trim(strip_tags(str_replace('<', ' <', $content_xml)));
	$contentArr = $this->pObj->splitRegularContent($utf8_content);

		// Meta information from the core properties part:
	$cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' ' . escapeshellarg('docProps/core.xml');
	$res = array();
	exec($cmd, $res);
	$coreContent = t3lib_div::xml2tree(implode(chr(10), $res));
	unset($res);

		// NOTE(review): xml2tree() appears to return a non-array (error message) when the XML
		// cannot be parsed, e.g. if the part is missing - only use the result when it is an array.
	$props = array();
	if (is_array($coreContent) && isset($coreContent['cp:coreProperties'][0]['ch']) && is_array($coreContent['cp:coreProperties'][0]['ch'])) {
		$props = $coreContent['cp:coreProperties'][0]['ch'];
	}
	$meta = array();
	foreach (array('dc:title', 'dc:subject', 'dc:description', 'dc:creator', 'cp:keywords') as $tag) {
		$meta[$tag] = isset($props[$tag][0]['values'][0]) ? (string) $props[$tag][0]['values'][0] : '';
	}

	$contentArr['title'] = trim(basename($absFile) . ' ' . $meta['dc:title']);
	$contentArr['description'] = trim($meta['dc:subject'] . ' ' . $meta['dc:description'] . ' ' . $meta['dc:creator']);
		// Keep whatever splitRegularContent() may already have put into "keywords" and append the document keywords.
	$contentArr['keywords'] = trim((isset($contentArr['keywords']) ? $contentArr['keywords'] . ' ' : '') . $meta['cp:keywords']);
}
|
||
break;
|
||
case 'zip':
case 'tar':
case 'tgz':
case 'tbz':
		// Archives: unpack into a private scratch directory, index every contained file
		// through readFileContent() and merge the results into one content array.
		// ZIP files are handled by "unzip", all tarball flavours by "tar".
	$archiveTool = ($ext == 'zip' ? 'unzip' : 'tar');
	if ($this->app[$archiveTool]) {
			// A unique sub directory keeps archive members from overwriting (or, during
			// clean-up, deleting) unrelated files that live in typo3temp/.
		$scratchDir = PATH_site . 'typo3temp/indexed_search_' . md5(uniqid(mt_rand(), TRUE)) . '/';
		if (@mkdir($scratchDir)) {
			$res = array();	// exec() appends to an existing array, so always start empty
			if ($archiveTool == 'unzip') {
					// Extract quietly; -o avoids the interactive "replace?" prompt on duplicate member names.
				exec($this->app['unzip'] . ' -q -o -d ' . escapeshellarg($scratchDir) . ' ' . escapeshellarg($absFile));
					// Get the member names, one per line:
				exec($this->app['unzip'] . ' -Z -1 ' . escapeshellarg($absFile), $res);
			} else {
					// Verbose extraction prints the member names, one per line.
				$tarFlags = array('tar' => 'xvf', 'tgz' => 'xzvf', 'tbz' => 'xjvf');
				exec($this->app['tar'] . ' ' . $tarFlags[$ext] . ' ' . escapeshellarg($absFile) . ' -C ' . escapeshellarg($scratchDir), $res);
			}

			$contentArr['title'] = basename($absFile);
			foreach (array('description', 'keywords', 'body') as $field) {
				if (!isset($contentArr[$field])) {
					$contentArr[$field] = '';
				}
			}

			$realScratch = realpath($scratchDir);
			foreach ($res as $memberName) {
					// Only regular files that really live below the scratch directory are indexed.
					// This skips directory entries, members that were not extracted, and
					// "../" or symlink members pointing outside of the scratch directory.
				$realMember = realpath($scratchDir . $memberName);
				if ($realScratch === FALSE || $realMember === FALSE || strpos($realMember, $realScratch . DIRECTORY_SEPARATOR) !== 0 || !is_file($realMember)) {
					continue;
				}

					// A separate variable is used so that $ext (the archive's own extension) stays intact.
				$memberInfo = pathinfo($memberName);
				$memberExt = isset($memberInfo['extension']) ? strtolower($memberInfo['extension']) : '';

					// NOTE(review): nested archives recurse without a depth limit - confirm this is acceptable.
				$fileContent = $this->readFileContent($memberExt, $realMember, 0);
				if (is_array($fileContent)) {	// FALSE is returned for unsupported member types
					foreach (array('title', 'description', 'keywords', 'body') as $field) {
						if (isset($fileContent[$field])) {
							$contentArr[$field] .= ' ' . $fileContent[$field];
						}
					}
				}
			}

				// Clean up: walk the scratch directory breadth-first, deleting files on the way,
				// then remove the collected directories deepest-first.
			$dirs = array(rtrim($scratchDir, '/'));
			for ($i = 0; $i < count($dirs); $i++) {
				$handle = @opendir($dirs[$i]);
				if (!$handle) {
					continue;
				}
				while (($entry = readdir($handle)) !== FALSE) {
					if ($entry === '.' || $entry === '..') {
						continue;
					}
					$path = $dirs[$i] . '/' . $entry;
					if (is_dir($path) && !is_link($path)) {
						$dirs[] = $path;
					} else {
						@unlink($path);
					}
				}
				closedir($handle);
			}
			foreach (array_reverse($dirs) as $dir) {
				@rmdir($dir);
			}
		} else {
			$this->pObj->log_setTSlogMessage("Could not create temporary directory '" . $scratchDir . "' for unpacking archive", 3);
		}
	}
break;
|
||
|
||
case 'rtf':
|
||
if ($this->app['unrtf']) {
|
||
$cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile);
|
||
... | ... | |
switch ($ext) {
|
||
case 'pdf':
|
||
// Getting pdf-info:
|
||
$cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile);
|
||
$cmd = $this->app['pdfinfo'].' ' . escapeshellarg($absFile);
|
||
exec($cmd,$res);
|
||
$pdfInfo = $this->splitPdfInfo($res);
|
||
unset($res);
|
||
... | ... | |
// XCLASS hook: when TYPO3_MODE is defined and a file is registered for this script under
// $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS'], that file is included (once) here.
if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) {
|
||
include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
|
||
}
|
||
?>
|
||
?>
|