|
<?php
|
|
/***************************************************************
|
|
* Copyright notice
|
|
*
|
|
* (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)
|
|
* All rights reserved
|
|
*
|
|
* This script is part of the TYPO3 project. The TYPO3 project is
|
|
* free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* The GNU General Public License can be found at
|
|
* http://www.gnu.org/copyleft/gpl.html.
|
|
* A copy is found in the textfile GPL.txt and important notices to the license
|
|
* from the author is found in LICENSE.txt distributed with these scripts.
|
|
*
|
|
*
|
|
* This script is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* This copyright notice MUST APPEAR in all copies of the script!
|
|
***************************************************************/
|
|
/**
|
|
* External standard parsers for indexed_search
|
|
*
|
|
* @author Kasper Skaarhoj <kasperYYYY@typo3.com>
|
|
* @coauthor Olivier Simah <noname_paris@yahoo.fr>
|
|
*/
|
|
/**
|
|
* [CLASS/FUNCTION INDEX of SCRIPT]
|
|
*
|
|
*
|
|
*
|
|
* 75: class tx_indexed_search_extparse
|
|
* 94: function initParser($extension)
|
|
* 227: function softInit($extension)
|
|
* 257: function searchTypeMediaTitle($extension)
|
|
* 330: function isMultiplePageExtension($extension)
|
|
*
|
|
* SECTION: Reading documents (for parsing)
|
|
* 361: function readFileContent($ext,$absFile,$cPKey)
|
|
* 541: function fileContentParts($ext,$absFile)
|
|
* 580: function splitPdfInfo($pdfInfoArray)
|
|
* 599: function removeEndJunk($string)
|
|
*
|
|
* SECTION: Backend analyzer
|
|
* 626: function getIcon($extension)
|
|
*
|
|
* TOTAL FUNCTIONS: 9
|
|
* (This index is automatically created/updated by the extension "extdeveval")
|
|
*
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
* External standard parsers for indexed_search
|
|
* MUST RETURN utf-8 content!
|
|
*
|
|
* @author Kasper Skaarhoj <kasperYYYY@typo3.com>
|
|
* @package TYPO3
|
|
* @subpackage tx_indexedsearch
|
|
*/
|
|
class tx_indexed_search_extparse {

		// Controls how fileContentParts() divides a PDF into sections (overridden from config in initParser()):
		//  - zero: the whole PDF file is indexed as one section.
		//  - positive value: target number of pages per section; the pages are spread evenly over
		//    ceil(pages / value) sections, eg. "5" on a 12-page file gives 1-4, 5-8, 9-12.
		//  - negative value: the absolute value is the number of sections (never more than the number
		//    of pages), eg. "-3" on a 10-page file gives 1-3, 4-6, 7-10.
	var $pdf_mode = -20;

		// These arrays are filled by initParser():
	var $app = array();						// Tool key => path of executable (or TRUE for 'nativeOOMethod')
	var $ext2itemtype_map = array();		// File extension => item_type ("htm" => "html", "jpg" => "jpeg", otherwise the extension itself)
	var $supportedExtensions = array();		// File extension => TRUE for every extension accepted by initParser()

	var $pObj;		// Reference to parent object (indexer class)


	/**
	 * Initialize external parser for parsing content.
	 * Registers the external tools needed for the extension in $this->app and, on success,
	 * marks the extension as supported. Problems are reported through $this->pObj->log_setTSlogMessage().
	 *
	 * @param	string		File extension
	 * @return	boolean		Returns true if extension is supported/enabled, otherwise false.
	 */
	function initParser($extension)	{

			// Read indexer-config:
		$indexerConfig = $this->getIndexerConfig();

			// If windows, apply extension to tool name:
		$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
		$extOK = FALSE;
		$mainExtension = '';

			// Ignore extensions
		if (in_array($extension, $this->getIgnoredExtensions($indexerConfig)))	{
			$this->pObj->log_setTSlogMessage('Extension "'.$extension.'" was set to be ignored.',1);
			return FALSE;
		}

			// Switch on file extension:
		switch($extension)	{
			case 'pdf':
					// PDF tools: both "pdftotext" and "pdfinfo" must be present in the configured directory
				if (!empty($indexerConfig['pdftools']))	{
					$pdfPath = $this->toolDirectory($indexerConfig['pdftools']);
					if (ini_get('safe_mode') || (@is_file($pdfPath.'pdftotext'.$exe) && @is_file($pdfPath.'pdfinfo'.$exe)))	{
						$this->app['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
						$this->app['pdftotext'] = $pdfPath.'pdftotext'.$exe;
							// PDF mode:
						$this->pdf_mode = t3lib_div::intInRange(isset($indexerConfig['pdf_mode']) ? $indexerConfig['pdf_mode'] : 0,-100,100);
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('PDF tools disabled',1);
				}
			break;
			case 'doc':
					// Catdoc
				$extOK = $this->registerSingleTool($indexerConfig, 'catdoc', $exe, 'Word-files', 'catdoc tools (Word-files) disabled');
			break;
			case 'pps':		// MS PowerPoint(?)
			case 'ppt':		// MS PowerPoint
					// ppthtml
				$extOK = $this->registerSingleTool($indexerConfig, 'ppthtml', $exe, 'Powerpoint-files', 'ppthtml tools (Powerpoint-files) disabled');
			break;
			case 'xls':		// MS Excel
					// Xlhtml
				$extOK = $this->registerSingleTool($indexerConfig, 'xlhtml', $exe, 'Excel-files', 'xlhtml tools (Excel-files) disabled');
			break;
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
					// ooo_extract.rb can be found at: http://www.math.umd.edu/~dcarrera/openoffice/misc/tools/ooo_extract.html
					// I had to run this on debian before I could run the ooo_extract.rb script:
					// apt-get install libzlib-ruby1.8
					// apt-get install librexml-ruby1.8
				if (!empty($indexerConfig['nativeOOMethod']))	{
						// Native method: requires the extension "libunzipped"
					if (t3lib_extMgm::isLoaded('libunzipped'))	{
						$this->app['nativeOOMethod'] = TRUE;
						$extOK = TRUE;
						$this->pObj->log_setTSlogMessage('Using "libunzipped" for extraction of Open Office files, "'.$extension.'".',1);
					} else {
						$this->pObj->log_setTSlogMessage('The extension "libunzipped" was not loaded (for extraction of Open Office files, "'.$extension.'")',2);
					}
				} elseif (!empty($indexerConfig['OOoExtract']))	{
						// ruby + ooo_extract. An empty ruby path means "ruby" is called without a directory prefix.
					$rubyPath = !empty($indexerConfig['ruby']) ? $this->toolDirectory($indexerConfig['ruby']) : '';
					$oooExPath = $this->toolDirectory($indexerConfig['OOoExtract']);
					$rubyFound = $rubyPath ? @is_file($rubyPath.'ruby'.$exe) : TRUE;

					if (ini_get('safe_mode') || ($rubyFound && @is_file($oooExPath.'ooo_extract.rb')))	{
						$this->app['ruby'] = $rubyPath.'ruby'.$exe;
						$this->app['OOo'] = $oooExPath.'ooo_extract.rb';
						$extOK = TRUE;
					} else {
						$this->pObj->log_setTSlogMessage("'Ruby and OOo_extract' tools for reading OpenOffice.org documents were not found in paths '".$rubyPath."ruby".$exe."' OR '".$oooExPath."ooo_extract.rb'",3);
					}
				} else {
					$this->pObj->log_setTSlogMessage('Ruby & OOo_extract tools (OpenOffice-files) disabled',1);
				}
			break;
			case 'rtf':
					// unrtf
				$extOK = $this->registerSingleTool($indexerConfig, 'unrtf', $exe, 'RTF-files', 'unrtf tool (RTF-files) disabled');
			break;
			case 'txt':		// Raw text
			case 'csv':		// Raw text
			case 'xml':		// PHP strip-tags()
			case 'tif':		// PHP EXIF
				$extOK = TRUE;
			break;
			case 'html':	// PHP strip-tags()
			case 'htm':		// PHP strip-tags()
				$extOK = TRUE;
				$mainExtension = 'html';	// making "html" the common "item_type"
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
				$extOK = TRUE;
				$mainExtension = 'jpeg';	// making "jpeg" the common item_type
			break;
		}

			// If extension was OK:
		if ($extOK)	{
			$this->supportedExtensions[$extension] = TRUE;
			$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
			return TRUE;
		}

		return FALSE;
	}

	/**
	 * Initialize external parser for backend modules
	 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
	 *
	 * @param	string		File extension to initialize for.
	 * @return	boolean		Returns true if the extension is one of those this class knows about, otherwise false.
	 */
	function softInit($extension)	{
		switch($extension)	{
			case 'pdf':		// PDF
			case 'doc':		// MS Word files
			case 'pps':		// MS PowerPoint
			case 'ppt':		// MS PowerPoint
			case 'xls':		// MS Excel
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
			case 'rtf':		// RTF documents
			case 'txt':		// ASCII Text documents
			case 'html':	// HTML
			case 'htm':		// HTML
			case 'csv':		// Comma Separated Values
			case 'xml':		// Generic XML
			case 'jpg':		// Jpeg images (EXIF comment)
			case 'jpeg':	// Jpeg images (EXIF comment)
			case 'tif':		// TIF images (EXIF comment)
				return TRUE;
		}

		return FALSE;
	}

	/**
	 * Return title of entry in media type selector box.
	 *
	 * @param	string		File extension
	 * @return	mixed		String with label value of entry in media type search selector box (frontend plugin). FALSE if the extension is ignored, its tool is not configured, or it should have no entry of its own.
	 */
	function searchTypeMediaTitle($extension)	{

			// Read indexer-config
		$indexerConfig = $this->getIndexerConfig();

			// Ignore extensions
		if (in_array($extension, $this->getIgnoredExtensions($indexerConfig)))	{
			return FALSE;
		}

			// Switch on file extension:
		switch($extension)	{
			case 'pdf':
					// PDF
				if (!empty($indexerConfig['pdftools']))	{
					return 'PDF';
				}
			break;
			case 'doc':
					// Catdoc
				if (!empty($indexerConfig['catdoc']))	{
					return 'MS Word';
				}
			break;
			case 'pps':		// MS PowerPoint(?)
			case 'ppt':		// MS PowerPoint
					// ppthtml
				if (!empty($indexerConfig['ppthtml']))	{
					return 'MS Powerpoint';
				}
			break;
			case 'xls':		// MS Excel
					// Xlhtml
				if (!empty($indexerConfig['xlhtml']))	{
					return 'MS Excel';
				}
			break;
			case 'sxc':		// Open Office Calc.
			case 'sxi':		// Open Office Impress
			case 'sxw':		// Open Office Writer
					// NOTE(review): initParser() enables these types on "OOoExtract" (ruby path optional), while this check looks at "ruby" - confirm which setting is intended.
				if (!empty($indexerConfig['nativeOOMethod']) || !empty($indexerConfig['ruby']))	{
					return 'Open Office';
				}
			break;
			case 'rtf':
					// unrtf
				if (!empty($indexerConfig['unrtf']))	{
					return 'RTF';
				}
			break;
			case 'html':	// PHP strip-tags()
			case 'jpeg':	// PHP EXIF
			case 'txt':		// Raw text
			case 'csv':		// Raw text
			case 'xml':		// PHP strip-tags()
			case 'tif':		// PHP EXIF
				return strtoupper($extension);
				// NO entry (duplicates or blank):
			case 'htm':		// PHP strip-tags()
			case 'jpg':		// PHP EXIF
			default:
			break;
		}

		return FALSE;
	}

	/**
	 * Returns true if the input extension (item_type) is a potentially a multi-page extension
	 *
	 * @param	string		Extension / item_type string
	 * @return	boolean		Return true if multi-page (currently only "pdf")
	 */
	function isMultiplePageExtension($extension)	{
		return (string)$extension === 'pdf';
	}

	/**
	 * Reads and unserializes the extension configuration of "indexed_search" from TYPO3_CONF_VARS.
	 *
	 * @return	array		Configuration array; empty array if nothing is configured or the value cannot be unserialized.
	 * @access private
	 */
	function getIndexerConfig()	{
		$config = FALSE;
		if (isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']))	{
			$config = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);
		}

		return is_array($config) ? $config : array();
	}

	/**
	 * Returns the lowercased list of file extensions from the "ignoreExtensions" setting.
	 *
	 * @param	array		Indexer configuration as returned by getIndexerConfig()
	 * @return	array		List of extensions to ignore
	 * @access private
	 */
	function getIgnoredExtensions($indexerConfig)	{
		$list = isset($indexerConfig['ignoreExtensions']) ? strtolower($indexerConfig['ignoreExtensions']) : '';

		return t3lib_div::trimExplode(',', $list, 1);
	}

	/**
	 * Normalizes a configured tool directory so that it ends with exactly the slash appended here.
	 * (One trailing slash, if any, is removed before "/" is appended.)
	 *
	 * @param	string		Directory as configured, with or without trailing slash
	 * @return	string		Directory with trailing slash
	 * @access private
	 */
	function toolDirectory($path)	{
			// "D" makes "$" match only at the very end of the string, like the POSIX expression used previously.
		return preg_replace('#/$#D', '', (string)$path).'/';
	}

	/**
	 * Registers a tool whose config key, $this->app key and executable name are all the same (catdoc, ppthtml, xlhtml, unrtf).
	 * Logs through $this->pObj if the tool is disabled or not found.
	 *
	 * @param	array		Indexer configuration as returned by getIndexerConfig()
	 * @param	string		Tool name, eg. "catdoc"
	 * @param	string		Executable suffix (".exe" on Windows, otherwise blank)
	 * @param	string		Label of the file type, used in the "not found" log message, eg. "Word-files"
	 * @param	string		Log message to write if the tool is not configured
	 * @return	boolean		TRUE if the tool was registered in $this->app
	 * @access private
	 */
	function registerSingleTool($indexerConfig, $tool, $exe, $fileTypeLabel, $disabledMessage)	{
		if (empty($indexerConfig[$tool]))	{
			$this->pObj->log_setTSlogMessage($disabledMessage,1);
			return FALSE;
		}

		$toolPath = $this->toolDirectory($indexerConfig[$tool]);
			// With safe_mode on, the existence check is skipped and the tool is accepted as configured.
		if (ini_get('safe_mode') || @is_file($toolPath.$tool.$exe))	{
			$this->app[$tool] = $toolPath.$tool.$exe;
			return TRUE;
		}

		$this->pObj->log_setTSlogMessage("'".$tool."' tool for reading ".$fileTypeLabel." was not found in paths '".$toolPath.$tool."'",3);
		return FALSE;
	}

	/**
	 * Executes a shell command and returns its output lines.
	 * A fresh output array is used for every call, since exec() appends to an existing array.
	 *
	 * @param	string		Complete command line (arguments must already be escaped)
	 * @return	array		Output lines of the command
	 * @access private
	 */
	function runCommand($cmd)	{
		$output = array();
		exec($cmd, $output);

		return $output;
	}




	/************************
	 *
	 * Reading documents (for parsing)
	 *
	 ************************/

	/**
	 * Reads the content of an external file being indexed.
	 * File names are passed to external tools through escapeshellarg(), which quotes correctly on both
	 * Windows and Unix (NOTE: escapeshellarg() may strip non-ASCII characters depending on the LC_CTYPE locale).
	 *
	 * @param	string		File extension, eg. "pdf", "doc" etc.
	 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
	 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted, eg. "1-5")
	 * @return	mixed		Standard content array (title, description, keywords, body keys) as produced by the parent object's split functions. FALSE if the extension was not enabled by initParser(); NULL if the required tool is not registered or yielded nothing usable.
	 */
	function readFileContent($ext,$absFile,$cPKey)	{
		$contentArr = NULL;

			// Return immediately if initialization didn't set support up:
		if (empty($this->supportedExtensions[$ext]))	{
			return FALSE;
		}

		$quotedFile = escapeshellarg($absFile);

			// Switch by file extension
		switch ($ext)	{
			case 'pdf':
				if (!empty($this->app['pdfinfo']))	{
						// Getting pdf-info:
					$pdfInfo = $this->splitPdfInfo($this->runCommand($this->app['pdfinfo'].' '.$quotedFile));
					if (isset($pdfInfo['pages']) && intval($pdfInfo['pages']))	{
							// Page interval "low-high"; forced to integers since the values go into a shell command.
						$range = explode('-', (string)$cPKey);
						$low = intval($range[0]);
						$high = isset($range[1]) ? intval($range[1]) : 0;

							// Get pdf content:
						$tempFileName = t3lib_div::tempnam('Typo3_indexer');		// Create temporary name
						@unlink($tempFileName);	// Delete if exists, just to be safe.
						$this->runCommand($this->app['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q '.$quotedFile.' '.escapeshellarg($tempFileName));

						$content = '';
						if (@is_file($tempFileName))	{
							$content = t3lib_div::getUrl($tempFileName);
							unlink($tempFileName);
						} else {
							$this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
						}
						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
					}
				}
			break;
			case 'doc':
				if (!empty($this->app['catdoc']))	{
					$content = implode(chr(10), $this->runCommand($this->app['catdoc'].' -d utf-8 '.$quotedFile));
					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
				}
			break;
			case 'pps':
			case 'ppt':
				if (!empty($this->app['ppthtml']))	{
					$content = implode(chr(10), $this->runCommand($this->app['ppthtml'].' '.$quotedFile));
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'xls':
				if (!empty($this->app['xlhtml']))	{
					$content = implode(chr(10), $this->runCommand($this->app['xlhtml'].' -nc -te '.$quotedFile));
					$content = $this->pObj->convertHTMLToUtf8($content);
					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'sxi':
			case 'sxc':
			case 'sxw':
				if (!empty($this->app['nativeOOMethod']))	{
					if (t3lib_extMgm::isLoaded('libunzipped'))	{

							// The included file is evaluated in this function's scope; presumably it reads $TYPO3_CONF_VARS (XCLASS check), hence the global declaration - TODO confirm.
						global $TYPO3_CONF_VARS;
						require_once(t3lib_extMgm::extPath('libunzipped').'class.tx_libunzipped.php');

							// Initialize Unzip object:
						$unzip = t3lib_div::makeInstance('tx_libunzipped');
						$ooFiles = $unzip->init($absFile);
						if (is_array($ooFiles))	{
								// Read content.xml and meta.xml:
							$content_xml = $unzip->getFileFromArchive('content.xml');
							$meta_xml = $unzip->getFileFromArchive('meta.xml');

								// A space is inserted before every tag so words from adjacent elements don't get glued together when tags are stripped.
							$utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml['content'])));
							$contentArr = $this->pObj->splitRegularContent($utf8_content);
							$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

								// Meta information (only used if meta.xml could be parsed into a tree)
							$metaTree = t3lib_div::xml2tree($meta_xml['content']);
							$metaContent = NULL;
							if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch']))	{
								$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
							}
							if (is_array($metaContent))	{
								if (!empty($metaContent['dc:title'][0]['values'][0]))	{
									$contentArr['title'] = $metaContent['dc:title'][0]['values'][0];
								}
								$subject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
								$description = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';
								$contentArr['description'] = $subject.' '.$description;

									// Keywords collected:
								if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword']))	{
									foreach($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat)	{
										$contentArr['keywords'].= $kwDat['values'][0].' ';
									}
								}
							}
						}
					}
				} elseif (!empty($this->app['ruby']))	{
						// Extracting document headers:
					$headings = $this->runCommand($this->app['ruby'].' '.$this->app['OOo'].' --heading '.$quotedFile);

						// Extracting document text:
					$texts = $this->runCommand($this->app['ruby'].' '.$this->app['OOo'].' '.$quotedFile);

					$content = implode(chr(10),$headings).' '.implode(chr(10),$texts);
					$contentArr = $this->pObj->splitRegularContent($content);
					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
				}
			break;
			case 'rtf':
				if (!empty($this->app['unrtf']))	{
					$fileContent = implode(chr(10), $this->runCommand($this->app['unrtf'].' '.$quotedFile));
					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
					$contentArr = $this->pObj->splitHTMLContent($fileContent);
				}
			break;
			case 'txt':
			case 'csv':		// Raw text
				$content = t3lib_div::getUrl($absFile);
					// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
				$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
				$contentArr = $this->pObj->splitRegularContent($content);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'html':
			case 'htm':
				$fileContent = t3lib_div::getUrl($absFile);
				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
				$contentArr = $this->pObj->splitHTMLContent($fileContent);
			break;
			case 'xml':		// PHP strip-tags()
				$fileContent = t3lib_div::getUrl($absFile);

					// Finding charset from the encoding attribute of the XML declaration (searched in the first 200 bytes only); defaults to utf-8:
				$reg = array();
				preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i', substr($fileContent,0,200), $reg);
				$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

					// Converting content:
				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
				$contentArr = $this->pObj->splitRegularContent($fileContent);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			case 'jpg':		// PHP EXIF
			case 'jpeg':	// PHP EXIF
			case 'tif':		// PHP EXIF
					// The exif extension is optional in PHP; without it the image is indexed with an empty comment.
				$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;
				$comment = '';
				if ($exif)	{
					$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
					$imageDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
					$comment = trim($exifComment.' '.$imageDescription);	// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
				}
				$contentArr = $this->pObj->splitRegularContent($comment);
				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
			break;
			default:
				return FALSE;
		}

			// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
		if (is_array($contentArr) && empty($contentArr['title']))	{
			$contentArr['title'] = str_replace('_',' ',basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
		}

		return $contentArr;
	}

	/**
	 * Creates an array with pointers to divisions of document.
	 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
	 * The same single-element array is returned for a PDF if the "pdfinfo" tool is not registered or reports no pages.
	 *
	 * @param	string		File extension
	 * @param	string		Absolute filename (must exist and be validated OK before calling function)
	 * @return	array		Array of pointers to sections that the document should be divided into (for PDF: strings like "1-5")
	 */
	function fileContentParts($ext,$absFile)	{
		$cParts = array(0);

			// Without a registered pdfinfo tool there is nothing to execute.
		if ($ext == 'pdf' && !empty($this->app['pdfinfo']))	{
				// Getting pdf-info:
			$pdfInfo = $this->splitPdfInfo($this->runCommand($this->app['pdfinfo'].' '.escapeshellarg($absFile)));
			$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

			if ($pages > 0)	{
				$cParts = $this->calculatePageRanges($pages);
			}
		}

		return $cParts;
	}

	/**
	 * Divides a number of pages into "low-high" intervals according to $this->pdf_mode.
	 *
	 * @param	integer		Total number of pages (must be greater than zero)
	 * @return	array		List of interval strings, eg. array("1-3","4-6","7-10")
	 * @access private
	 * @see fileContentParts()
	 */
	function calculatePageRanges($pages)	{
		$pages = intval($pages);
		$mode = intval($this->pdf_mode);

			// Calculate number of sections
		if ($mode > 0)	{
			$sections = (int)ceil($pages / $mode);
		} else {
			$sections = max(1, min(abs($mode), $pages));
		}

			// Traverse and create intervals. Multiplying before dividing keeps the boundaries exact; dividing first can round a boundary below the integer it should reach.
		$ranges = array();
		for ($a = 0; $a < $sections; $a++)	{
			$low = (int)floor(($a * $pages) / $sections) + 1;
			$high = (int)floor((($a + 1) * $pages) / $sections);
			$ranges[] = $low.'-'.$high;
		}

		return $ranges;
	}

	/**
	 * Analysing PDF info into a useable format.
	 * Every line is split at the first colon; the part before becomes the (trimmed, lowercased) key, the rest the trimmed value. Lines without a colon or with a blank key are skipped.
	 *
	 * @param	array		Array of PDF content, coming from the pdfinfo tool
	 * @return	array		Result array
	 * @access private
	 * @see fileContentParts()
	 */
	function splitPdfInfo($pdfInfoArray)	{
		$res = array();
		if (!is_array($pdfInfoArray))	{
			return $res;
		}

		foreach($pdfInfoArray as $line)	{
			$parts = explode(':', $line, 2);
			if (count($parts) > 1)	{
				$key = trim($parts[0]);
				if ($key)	{
					$res[strtolower($key)] = trim($parts[1]);
				}
			}
		}

		return $res;
	}

	/**
	 * Removes some strange char(12) characters and line breaks that then to occur in the end of the string from external files.
	 *
	 * @param	string		String to clean up
	 * @return	string		String
	 */
	function removeEndJunk($string)	{
			// First strip the trailing run of line feeds / form feeds, then ordinary surrounding whitespace (trim() alone leaves form feeds).
		return trim(rtrim((string)$string, chr(10).chr(12)));
	}




	/************************
	 *
	 * Backend analyzer
	 *
	 ************************/

	/**
	 * Return icon for file extension
	 * "htm" shares the icon of "html" and "jpeg" the one of "jpg".
	 *
	 * @param	string		File extension, lowercase.
	 * @return	string		Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
	 */
	function getIcon($extension)	{
		if ($extension == 'htm')	{
			$extension = 'html';
		}
		if ($extension == 'jpeg')	{
			$extension = 'jpg';
		}

		return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
	}
}
|
|
|
|
// XCLASS hook: when running inside TYPO3 and a file is registered under this script's XCLASS key, include that file once.
if (defined('TYPO3_MODE'))	{
	if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'])	{
		include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);
	}
}
|
|
?>
|