Index: class.external_parser.php =================================================================== --- class.external_parser.php (.../tags/typo3_2.11.1/class.external_parser.php) (revision 30) +++ class.external_parser.php (.../trunk/indexed_search/class.external_parser.php) (revision 30) @@ -92,7 +92,6 @@ * @return boolean Returns true if extension is supported/enabled, otherwise false. */ function initParser($extension) { - // Then read indexer-config and set if appropriate: $indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']); @@ -123,8 +122,9 @@ } else $this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3); } else $this->pObj->log_setTSlogMessage('PDF tools disabled',1); break; - case 'doc': - // Catdoc + case 'dot': // MS Word + case 'doc': // MS Word + // Catdoc if ($indexerConfig['catdoc']) { $catdocPath = ereg_replace("\/$",'',$indexerConfig['catdoc']).'/'; if (ini_get('safe_mode') || @is_file($catdocPath.'catdoc'.$exe)) { @@ -133,6 +133,7 @@ } else $this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in path '".$catdocPath."catdoc'",3); } else $this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1); break; + case 'pot': // Powerpoint... case 'pps': // MS PowerPoint(?) 
case 'ppt': // MS PowerPoint // ppthtml @@ -145,6 +146,7 @@ } else $this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1); break; case 'xls': // MS Excel + case 'xlt': // MS Excel Template // Xlhtml if ($indexerConfig['xlhtml']) { $xlhtmlPath = ereg_replace('\/$','',$indexerConfig['xlhtml']).'/'; @@ -168,6 +170,46 @@ } else $this->pObj->log_setTSlogMessage("'unzip' tool for reading OpenOffice.org-files was not found in path '".$unzipPath."unzip'",3); } else $this->pObj->log_setTSlogMessage('unzip tool (OpenOffice.org-files) disabled',1); break; + + case 'zip': //zip files + if ($indexerConfig['unzip']) { + $unzipPath = preg_replace('/\/$/','',$indexerConfig['unzip']).'/'; + if (ini_get('safe_mode') || @is_file($unzipPath.'unzip'.$exe)) { + $this->app['unzip'] = $unzipPath.'unzip'.$exe; + $extOK = TRUE; + } else $this->pObj->log_setTSlogMessage("'unzip' tool for reading ZIP-files was not found in path '".$unzipPath."unzip'",3); + } else $this->pObj->log_setTSlogMessage('unzip tool (ZIP-files) disabled',1); + break; + + case 'tar': // tarballs + case 'tgz': // gzip'ed TAR + case 'tbz': // bzip'ed TAR + if ($indexerConfig['tar']) { + $tarPath = preg_replace('/\/$/', '', $indexerConfig['tar']).'/'; + if (ini_get('safe_mode') || @is_file($tarPath.'tar'.$exe)) { + $this->app['tar'] = $tarPath.'tar'.$exe; + $extOK = TRUE; + } else $this->pObj->log_setTSlogMessage("'tar' tool for reading tarballs was not found in path '".$tarPath."tar'", 3); + } else $this->pObj->log_setTSlogMessage('tar tool (tarballs) disabled', 1); + break; + + case 'docx': // Microsoft Word 2007/2008 + case 'dotx': + case 'xlsx': // Microsoft Excel 2007/2008 + case 'xltx': + case 'pptx': // Microsoft PowerPoint + case 'ppsx': + case 'potx': + if ($indexerConfig['unzip']) { + $unzipPath = preg_replace('/\/$/', '', $indexerConfig['unzip']).'/'; + if (ini_get('safe_mode') || @is_file($unzipPath.'unzip'.$exe)) { + $this->app['unzip'] = $unzipPath.'unzip'.$exe; + $extOK = TRUE; + } 
else $this->pObj->log_setTSlogMessage("'unzip' tool for reading DOCX/XLSX/PPTX files was not found in path '".$unzipPath."unzip'",3); + + } else $this->pObj->log_setTSlogMessage('unzip tool (docx/pptx/xlsx) disabled', 1); + break; + case 'rtf': // Catdoc if ($indexerConfig['unrtf']) { @@ -215,15 +257,29 @@ switch($extension) { case 'pdf': // PDF case 'doc': // MS Word files + case 'dot': // MS Word files case 'pps': // MS PowerPoint case 'ppt': // MS PowerPoint + case 'pot': // MS PowerPoint case 'xls': // MS Excel + case 'xlt': // MS Excel case 'sxc': // Open Office Calc. case 'sxi': // Open Office Impress case 'sxw': // Open Office Writer case 'ods': // Oasis OpenDocument Spreadsheet case 'odp': // Oasis OpenDocument Presentation case 'odt': // Oasis OpenDocument Text + case 'zip': // ZIP + case 'tar': // TAR (TapeARchiver) + case 'tbz': // BZip'ed tarball + case 'tgz': // gzip'ed tarball + case 'docx': // \ + case 'dotx': // \ + case 'pptx': // -- Microsoft Office 2007/2008 + case 'ppsx': // / + case 'potx': // / + case 'xlsx': // / + case 'xltx': /// case 'rtf': // RTF documents case 'txt': // ASCII Text documents case 'html': // HTML @@ -231,7 +287,7 @@ case 'csv': // Comma Separated Values case 'xml': // Generic XML case 'jpg': // Jpeg images (EXIF comment) - case 'jpeg': // Jpeg images (EXIF comment) + case 'jpeg': // Jpeg images (EXIF comment) case 'tif': // TIF images (EXIF comment) return TRUE; break; @@ -264,6 +320,9 @@ } break; case 'doc': + case 'dot': + case 'docx': + case 'dotx': // Catdoc if ($indexerConfig['catdoc']) { return 'MS Word'; @@ -271,12 +330,19 @@ break; case 'pps': // MS PowerPoint(?) case 'ppt': // MS PowerPoint - // ppthtml + case 'pot': // MS PowerPoint + case 'pptx': // MS PowerPoint + case 'ppsx': // ... + case 'potx': // + // ppthtml if ($indexerConfig['ppthtml']) { return 'MS Powerpoint'; } break; case 'xls': // MS Excel + case 'xlt': // MS Excel + case 'xlsx': // MS Excel + case 'xltx': // ... 
// Xlhtml if ($indexerConfig['xlhtml']) { return 'MS Excel'; @@ -292,6 +358,18 @@ return 'Open Office'; } break; + case 'zip': // ZIP + if ($indexerConfig['unzip']) { + return 'ZIP'; + } + break; + case 'tar': // TAR + case 'tgz': // gzip'ed TAR + case 'tbz': // bzip'ed TAR + if ($indexerConfig['tar']) { + return 'tarball'; + } + break; case 'rtf': // Catdoc if ($indexerConfig['unrtf']) { @@ -355,7 +433,9 @@ unset($contentArr); // Return immediately if initialization didn't set support up: - if (!$this->supportedExtensions[$ext]) return FALSE; + if (!$this->supportedExtensions[$ext]) { + return FALSE; + } // Switch by file extension switch ($ext) { @@ -372,7 +452,7 @@ // Get pdf content: $tempFileName = t3lib_div::tempnam('Typo3_indexer'); // Create temporary name @unlink ($tempFileName); // Delete if exists, just to be safe. - $cmd = $this->app['pdftotext'] . ' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . $tempFileName; + $cmd = $this->app['pdftotext'].' -f ' . $low . ' -l ' . $high . ' -enc UTF-8 -q ' . escapeshellarg($absFile) . ' ' . $tempFileName; exec($cmd); if (@is_file($tempFileName)) { $content = t3lib_div::getUrl($tempFileName); @@ -387,6 +467,7 @@ } break; case 'doc': + case 'dot': if ($this->app['catdoc']) { $cmd = $this->app['catdoc'] . ' -d utf-8 ' . escapeshellarg($absFile); exec($cmd,$res); @@ -397,6 +478,7 @@ break; case 'pps': case 'ppt': + case 'pot': if ($this->app['ppthtml']) { $cmd = $this->app['ppthtml'] . ' ' . escapeshellarg($absFile); exec($cmd,$res); @@ -408,6 +490,7 @@ } break; case 'xls': + case 'xlt': if ($this->app['xlhtml']) { $cmd = $this->app['xlhtml'] . ' -nc -te ' . 
escapeshellarg($absFile); exec($cmd,$res); @@ -457,6 +540,141 @@ } } break; + + case 'docx': + case 'dotx': + case 'xlsx': + case 'xltx': + case 'pptx': + case 'ppsx': + case 'potx': + if($this->app['unzip']) { + + + // content + switch($ext) { + case 'docx': + case 'dotx': + $cmd = $this->app['unzip'] . ' -p '. escapeshellarg($absFile) . ' word/document.xml'; + break; + case 'xlsx': + case 'xltx': + $cmd = $this->app['unzip'] . ' -p '. escapeshellarg($absFile) . ' xl/worksheets/sheet1.xml'; + break; + case 'ppsx': + case 'pptx': + case 'potx': + $cmd = $this->app['unzip'] . ' -p '. escapeshellarg($absFile) . ' ppt/slides/slide1.xml'; + break; + } + exec($cmd,$res); + $content_xml = implode(chr(10),$res); + + unset($res); + + $utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml))); + $contentArr = $this->pObj->splitRegularContent($utf8_content); + + // Metainformations: + $cmd = $this->app['unzip'] . ' -p ' . escapeshellarg($absFile) . ' docProps/core.xml'; + exec($cmd,$res); + $core_xml = implode(chr(10),$res); + unset($res); + + $coreContent = t3lib_div::xml2tree($core_xml); + $contentArr['title'] = basename($absFile); + $contentArr['title'] .= (string) ' '.$coreContent['cp:coreProperties'][0]['ch']['dc:title'][0]['values'][0]; + $contentArr['description'] = (string) $coreContent['cp:coreProperties'][0]['ch']['dc:subject'][0]['values'][0]; + $contentArr['description'] .= ' '.$coreContent['cp:coreProperties'][0]['ch']['dc:description'][0]['values'][0]; + $contentArr['description'] .= ' '.$coreContent['cp:coreProperties'][0]['ch']['dc:creator'][0]['values'][0]; + + $contentArr['keywords'] .= (string) $coreContent['cp:coreProperties'][0]['ch']['cp:keywords'][0]['values'][0]; + + } + break; + + + + case 'zip': + if ($this->app['unzip']) { + // Extract files to temp-dir + $cmd = $this->app['unzip'].' -q -d '.PATH_site.'typo3temp/ ' . 
escapeshellarg($absFile); + exec($cmd, $res); + // Get the filenames, we know the path: + $cmd = $this->app['unzip'].' -Z -1 ' . escapeshellarg($absFile); + exec($cmd, $res); + $contentArr['title'] = preg_replace("/.*\/(.[^\/]*)/", "\\1", $absFile); + foreach($res as $filename) { + $ext = pathinfo($filename); + $fileContent = $this->readFileContent($ext['extension'], PATH_site.'typo3temp/'.utf8_encode($filename), 0); // read from the extraction directory + $contentArr['title'] .= ' '.$fileContent['title']; + $contentArr['description'] .= ' '.$fileContent['description']; + $contentArr['keywords'] .= ' '.$fileContent['keywords']; + $contentArr['body'] .= ' '.$fileContent['body']; + + unlink(PATH_site.'typo3temp/'.$filename); + } + + } + break; + + case 'tar': + if ($this->app['tar']) { + // Extract files to temp-dir and get the extracted filenames... + $cmd = $this->app['tar'].' xvf ' . escapeshellarg($absFile) . ' -C '.PATH_site.'typo3temp/'; + exec($cmd, $res); + $contentArr['title'] = preg_replace("/.*\/(.[^\/]*)/", "\\1", $absFile); + foreach($res as $filename) { + $ext = pathinfo($filename); + $fileContent = $this->readFileContent($ext['extension'], PATH_site.'typo3temp/'.utf8_encode($filename), 0); // read from the extraction directory + $contentArr['title'] .= ' '.$fileContent['title']; + $contentArr['description'] .= ' '.$fileContent['description']; + $contentArr['keywords'] .= ' '.$fileContent['keywords']; + $contentArr['body'] .= ' '.$fileContent['body']; + + unlink(PATH_site.'typo3temp/'.$filename); + } + } + break; + + case 'tgz': + if ($this->app['tar']) { + // Extract files to temp-dir and get the extracted filenames... + $cmd = $this->app['tar'].' xzvf ' . escapeshellarg($absFile) . 
' -C '.PATH_site.'typo3temp/'; + exec($cmd, $res); + $contentArr['title'] = preg_replace("/.*\/(.[^\/]*)/", "\\1", $absFile); + foreach($res as $filename) { + $ext = pathinfo($filename); + $fileContent = $this->readFileContent($ext['extension'], PATH_site.'typo3temp/'.utf8_encode($filename), 0); // read from the extraction directory + $contentArr['title'] .= ' '.$fileContent['title']; + $contentArr['description'] .= ' '.$fileContent['description']; + $contentArr['keywords'] .= ' '.$fileContent['keywords']; + $contentArr['body'] .= ' '.$fileContent['body']; + + unlink(PATH_site.'typo3temp/'.$filename); + } + } + break; + + case 'tbz': + if ($this->app['tar']) { + // Extract files to temp-dir and get the extracted filenames... + $cmd = $this->app['tar'].' xjvf ' . escapeshellarg($absFile) . ' -C '.PATH_site.'typo3temp/'; + exec($cmd, $res); + $contentArr['title'] = preg_replace("/.*\/(.[^\/]*)/", "\\1", $absFile); + foreach($res as $filename) { + $ext = pathinfo($filename); + $fileContent = $this->readFileContent($ext['extension'], PATH_site.'typo3temp/'.utf8_encode($filename), 0); // read from the extraction directory + $contentArr['title'] .= ' '.$fileContent['title']; + $contentArr['description'] .= ' '.$fileContent['description']; + $contentArr['keywords'] .= ' '.$fileContent['keywords']; + $contentArr['body'] .= ' '.$fileContent['body']; + + unlink(PATH_site.'typo3temp/'.$filename); + } + } + break; + case 'rtf': if ($this->app['unrtf']) { $cmd = $this->app['unrtf'] . ' ' . escapeshellarg($absFile); @@ -535,7 +753,7 @@ switch ($ext) { case 'pdf': // Getting pdf-info: - $cmd = $this->app['pdfinfo'] . ' ' . escapeshellarg($absFile); + $cmd = $this->app['pdfinfo'].' ' . 
escapeshellarg($absFile); exec($cmd,$res); $pdfInfo = $this->splitPdfInfo($res); unset($res); @@ -626,4 +844,4 @@ if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']) { include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']); } -?> \ No newline at end of file +?> Property changes on: class.external_parser.php ___________________________________________________________________ Name: svn:executable +