class.external_parser.php - TYPO3 Core - TYPO3 Forge

Bug #14804 » class.external_parser.php

Administrator Admin, 2005-06-08 15:09

    
    <?php

    /***************************************************************

    *  Copyright notice

    *

    *  (c) 2001-2005 Kasper Skaarhoj (kasperYYYY@typo3.com)

    *  All rights reserved

    *

    *  This script is part of the TYPO3 project. The TYPO3 project is

    *  free software; you can redistribute it and/or modify

    *  it under the terms of the GNU General Public License as published by

    *  the Free Software Foundation; either version 2 of the License, or

    *  (at your option) any later version.

    *

    *  The GNU General Public License can be found at

    *  http://www.gnu.org/copyleft/gpl.html.

    *  A copy is found in the textfile GPL.txt and important notices to the license

    *  from the author is found in LICENSE.txt distributed with these scripts.

    *

    *

    *  This script is distributed in the hope that it will be useful,

    *  but WITHOUT ANY WARRANTY; without even the implied warranty of

    *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

    *  GNU General Public License for more details.

    *

    *  This copyright notice MUST APPEAR in all copies of the script!

    ***************************************************************/

    /**

     * External standard parsers for indexed_search

     *

     * @author	Kasper Skårhøj <kasperYYYY@typo3.com>

     * @coauthor	Olivier Simah <noname_paris@yahoo.fr>

     */

    /**

     * [CLASS/FUNCTION INDEX of SCRIPT]

     *

     *

     *

     *   75: class tx_indexed_search_extparse

     *   94:     function initParser($extension)

     *  227:     function softInit($extension)

     *  257:     function searchTypeMediaTitle($extension)

     *  330:     function isMultiplePageExtension($extension)

     *

     *              SECTION: Reading documents (for parsing)

     *  361:     function readFileContent($ext,$absFile,$cPKey)

     *  541:     function fileContentParts($ext,$absFile)

     *  580:     function splitPdfInfo($pdfInfoArray)

     *  599:     function removeEndJunk($string)

     *

     *              SECTION: Backend analyzer

     *  626:     function getIcon($extension)

     *

     * TOTAL FUNCTIONS: 9

     * (This index is automatically created/updated by the extension "extdeveval")

     *

     */

    /**

     * External standard parsers for indexed_search

     * MUST RETURN utf-8 content!

     *

     * @author	Kasper Skaarhoj <kasperYYYY@typo3.com>

     * @package TYPO3

     * @subpackage tx_indexedsearch

     */

    class tx_indexed_search_extparse {

    		// This value is also overridden from config.
    	var $pdf_mode = -20;	// zero: whole PDF file is indexed in one. positive value: Indicates number of pages at a time, eg. "5" would means 1-5,6-10,.... Negative integer would indicate (abs value) number of groups. Eg "3" groups of 10 pages would be 1-4,5-8,9-10

    		// These arrays are filled by initParser():
    	var $app = array();					// Tool name => executable path (or TRUE for 'nativeOOMethod')
    	var $ext2itemtype_map = array();	// File extension => item_type (eg. "htm" => "html")
    	var $supportedExtensions = array();	// File extension => TRUE for every successfully initialized extension
    	var $pObj;		// Reference to parent object (indexer class)

    	/**
    	 * Initialize external parser for parsing content.
    	 * Reads the indexed_search extension configuration, locates the external tool needed
    	 * for the given extension (if any) and registers the extension as supported.
    	 *
    	 * @param	string		File extension
    	 * @return	boolean		Returns true if extension is supported/enabled, otherwise false.
    	 */
    	function initParser($extension)	{

    			// Read indexer-config:
    		$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

    			// If windows, apply extension to tool name:
    		$exe = (TYPO3_OS == 'WIN') ? '.exe' : '';
    		$extOK = FALSE;
    		$mainExtension = '';

    			// Ignore extensions
    		$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']),1);
    		if (in_array($extension, $ignoreExtensions))	{
    			$this->pObj->log_setTSlogMessage('Extension "'.$extension.'" was set to be ignored.',1);
    			return FALSE;
    		}

    			// Switch on file extension:
    		switch($extension)	{
    			case 'pdf':
    					// PDF (pdftotext + pdfinfo)
    				if ($indexerConfig['pdftools'])	{
    						// rtrim() replaces the ereg_replace() call (POSIX regex functions were removed in PHP 7)
    					$pdfPath = rtrim($indexerConfig['pdftools'],'/').'/';
    					if (ini_get('safe_mode') || (@is_file($pdfPath.'pdftotext'.$exe) && @is_file($pdfPath.'pdfinfo'.$exe)))	{
    						$this->app['pdfinfo'] = $pdfPath.'pdfinfo'.$exe;
    						$this->app['pdftotext'] = $pdfPath.'pdftotext'.$exe;
    							// PDF mode:
    						$this->pdf_mode = t3lib_div::intInRange($indexerConfig['pdf_mode'],-100,100);
    						$extOK = TRUE;
    					} else {
    						$this->pObj->log_setTSlogMessage("PDF tools was not found in paths '".$pdfPath."pdftotext' and/or '".$pdfPath."pdfinfo'",3);
    					}
    				} else {
    					$this->pObj->log_setTSlogMessage('PDF tools disabled',1);
    				}
    			break;
    			case 'doc':
    					// Catdoc
    				if ($indexerConfig['catdoc'])	{
    					$catdocPath = rtrim($indexerConfig['catdoc'],'/').'/';
    					if (ini_get('safe_mode') || @is_file($catdocPath.'catdoc'.$exe))	{
    						$this->app['catdoc'] = $catdocPath.'catdoc'.$exe;
    						$extOK = TRUE;
    					} else {
    						$this->pObj->log_setTSlogMessage("'catdoc' tool for reading Word-files was not found in paths '".$catdocPath."catdoc'",3);
    					}
    				} else {
    					$this->pObj->log_setTSlogMessage('catdoc tools (Word-files) disabled',1);
    				}
    			break;
    			case 'pps':		// MS PowerPoint(?)
    			case 'ppt':		// MS PowerPoint
    					// ppthtml
    				if ($indexerConfig['ppthtml'])	{
    					$ppthtmlPath = rtrim($indexerConfig['ppthtml'],'/').'/';
    					if (ini_get('safe_mode') || @is_file($ppthtmlPath.'ppthtml'.$exe))	{
    						$this->app['ppthtml'] = $ppthtmlPath.'ppthtml'.$exe;
    						$extOK = TRUE;
    					} else {
    						$this->pObj->log_setTSlogMessage("'ppthtml' tool for reading Powerpoint-files was not found in paths '".$ppthtmlPath."ppthtml'",3);
    					}
    				} else {
    					$this->pObj->log_setTSlogMessage('ppthtml tools (Powerpoint-files) disabled',1);
    				}
    			break;
    			case 'xls':		// MS Excel
    					// Xlhtml
    				if ($indexerConfig['xlhtml'])	{
    					$xlhtmlPath = rtrim($indexerConfig['xlhtml'],'/').'/';
    					if (ini_get('safe_mode') || @is_file($xlhtmlPath.'xlhtml'.$exe))	{
    						$this->app['xlhtml'] = $xlhtmlPath.'xlhtml'.$exe;
    						$extOK = TRUE;
    					} else {
    						$this->pObj->log_setTSlogMessage("'xlhtml' tool for reading Excel-files was not found in paths '".$xlhtmlPath."xlhtml'",3);
    					}
    				} else {
    					$this->pObj->log_setTSlogMessage('xlhtml tools (Excel-files) disabled',1);
    				}
    			break;
    			case 'sxc':		// Open Office Calc.
    			case 'sxi':		// Open Office Impress
    			case 'sxw':		// Open Office Writer
    					// ooo_extract.rb can be found at: http://www.math.umd.edu/~dcarrera/openoffice/misc/tools/ooo_extract.html
    					// I had to run this on debian before I could run the ooo_extract.rb script:
    					//		apt-get install libzlib-ruby1.8
    					//		apt-get install librexml-ruby1.8
    				if ($indexerConfig['nativeOOMethod'])	{
    						// Native method: unzip the document with the "libunzipped" extension
    					if (t3lib_extMgm::isLoaded('libunzipped'))	{
    						$this->app['nativeOOMethod'] = TRUE;
    						$extOK = TRUE;
    						$this->pObj->log_setTSlogMessage('Using "libunzipped" for extraction of Open Office files, "'.$extension.'".',1);
    					} else {
    						$this->pObj->log_setTSlogMessage('The extension "libunzipped" was not loaded (for extraction of Open Office files, "'.$extension.'")',2);
    					}
    				} else {
    						// ruby + ooo_extract
    					if ($indexerConfig['OOoExtract'])	{
    							// The ruby path is optional; start from an empty string so the variable is always defined.
    						$rubyPath = '';
    						if ($indexerConfig['ruby'])	{
    							$rubyPath = rtrim($indexerConfig['ruby'],'/').'/';
    						}
    						$oooExPath = rtrim($indexerConfig['OOoExtract'],'/').'/';
    							// Without a configured ruby path only the script itself is checked.
    						$rubyFound = $rubyPath ? @is_file($rubyPath.'ruby'.$exe) : TRUE;
    						if (ini_get('safe_mode') || ($rubyFound && @is_file($oooExPath.'ooo_extract.rb')))	{
    							$this->app['ruby'] = $rubyPath.'ruby'.$exe;
    							$this->app['OOo'] = $oooExPath.'ooo_extract.rb';
    							$extOK = TRUE;
    						} else {
    							$this->pObj->log_setTSlogMessage("'Ruby and OOo_extract' tools for reading OpenOffice.org documents were not found in paths '".$rubyPath."ruby".$exe."' OR '".$oooExPath."ooo_extract.rb'",3);
    						}
    					} else {
    						$this->pObj->log_setTSlogMessage('Ruby & OOo_extract tools (OpenOffice-files) disabled',1);
    					}
    				}
    			break;
    			case 'rtf':
    					// unrtf
    				if ($indexerConfig['unrtf'])	{
    					$unrtfPath = rtrim($indexerConfig['unrtf'],'/').'/';
    					if (ini_get('safe_mode') || @is_file($unrtfPath.'unrtf'.$exe))	{
    						$this->app['unrtf'] = $unrtfPath.'unrtf'.$exe;
    						$extOK = TRUE;
    					} else {
    						$this->pObj->log_setTSlogMessage("'unrtf' tool for reading RTF-files was not found in paths '".$unrtfPath."unrtf'",3);
    					}
    				} else {
    					$this->pObj->log_setTSlogMessage('unrtf tool (RTF-files) disabled',1);
    				}
    			break;
    			case 'txt':		// Raw text
    			case 'csv':		// Raw text
    			case 'xml':		// PHP strip-tags()
    			case 'tif':		// PHP EXIF
    				$extOK = TRUE;
    			break;
    			case 'html':	// PHP strip-tags()
    			case 'htm':		// PHP strip-tags()
    				$extOK = TRUE;
    				$mainExtension = 'html';	// making "html" the common "item_type"
    			break;
    			case 'jpg':		// PHP EXIF
    			case 'jpeg':	// PHP EXIF
    				$extOK = TRUE;
    				$mainExtension = 'jpeg';	// making "jpeg" the common item_type
    			break;
    		}

    			// If extension was OK, register it:
    		if ($extOK)	{
    			$this->supportedExtensions[$extension] = TRUE;
    			$this->ext2itemtype_map[$extension] = $mainExtension ? $mainExtension : $extension;
    			return TRUE;
    		}

    			// Explicit FALSE (as documented) instead of falling off the end of the function.
    		return FALSE;
    	}

    	/**
    	 * Initialize external parser for backend modules
    	 * Doesn't evaluate if parser is configured right - more like returning POSSIBLE supported extensions (for showing icons etc) in backend and frontend plugin
    	 *
    	 * @param	string		File extension to initialize for.
    	 * @return	boolean		Returns true if the extension is one this class knows about, otherwise false.
    	 */
    	function softInit($extension)	{
    		switch($extension)	{
    			case 'pdf':		// PDF
    			case 'doc':		// MS Word files
    			case 'pps':		// MS PowerPoint
    			case 'ppt':		// MS PowerPoint
    			case 'xls':		// MS Excel
    			case 'sxc':		// Open Office Calc.
    			case 'sxi':		// Open Office Impress
    			case 'sxw':		// Open Office Writer
    			case 'rtf':		// RTF documents
    			case 'txt':		// ASCII Text documents
    			case 'html':	// HTML
    			case 'htm':		// HTML
    			case 'csv':		// Comma Separated Values
    			case 'xml':		// Generic XML
    			case 'jpg':		// Jpeg images (EXIF comment)
    			case 'jpeg':	// Jpeg images (EXIF comment)
    			case 'tif':		// TIF images (EXIF comment)
    				return TRUE;
    		}
    		return FALSE;
    	}

    	/**
    	 * Return title of entry in media type selector box.
    	 *
    	 * @param	string		File extension
    	 * @return	mixed		String with label value of entry in media type search selector box (frontend plugin). FALSE if the extension is ignored, disabled, a duplicate ("htm", "jpg") or unknown.
    	 */
    	function searchTypeMediaTitle($extension)	{

    			// Read indexer-config
    		$indexerConfig = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['indexed_search']);

    			// Ignore extensions
    		$ignoreExtensions = t3lib_div::trimExplode(',', strtolower($indexerConfig['ignoreExtensions']),1);
    		if (in_array($extension, $ignoreExtensions))	{
    			return FALSE;
    		}

    		$title = FALSE;

    			// Switch on file extension:
    		switch($extension)	{
    			case 'pdf':
    					// PDF
    				if ($indexerConfig['pdftools'])	{
    					$title = 'PDF';
    				}
    			break;
    			case 'doc':
    					// Catdoc
    				if ($indexerConfig['catdoc'])	{
    					$title = 'MS Word';
    				}
    			break;
    			case 'pps':		// MS PowerPoint(?)
    			case 'ppt':		// MS PowerPoint
    					// ppthtml
    				if ($indexerConfig['ppthtml'])	{
    					$title = 'MS Powerpoint';
    				}
    			break;
    			case 'xls':		// MS Excel
    					// Xlhtml
    				if ($indexerConfig['xlhtml'])	{
    					$title = 'MS Excel';
    				}
    			break;
    			case 'sxc':		// Open Office Calc.
    			case 'sxi':		// Open Office Impress
    			case 'sxw':		// Open Office Writer
    					// NOTE(review): initParser() enables the ruby method based on 'OOoExtract' (with 'ruby' optional), while this check looks at 'ruby' - confirm which setting is intended here.
    				if ($indexerConfig['nativeOOMethod'] || $indexerConfig['ruby'])	{
    					$title = 'Open Office';
    				}
    			break;
    			case 'rtf':
    					// unrtf
    				if ($indexerConfig['unrtf'])	{
    					$title = 'RTF';
    				}
    			break;
    			case 'html':	// PHP strip-tags()
    			case 'jpeg':	// PHP EXIF
    			case 'txt':		// Raw text
    			case 'csv':		// Raw text
    			case 'xml':		// PHP strip-tags()
    			case 'tif':		// PHP EXIF
    				$title = strtoupper($extension);
    			break;
    				// NO entry (duplicates or blank):
    			case 'htm':		// PHP strip-tags()
    			case 'jpg':		// PHP EXIF
    			default:
    			break;
    		}

    		return $title;
    	}

    	/**
    	 * Returns true if the input extension (item_type) is a potentially a multi-page extension
    	 *
    	 * @param	string		Extension / item_type string
    	 * @return	boolean		Return true if multi-page (currently only "pdf"), otherwise false.
    	 */
    	function isMultiplePageExtension($extension)	{
    		return ((string)$extension === 'pdf');
    	}

    	/************************
    	 *
    	 * Reading documents (for parsing)
    	 *
    	 ************************/

    	/**
    	 * Reads the content of an external file being indexed.
    	 * File names handed to external tools are passed through escapeshellarg() so that spaces,
    	 * quotes and shell meta characters in file names cannot alter the executed command.
    	 * NOTE: escapeshellarg() is locale dependent; non-ASCII file names require a matching LC_CTYPE locale.
    	 *
    	 * @param	string		File extension, eg. "pdf", "doc" etc.
    	 * @param	string		Absolute filename of file (must exist and be validated OK before calling function)
    	 * @param	string		Pointer to section (zero for all other than PDF which will have an indication of pages into which the document should be splitted.)
    	 * @return	mixed		Standard content array (title, description, keywords, body keys). FALSE if the extension was not initialized or is unknown, NULL if the required tool is not available.
    	 */
    	function readFileContent($ext,$absFile,$cPKey)	{
    		$contentArr = NULL;

    			// Return immediately if initialization didn't set support up:
    		if (empty($this->supportedExtensions[$ext]))	return FALSE;

    		$fileArg = escapeshellarg($absFile);

    			// Switch by file extension
    		switch ($ext)	{
    			case 'pdf':
    				if (!empty($this->app['pdfinfo']))	{
    						// Getting pdf-info:
    					$infoLines = array();
    					exec($this->app['pdfinfo'].' '.$fileArg, $infoLines);
    					$pdfInfo = $this->splitPdfInfo($infoLines);
    					if (isset($pdfInfo['pages']) && intval($pdfInfo['pages']))	{
    							// Page range "low-high"; forced to integers before they go into the command line.
    						$range = explode('-',(string)$cPKey);
    						$low = intval($range[0]);
    						$high = isset($range[1]) ? intval($range[1]) : 0;

    							// Get pdf content:
    						$tempFileName = t3lib_div::tempnam('Typo3_indexer');		// Create temporary name
    						@unlink ($tempFileName);	// Delete if exists, just to be safe.
    						$cmd = $this->app['pdftotext'].' -f '.$low.' -l '.$high.' -enc UTF-8 -q '.$fileArg.' '.escapeshellarg($tempFileName);
    						$output = array();
    						exec($cmd,$output);

    							// Content stays empty if pdftotext produced no output file.
    						$content = '';
    						if (@is_file($tempFileName))	{
    							$content = t3lib_div::getUrl($tempFileName);
    							unlink($tempFileName);
    						} else {
    							$this->pObj->log_setTSlogMessage('PDFtoText Failed on this document: '.$absFile.". Maybe the PDF file is locked for printing or encrypted.",2);
    						}
    						$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
    					}
    				}
    			break;
    			case 'doc':
    				if (!empty($this->app['catdoc']))	{
    					$lines = array();
    					exec($this->app['catdoc'].' -d utf-8 '.$fileArg, $lines);
    					$content = implode(chr(10),$lines);
    					$contentArr = $this->pObj->splitRegularContent($this->removeEndJunk($content));
    				}
    			break;
    			case 'pps':
    			case 'ppt':
    				if (!empty($this->app['ppthtml']))	{
    					$lines = array();
    					exec($this->app['ppthtml'].' '.$fileArg, $lines);
    					$content = implode(chr(10),$lines);
    					$content = $this->pObj->convertHTMLToUtf8($content);
    					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
    					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
    				}
    			break;
    			case 'xls':
    				if (!empty($this->app['xlhtml']))	{
    					$lines = array();
    					exec($this->app['xlhtml'].' -nc -te '.$fileArg, $lines);
    					$content = implode(chr(10),$lines);
    					$content = $this->pObj->convertHTMLToUtf8($content);
    					$contentArr = $this->pObj->splitHTMLContent($this->removeEndJunk($content));
    					$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
    				}
    			break;
    			case 'sxi':
    			case 'sxc':
    			case 'sxw':
    				if (!empty($this->app['nativeOOMethod']))	{
    					if (t3lib_extMgm::isLoaded('libunzipped'))	{
    						global $TYPO3_CONF_VARS;
    						require_once(t3lib_extMgm::extPath('libunzipped').'class.tx_libunzipped.php');

    							// Initialize Unzip object:
    						$unzip = t3lib_div::makeInstance('tx_libunzipped');
    						$ooFiles = $unzip->init($absFile);

    						if (is_array($ooFiles))	{
    								// Read content.xml and meta.xml:
    							$content_xml = $unzip->getFileFromArchive('content.xml');
    							$meta_xml = $unzip->getFileFromArchive('meta.xml');

    								// A space is inserted before every tag so words in adjacent elements don't run together when tags are stripped.
    							$utf8_content = trim(strip_tags(str_replace('<',' <',$content_xml['content'])));
    							$contentArr = $this->pObj->splitRegularContent($utf8_content);
    							$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!

    								// Meta information (only used if meta.xml was parsed into the expected array tree)
    							$metaTree = t3lib_div::xml2tree($meta_xml['content']);
    							$metaContent = NULL;
    							if (is_array($metaTree) && isset($metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch']))	{
    								$metaContent = $metaTree['office:document-meta'][0]['ch']['office:meta'][0]['ch'];
    							}
    							if (is_array($metaContent))	{
    								if (!empty($metaContent['dc:title'][0]['values'][0]))	{
    									$contentArr['title'] = $metaContent['dc:title'][0]['values'][0];
    								}
    								$subject = isset($metaContent['dc:subject'][0]['values'][0]) ? $metaContent['dc:subject'][0]['values'][0] : '';
    								$description = isset($metaContent['dc:description'][0]['values'][0]) ? $metaContent['dc:description'][0]['values'][0] : '';
    								$contentArr['description'] = $subject.' '.$description;

    									// Keywords collected:
    								if (isset($metaContent['meta:keywords'][0]['ch']['meta:keyword']) && is_array($metaContent['meta:keywords'][0]['ch']['meta:keyword']))	{
    									if (!isset($contentArr['keywords']))	{
    										$contentArr['keywords'] = '';
    									}
    									foreach($metaContent['meta:keywords'][0]['ch']['meta:keyword'] as $kwDat)	{
    										$contentArr['keywords'].= $kwDat['values'][0].' ';
    									}
    								}
    							}
    						}
    					}
    				} else {
    					if (!empty($this->app['ruby']))	{
    							// Extracting document headers:
    						$headings = array();
    						exec($this->app['ruby'].' '.$this->app['OOo'].' --heading '.$fileArg, $headings);

    							// Extracting document text:
    						$texts = array();
    						exec($this->app['ruby'].' '.$this->app['OOo'].' '.$fileArg, $texts);

    						$content = implode(chr(10),$headings).' '.implode(chr(10),$texts);
    						$contentArr = $this->pObj->splitRegularContent($content);
    						$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
    					}
    				}
    			break;
    			case 'rtf':
    				if (!empty($this->app['unrtf']))	{
    					$lines = array();
    					exec($this->app['unrtf'].' '.$fileArg, $lines);
    					$fileContent = implode(chr(10),$lines);
    					$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
    					$contentArr = $this->pObj->splitHTMLContent($fileContent);
    				}
    			break;
    			case 'txt':
    			case 'csv':		// Raw text
    				$content = t3lib_div::getUrl($absFile);
    					// TODO: Auto-registration of charset???? -> utf-8 (Current assuming western europe...)
    				$content = $this->pObj->convertHTMLToUtf8($content, 'iso-8859-1');
    				$contentArr = $this->pObj->splitRegularContent($content);
    				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
    			break;
    			case 'html':
    			case 'htm':
    				$fileContent = t3lib_div::getUrl($absFile);
    				$fileContent = $this->pObj->convertHTMLToUtf8($fileContent);
    				$contentArr = $this->pObj->splitHTMLContent($fileContent);
    			break;
    			case 'xml':		// PHP strip-tags()
    				$fileContent = t3lib_div::getUrl($absFile);

    					// Finding charset from the XML declaration within the first 200 bytes (preg_match() replaces the removed eregi()); default is utf-8:
    				$reg = array();
    				preg_match('/^[[:space:]]*<\?xml[^>]+encoding[[:space:]]*=[[:space:]]*["\'][[:space:]]*([[:alnum:]_-]+)[[:space:]]*["\']/i',substr((string)$fileContent,0,200),$reg);
    				$charset = !empty($reg[1]) ? $this->pObj->csObj->parse_charset($reg[1]) : 'utf-8';

    					// Converting content:
    				$fileContent = $this->pObj->convertHTMLToUtf8(strip_tags(str_replace('<',' <',$fileContent)), $charset);
    				$contentArr = $this->pObj->splitRegularContent($fileContent);
    				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
    			break;
    			case 'jpg':		// PHP EXIF
    			case 'jpeg':	// PHP EXIF
    			case 'tif':		// PHP EXIF
    					// Without the exif extension the image is still indexed, by file name only.
    				$exif = function_exists('exif_read_data') ? exif_read_data($absFile, 'IFD0') : FALSE;
    				$comment = '';
    				if ($exif)	{
    						// The comments in JPEG files are utf-8, while in Tif files they are 7-bit ascii.
    					$exifComment = isset($exif['COMMENT'][0]) ? $exif['COMMENT'][0] : '';
    					$exifDescription = isset($exif['ImageDescription']) ? $exif['ImageDescription'] : '';
    					$comment = trim($exifComment.' '.$exifDescription);
    				}
    				$contentArr = $this->pObj->splitRegularContent($comment);
    				$contentArr['title'] = basename($absFile);	// Make sure the title doesn't expose the absolute path!
    			break;
    			default:
    				return FALSE;
    		}

    			// If no title (and why should there be...) then the file-name is set as title. This will raise the hits considerably if the search matches the document name.
    		if (is_array($contentArr) && empty($contentArr['title']))	{
    			$contentArr['title'] = str_replace('_',' ',basename($absFile));	// Substituting "_" for " " because many filenames may have this instead of a space char.
    		}

    		return $contentArr;
    	}

    	/**
    	 * Creates an array with pointers to divisions of document.
    	 * ONLY for PDF files at this point. All other types will have an array with a single element with the value "0" (zero) coming back.
    	 * The same single-element array is returned for PDF files if pdfinfo is not initialized or reports no pages.
    	 *
    	 * @param	string		File extension
    	 * @param	string		Absolute filename (must exist and be validated OK before calling function)
    	 * @return	array		Array of pointers to sections that the document should be divided into (for PDF: "low-high" page ranges)
    	 */
    	function fileContentParts($ext,$absFile)	{
    		$cParts = array(0);

    		switch ($ext)	{
    			case 'pdf':
    					// Never build a command line without an executable:
    				if (empty($this->app['pdfinfo']))	break;

    					// Getting pdf-info:
    				$infoLines = array();
    				exec($this->app['pdfinfo'].' '.escapeshellarg($absFile), $infoLines);
    				$pdfInfo = $this->splitPdfInfo($infoLines);
    				$pages = isset($pdfInfo['pages']) ? intval($pdfInfo['pages']) : 0;

    				if ($pages > 0)	{
    					$cParts = array();

    						// Calculate number of groups: positive mode = pages per group, zero/negative mode = (abs) number of groups, never more than there are pages.
    					if ($this->pdf_mode>0)	{
    						$iter = ceil($pages/$this->pdf_mode);
    					} else {
    						$iter = t3lib_div::intInRange(abs($this->pdf_mode),1,$pages);
    					}

    						// Traverse and create intervals.
    					for ($a=0;$a<$iter;$a++)	{
    						$low = floor($a*($pages/$iter))+1;
    						$high = floor(($a+1)*($pages/$iter));
    						$cParts[] = $low.'-'.$high;
    					}
    				}
    			break;
    		}

    		return $cParts;
    	}

    	/**
    	 * Analysing PDF info into a useable format.
    	 * Each "Key: value" line becomes an entry with the lowercased, trimmed key; lines without a colon or with an empty key are skipped.
    	 *
    	 * @param	array		Array of PDF content, coming from the pdfinfo tool
    	 * @return	array		Result array
    	 * @access private
    	 * @see fileContentParts()
    	 */
    	function splitPdfInfo($pdfInfoArray)	{
    		$res = array();
    		if (is_array($pdfInfoArray))	{
    			foreach($pdfInfoArray as $line)	{
    					// Split on the first colon only, so values may contain colons themselves.
    				$parts = explode(':',$line,2);
    				if (count($parts)>1 && trim($parts[0]))	{
    					$res[strtolower(trim($parts[0]))] = trim($parts[1]);
    				}
    			}
    		}
    		return $res;
    	}

    	/**
    	 * Removes some strange char(12) characters and line breaks that tend to occur in the end of the string from external files.
    	 *
    	 * @param	string		String to clean up
    	 * @return	string		String without trailing line feeds / form feeds and with surrounding whitespace trimmed
    	 */
    	function removeEndJunk($string)	{
    			// rtrim() with an explicit character list replaces the removed ereg_replace(); trim() alone does not strip chr(12).
    		return trim(rtrim((string)$string, chr(10).chr(12)));
    	}

    	/************************
    	 *
    	 * Backend analyzer
    	 *
    	 ************************/

    	/**
    	 * Return icon for file extension
    	 * "htm" and "jpeg" share the icons of "html" and "jpg".
    	 *
    	 * @param	string		File extension, lowercase.
    	 * @return	string		Relative file reference, resolvable by t3lib_div::getFileAbsFileName()
    	 */
    	function getIcon($extension)	{
    		if ($extension=='htm')	$extension = 'html';
    		if ($extension=='jpeg')	$extension = 'jpg';

    		return 'EXT:indexed_search/pi/res/'.$extension.'.gif';
    	}
    }

    	// XCLASS hook: if the TYPO3_MODE constant is defined and a file is registered for this
    	// script under $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS'], that file is included once.
    	// NOTE(review): presumably the included file declares a class extending tx_indexed_search_extparse - confirm against the TYPO3 XCLASS convention.
    if (defined('TYPO3_MODE') && $TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php'])    {

        include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/indexed_search/class.external_parser.php']);

    }

    ?>

« Previous
1
2
Next »

(1-1/2)

Project

General

Profile

TYPO3 Core

Bug #14804 » class.external_parser.php