Project

General

Profile

Bug #22410 » 14050_cleaning_t3lib_cs.patch

Administrator Admin, 2010-11-24 10:30

View differences:

t3lib/class.t3lib_cs.php (revision )
<?php
/***************************************************************
* Copyright notice
*
* (c) 2003-2010 Kasper Skårhøj (kasperYYYY@typo3.com)
* All rights reserved
*
* This script is part of the Typo3 project. The Typo3 project is
* free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* The GNU General Public License can be found at
* http://www.gnu.org/copyleft/gpl.html.
*
* This script is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* This copyright notice MUST APPEAR in all copies of the script!
***************************************************************/
* Copyright notice
*
* (c) 2003-2010 Kasper Skårhøj (kasperYYYY@typo3.com)
* All rights reserved
*
* This script is part of the Typo3 project. The Typo3 project is
* free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* The GNU General Public License can be found at
* http://www.gnu.org/copyleft/gpl.html.
*
* This script is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* This copyright notice MUST APPEAR in all copies of the script!
***************************************************************/
/**
* Class for conversion between charsets.
*
......
*
*
* 136: class t3lib_cs
* 488: function parse_charset($charset)
* 488: function parse_charset($charset)
* 507: function get_locale_charset($locale)
* 507: function get_locale_charset($locale)
*
* SECTION: Charset Conversion functions
* SECTION: Charset Conversion functions
* 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
* 560: function conv($str,$fromCS,$toCS,$useEntityForNoChar=0)
* 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
* 600: function convArray(&$array,$fromCS,$toCS,$useEntityForNoChar=0)
* 617: function utf8_encode($str,$charset)
* 617: function utf8_encode($str,$charset)
* 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
* 663: function utf8_decode($str,$charset,$useEntityForNoChar=0)
* 706: function utf8_to_entities($str)
* 706: function utf8_to_entities($str)
* 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
* 739: function entities_to_utf8($str,$alsoStdHtmlEnt=0)
* 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
* 773: function utf8_to_numberarray($str,$convEntities=0,$retChar=0)
* 823: function UnumberToChar($cbyte)
* 823: function UnumberToChar($cbyte)
* 868: function utf8CharToUnumber($str,$hex=0)
* 868: function utf8CharToUnumber($str,$hex=0)
*
* SECTION: Init functions
* SECTION: Init functions
* 911: function initCharset($charset)
* 911: function initCharset($charset)
* 973: function initUnicodeData($mode=null)
* 973: function initUnicodeData($mode=null)
* 1198: function initCaseFolding($charset)
* 1198: function initCaseFolding($charset)
* 1260: function initToASCII($charset)
* 1260: function initToASCII($charset)
*
* SECTION: String operation functions
* SECTION: String operation functions
* 1331: function substr($charset,$string,$start,$len=null)
* 1331: function substr($charset,$string,$start,$len=null)
* 1384: function strlen($charset,$string)
* 1384: function strlen($charset,$string)
* 1414: function crop($charset,$string,$len,$crop='')
* 1414: function crop($charset,$string,$len,$crop='')
* 1467: function strtrunc($charset,$string,$len)
* 1467: function strtrunc($charset,$string,$len)
* 1501: function conv_case($charset,$string,$case)
* 1501: function conv_case($charset,$string,$case)
* 1527: function specCharsToASCII($charset,$string)
* 1527: function specCharsToASCII($charset,$string)
*
* SECTION: Internal string operation functions
* SECTION: Internal string operation functions
* 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
* 1567: function sb_char_mapping($str,$charset,$mode,$opt='')
*
* SECTION: Internal UTF-8 string operation functions
* SECTION: Internal UTF-8 string operation functions
* 1622: function utf8_substr($str,$start,$len=null)
* 1622: function utf8_substr($str,$start,$len=null)
* 1655: function utf8_strlen($str)
* 1655: function utf8_strlen($str)
* 1676: function utf8_strtrunc($str,$len)
* 1676: function utf8_strtrunc($str,$len)
* 1698: function utf8_strpos($haystack,$needle,$offset=0)
* 1698: function utf8_strpos($haystack,$needle,$offset=0)
* 1723: function utf8_strrpos($haystack,$needle)
* 1723: function utf8_strrpos($haystack,$needle)
* 1745: function utf8_char2byte_pos($str,$pos)
* 1745: function utf8_char2byte_pos($str,$pos)
* 1786: function utf8_byte2char_pos($str,$pos)
* 1786: function utf8_byte2char_pos($str,$pos)
* 1809: function utf8_char_mapping($str,$mode,$opt='')
* 1809: function utf8_char_mapping($str,$mode,$opt='')
*
* SECTION: Internal EUC string operation functions
* SECTION: Internal EUC string operation functions
* 1885: function euc_strtrunc($str,$len,$charset)
* 1885: function euc_strtrunc($str,$len,$charset)
* 1914: function euc_substr($str,$start,$charset,$len=null)
* 1914: function euc_substr($str,$start,$charset,$len=null)
* 1939: function euc_strlen($str,$charset)
* 1939: function euc_strlen($str,$charset)
* 1966: function euc_char2byte_pos($str,$pos,$charset)
* 1966: function euc_char2byte_pos($str,$pos,$charset)
* 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
* 2007: function euc_char_mapping($str,$charset,$mode,$opt='')
*
* TOTAL FUNCTIONS: 35
* (This index is automatically created/updated by the extension "extdeveval")
......
*/
/**
* Notes on UTF-8
*
......
* @subpackage t3lib
*/
class t3lib_cs {
var $noCharByteVal=63; // ASCII Value for chars with no equivalent.
var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
// This is the array where parsed conversion tables are stored (cached)
var $parsedCharsets=array();
var $parsedCharsets = array();
// An array where case folding data will be stored (cached)
var $caseFolding=array();
var $caseFolding = array();
// An array where charset-to-ASCII mappings are stored (cached)
var $toASCII=array();
var $toASCII = array();
// This tells the converter which charsets has two bytes per char:
var $twoByteSets=array(
var $twoByteSets = array(
'ucs-2'=>1, // 2-byte Unicode
'ucs-2' => 1, // 2-byte Unicode
);
// This tells the converter which charsets has four bytes per char:
var $fourByteSets=array(
var $fourByteSets = array(
'ucs-4'=>1, // 4-byte Unicode
'ucs-4' => 1, // 4-byte Unicode
'utf-32'=>1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
);
// This tells the converter which charsets use a scheme like the Extended Unix Code:
var $eucBasedSets=array(
var $eucBasedSets = array(
'gb2312'=>1, // Chinese, simplified.
'gb2312' => 1, // Chinese, simplified.
'big5'=>1, // Chinese, traditional.
'big5' => 1, // Chinese, traditional.
'euc-kr'=>1, // Korean
'euc-kr' => 1, // Korean
'shift_jis'=>1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
);
// see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
// http://czyborra.com/charsets/iso8859.html
var $synonyms=array(
var $synonyms = array(
'us' => 'ascii',
'us-ascii'=> 'ascii',
'us-ascii' => 'ascii',
'cp819' => 'iso-8859-1',
'ibm819' => 'iso-8859-1',
'iso-ir-100' => 'iso-8859-1',
......
);
// mapping of iso-639-1 language codes to script names
var $lang_to_script=array(
var $lang_to_script = array(
// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
'ar' => 'arabic',
'bg' => 'cyrillic', // Bulgarian
'bg' => 'cyrillic', // Bulgarian
'bs' => 'east_european', // Bosnian
'bs' => 'east_european', // Bosnian
'cs' => 'east_european', // Czech
'cs' => 'east_european', // Czech
'da' => 'west_european', // Danish
'da' => 'west_european', // Danish
'de' => 'west_european', // German
'de' => 'west_european', // German
'es' => 'west_european', // Spanish
'es' => 'west_european', // Spanish
'et' => 'estonian',
'eo' => 'unicode', // Esperanto
'eo' => 'unicode', // Esperanto
'eu' => 'west_european', // Basque
'eu' => 'west_european', // Basque
'fa' => 'arabic', // Persian
'fa' => 'arabic', // Persian
'fi' => 'west_european', // Finnish
'fi' => 'west_european', // Finnish
'fo' => 'west_european', // Faroese
'fo' => 'west_european', // Faroese
'fr' => 'west_european', // French
'fr' => 'west_european', // French
'ga' => 'west_european', // Galician
'ga' => 'west_european', // Galician
'ge' => 'unicode', // Georgian
'ge' => 'unicode', // Georgian
'gr' => 'greek',
'he' => 'hebrew', // Hebrew (since 1998)
'he' => 'hebrew', // Hebrew (since 1998)
'hi' => 'unicode', // Hindi
'hi' => 'unicode', // Hindi
'hr' => 'east_european', // Croatian
'hr' => 'east_european', // Croatian
'hu' => 'east_european', // Hungarian
'hu' => 'east_european', // Hungarian
'iw' => 'hebrew', // Hebrew (til 1998)
'iw' => 'hebrew', // Hebrew (til 1998)
'is' => 'west_european', // Icelandic
'is' => 'west_european', // Icelandic
'it' => 'west_european', // Italian
'it' => 'west_european', // Italian
'ja' => 'japanese',
'kl' => 'west_european', // Greenlandic
'kl' => 'west_european', // Greenlandic
'km' => 'unicode', // Khmer
'km' => 'unicode', // Khmer
'ko' => 'korean',
'lt' => 'lithuanian',
'lv' => 'west_european', // Latvian/Lettish
'lv' => 'west_european', // Latvian/Lettish
'nl' => 'west_european', // Dutch
'nl' => 'west_european', // Dutch
'no' => 'west_european', // Norwegian
'no' => 'west_european', // Norwegian
'nb' => 'west_european', // Norwegian Bokmal
'nb' => 'west_european', // Norwegian Bokmal
'nn' => 'west_european', // Norwegian Nynorsk
'nn' => 'west_european', // Norwegian Nynorsk
'pl' => 'east_european', // Polish
'pl' => 'east_european', // Polish
'pt' => 'west_european', // Portuguese
'pt' => 'west_european', // Portuguese
'ro' => 'east_european', // Romanian
'ro' => 'east_european', // Romanian
'ru' => 'cyrillic', // Russian
'ru' => 'cyrillic', // Russian
'sk' => 'east_european', // Slovak
'sk' => 'east_european', // Slovak
'sl' => 'east_european', // Slovenian
'sl' => 'east_european', // Slovenian
'sr' => 'cyrillic', // Serbian
'sr' => 'cyrillic', // Serbian
'sv' => 'west_european', // Swedish
'sv' => 'west_european', // Swedish
'sq' => 'albanian', // Albanian
'sq' => 'albanian', // Albanian
'th' => 'thai',
'uk' => 'cyrillic', // Ukrainian
'uk' => 'cyrillic', // Ukrainian
'vi' => 'vietnamese',
'zh' => 'chinese',
// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
'ara' => 'arabic',
'bgr' => 'cyrillic', // Bulgarian
'bgr' => 'cyrillic', // Bulgarian
'cat' => 'west_european', // Catalan
'cat' => 'west_european', // Catalan
'chs' => 'simpl_chinese',
'cht' => 'trad_chinese',
'csy' => 'east_european', // Czech
'csy' => 'east_european', // Czech
'dan' => 'west_european', // Danish
'dan' => 'west_european', // Danish
'deu' => 'west_european', // German
'deu' => 'west_european', // German
'dea' => 'west_european', // German (Austrian)
'dea' => 'west_european', // German (Austrian)
'des' => 'west_european', // German (Swiss)
'des' => 'west_european', // German (Swiss)
'ena' => 'west_european', // English (Australian)
'ena' => 'west_european', // English (Australian)
'enc' => 'west_european', // English (Canadian)
'enc' => 'west_european', // English (Canadian)
'eng' => 'west_european', // English
'eng' => 'west_european', // English
'enz' => 'west_european', // English (New Zealand)
'enz' => 'west_european', // English (New Zealand)
'enu' => 'west_european', // English (United States)
'enu' => 'west_european', // English (United States)
'euq' => 'west_european', // Basque
'euq' => 'west_european', // Basque
'fos' => 'west_european', // Faroese
'fos' => 'west_european', // Faroese
'far' => 'arabic', // Persian
'far' => 'arabic', // Persian
'fin' => 'west_european', // Finnish
'fin' => 'west_european', // Finnish
'fra' => 'west_european', // French
'fra' => 'west_european', // French
'frb' => 'west_european', // French (Belgian)
'frb' => 'west_european', // French (Belgian)
'frc' => 'west_european', // French (Canadian)
'frc' => 'west_european', // French (Canadian)
'frs' => 'west_european', // French (Swiss)
'frs' => 'west_european', // French (Swiss)
'geo' => 'unicode', // Georgian
'geo' => 'unicode', // Georgian
'glg' => 'west_european', // Galician
'glg' => 'west_european', // Galician
'ell' => 'greek',
'heb' => 'hebrew',
'hin' => 'unicode', // Hindi
'hin' => 'unicode', // Hindi
'hun' => 'east_european', // Hungarian
'hun' => 'east_european', // Hungarian
'isl' => 'west_euorpean', // Icelandic
'isl' => 'west_euorpean', // Icelandic
'ita' => 'west_european', // Italian
'ita' => 'west_european', // Italian
'its' => 'west_european', // Italian (Swiss)
'its' => 'west_european', // Italian (Swiss)
'jpn' => 'japanese',
'khm' => 'unicode', // Khmer
'khm' => 'unicode', // Khmer
'kor' => 'korean',
'lth' => 'lithuanian',
'lvi' => 'west_european', // Latvian/Lettish
'lvi' => 'west_european', // Latvian/Lettish
'msl' => 'west_european', // Malay
'msl' => 'west_european', // Malay
'nlb' => 'west_european', // Dutch (Belgian)
'nlb' => 'west_european', // Dutch (Belgian)
'nld' => 'west_european', // Dutch
'nld' => 'west_european', // Dutch
'nor' => 'west_european', // Norwegian (bokmal)
'nor' => 'west_european', // Norwegian (bokmal)
'non' => 'west_european', // Norwegian (nynorsk)
'non' => 'west_european', // Norwegian (nynorsk)
'plk' => 'east_european', // Polish
'plk' => 'east_european', // Polish
'ptg' => 'west_european', // Portuguese
'ptg' => 'west_european', // Portuguese
'ptb' => 'west_european', // Portuguese (Brazil)
'ptb' => 'west_european', // Portuguese (Brazil)
'rom' => 'east_european', // Romanian
'rom' => 'east_european', // Romanian
'rus' => 'cyrillic', // Russian
'rus' => 'cyrillic', // Russian
'slv' => 'east_european', // Slovenian
'slv' => 'east_european', // Slovenian
'sky' => 'east_european', // Slovak
'sky' => 'east_european', // Slovak
'srl' => 'east_european', // Serbian (Latin)
'srl' => 'east_european', // Serbian (Latin)
'srb' => 'cyrillic', // Serbian (Cyrillic)
'srb' => 'cyrillic', // Serbian (Cyrillic)
'esp' => 'west_european', // Spanish (trad. sort)
'esp' => 'west_european', // Spanish (trad. sort)
'esm' => 'west_european', // Spanish (Mexican)
'esm' => 'west_european', // Spanish (Mexican)
'esn' => 'west_european', // Spanish (internat. sort)
'esn' => 'west_european', // Spanish (internat. sort)
'sve' => 'west_european', // Swedish
'sve' => 'west_european', // Swedish
'sqi' => 'albanian', // Albanian
'sqi' => 'albanian', // Albanian
'tha' => 'thai',
'trk' => 'turkish',
'ukr' => 'cyrillic', // Ukrainian
'ukr' => 'cyrillic', // Ukrainian
// English language names
'albanian' => 'albanian',
'arabic' => 'arabic',
......
);
// mapping of language (family) names to charsets on Unix
var $script_to_charset_unix=array(
var $script_to_charset_unix = array(
'west_european' => 'iso-8859-1',
'estonian' => 'iso-8859-1',
'east_european' => 'iso-8859-2',
......
);
// mapping of language (family) names to charsets on Windows
var $script_to_charset_windows=array(
var $script_to_charset_windows = array(
'east_european' => 'windows-1250',
'cyrillic' => 'windows-1251',
'west_european' => 'windows-1252',
......
);
// mapping of locale names to charsets
var $locale_to_charset=array(
var $locale_to_charset = array(
'japanese.euc' => 'euc-jp',
'ja_jp.ujis' => 'euc-jp',
'korean.euc' => 'euc-kr',
......
* @return string Normalized charset
* @author Martin Kutschker <martin.t.kutschker@blackbox.net>
*/
function parse_charset($charset) {
	// Normalize the charset name: lowercase it, strip surrounding whitespace,
	// then replace a known alias by the name registered for it in $this->synonyms.
	$charset = trim(strtolower($charset));
	if (isset($this->synonyms[$charset])) {
		$charset = $this->synonyms[$charset];
	}
	return $charset;
}
......
/**
* Get the charset of a locale.
*
* ln language
* ln language
* ln_CN language / country
* ln_CN language / country
* ln_CN.cs language / country / charset
* ln_CN.cs language / country / charset
* ln_CN.cs@mod language / country / charset / modifier
*
* @param string Locale string
* @return string Charset resolved for locale string
* @author Martin Kutschker <martin.t.kutschker@blackbox.net>
*/
function get_locale_charset($locale) {
function get_locale_charset($locale) {
$locale = strtolower($locale);
// exact locale specific charset?
if (isset($this->locale_to_charset[$locale])) return $this->locale_to_charset[$locale];
if (isset($this->locale_to_charset[$locale])) {
return $this->locale_to_charset[$locale];
}
// get modifier
list($locale,$modifier) = explode('@',$locale);
list($locale, $modifier) = explode('@', $locale);
// locale contains charset: use it
list($locale,$charset) = explode('.',$locale);
list($locale, $charset) = explode('.', $locale);
if ($charset) return $this->parse_charset($charset);
if ($charset) {
return $this->parse_charset($charset);
}
// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
if ($modifier == 'euro') return 'iso-8859-15';
if ($modifier == 'euro') {
return 'iso-8859-15';
}
// get language
list($language,$country) = explode('_',$locale);
list($language, $country) = explode('_', $locale);
if (isset($this->lang_to_script[$language])) $script = $this->lang_to_script[$language];
if (isset($this->lang_to_script[$language])) {
$script = $this->lang_to_script[$language];
}
if (TYPO3_OS == 'WIN') {
if (TYPO3_OS == 'WIN') {
$cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252';
} else {
$cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'iso-8859-1';
......
}
/********************************************
*
* Charset Conversion functions
......
* @return string Converted string
* @see convArray()
*/
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	// Converts $str from charset $fromCS to charset $toCS. A configured PHP
	// extension is tried first; if none is configured or it reports failure,
	// the class' own routines convert the string with UTF-8 as pivot.

	// Identical source and target charset: nothing to convert.
	if ($fromCS == $toCS) {
		return $str;
	}

	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	if ($toCS == 'utf-8' || !$useEntityForNoChar) {
		// Read the configured method defensively so that a missing setting
		// simply selects the built-in conversion instead of raising a notice.
		$convMethod = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'])
			? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']
			: '';

		switch ($convMethod) {
			case 'mbstring':
				$conv_str = mb_convert_encoding($str, $toCS, $fromCS);
				// returns false for unsupported charsets
				if (FALSE !== $conv_str) {
					return $conv_str;
				}
				break;

			case 'iconv':
				$conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
				if (FALSE !== $conv_str) {
					return $conv_str;
				}
				break;

			case 'recode':
				$conv_str = recode_string($fromCS . '..' . $toCS, $str);
				if (FALSE !== $conv_str) {
					return $conv_str;
				}
				break;
		}
		// fallback to TYPO3 conversion
	}

	// Built-in conversion: source charset -> UTF-8 -> target charset.
	// Each step must run exactly once, otherwise the string is converted twice.
	if ($fromCS != 'utf-8') {
		$str = $this->utf8_encode($str, $fromCS);
	}
	if ($toCS != 'utf-8') {
		$str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
	}
	return $str;
}
......
* @return void
* @see conv()
*/
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	// Converts every string value of $array (passed by reference) from
	// $fromCS to $toCS in place, descending into nested arrays.
	// Non-string, non-array values are left untouched.
	foreach ($array as $key => $value) {
		// $value is only a copy; read and write through $array[$key] so that
		// the changes end up in the array handed in by the caller.
		if (is_array($array[$key])) {
			$this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
		} elseif (is_string($array[$key])) {
			$array[$key] = $this->conv($array[$key], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
......
* @param string Charset, lowercase. Must be found in csconvtbl/ folder.
* @return string Output string, converted to UTF-8
*/
function utf8_encode($str,$charset) {
function utf8_encode($str, $charset) {
if ($charset === 'utf-8') return $str;
if ($charset === 'utf-8') {
return $str;
}
// Charset is case-insensitive.
if ($this->initCharset($charset)) { // Parse conv. table if not already...
if ($this->initCharset($charset)) { // Parse conv. table if not already...
$strLen = strlen($str);
$outStr='';
$outStr = '';
for ($a=0;$a<$strLen;$a++) { // Traverse each char in string.
for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
$chr=substr($str,$a,1);
$chr = substr($str, $a, 1);
$ord=ord($chr);
$ord = ord($chr);
if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char
if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char
$ord2 = ord($str{$a+1});
$ord2 = ord($str{$a + 1});
$ord = $ord<<8 | $ord2; // assume big endian
$ord = $ord << 8 | $ord2; // assume big endian
if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
$outStr.=$this->parsedCharsets[$charset]['local'][$ord];
$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
} else $outStr.=chr($this->noCharByteVal); // No char exists
} else {
$outStr .= chr($this->noCharByteVal);
} // No char exists
$a++;
} elseif ($ord>127) { // If char has value over 127 it's a multibyte char in UTF-8
} elseif ($ord > 127) { // If char has value over 127 it's a multibyte char in UTF-8
if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
$a++;
$ord2=ord(substr($str,$a,1));
$ord2 = ord(substr($str, $a, 1));
$ord = $ord*256+$ord2;
$ord = $ord * 256 + $ord2;
}
}
if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
$outStr.= $this->parsedCharsets[$charset]['local'][$ord];
$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
} else $outStr.= chr($this->noCharByteVal); // No char exists
} else $outStr.= $chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
} else {
$outStr .= chr($this->noCharByteVal);
} // No char exists
} else {
$outStr .= $chr;
} // ... otherwise it's just ASCII 0-127 and one byte. Transparent
}
return $outStr;
}
......
* @param boolean If set, then characters that are not available in the destination character set will be encoded as numeric entities
* @return string Output string, converted to local charset
*/
function utf8_decode($str,$charset,$useEntityForNoChar=0) {
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
if ($charset === 'utf-8') {
return $str;
}
// Charset is case-insensitive.
if ($this->initCharset($charset)) { // Parse conv. table if not already...
if ($this->initCharset($charset)) { // Parse conv. table if not already...
$strLen = strlen($str);
$outStr='';
$outStr = '';
$buf='';
$buf = '';
for ($a=0,$i=0;$a<$strLen;$a++,$i++) { // Traverse each char in UTF-8 string.
for ($a = 0, $i = 0; $a < $strLen; $a++, $i++) { // Traverse each char in UTF-8 string.
$chr=substr($str,$a,1);
$chr = substr($str, $a, 1);
$ord=ord($chr);
$ord = ord($chr);
if ($ord>127) { // This means multibyte! (first byte!)
if ($ord > 127) { // This means multibyte! (first byte!)
if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
$buf=$chr; // Add first byte
$buf = $chr; // Add first byte
for ($b=0;$b<8;$b++) { // for each byte in multibyte string...
for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
$ord = $ord << 1; // Shift it left and ...
$ord = $ord << 1; // Shift it left and ...
if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
$a++; // Increase pointer...
$a++; // Increase pointer...
$buf.=substr($str,$a,1); // ... and add the next char.
$buf .= substr($str, $a, 1); // ... and add the next char.
} else break;
} else {
break;
}
}
}
if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
$mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
$mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
if ($mByte>255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
if ($mByte > 255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
$outStr.= chr(($mByte >> 8) & 255).chr($mByte & 255);
$outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
} else $outStr.= chr($mByte);
} else {
$outStr .= chr($mByte);
}
} elseif ($useEntityForNoChar) { // Create num entity:
} elseif ($useEntityForNoChar) { // Create num entity:
$outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
$outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
} else $outStr.=chr($this->noCharByteVal); // No char exists
} else $outStr.=chr($this->noCharByteVal); // No char exists (MIDDLE of MB sequence!)
} else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
} else {
$outStr .= chr($this->noCharByteVal);
} // No char exists
} else {
$outStr .= chr($this->noCharByteVal);
} // No char exists (MIDDLE of MB sequence!)
} else {
$outStr .= $chr;
} // ... otherwise it's just ASCII 0-127 and one byte. Transparent
}
return $outStr;
}
......
* @param string Input string
* @return string Output string
*/
function utf8_to_entities($str) {
function utf8_to_entities($str) {
$strLen = strlen($str);
$outStr='';
$outStr = '';
$buf='';
$buf = '';
for ($a=0;$a<$strLen;$a++) { // Traverse each char in UTF-8 string.
for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
$chr=substr($str,$a,1);
$chr = substr($str, $a, 1);
$ord=ord($chr);
$ord = ord($chr);
if ($ord>127) { // This means multibyte! (first byte!)
if ($ord > 127) { // This means multibyte! (first byte!)
if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
$buf=$chr; // Add first byte
$buf = $chr; // Add first byte
for ($b=0;$b<8;$b++) { // for each byte in multibyte string...
for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
$ord = $ord << 1; // Shift it left and ...
$ord = $ord << 1; // Shift it left and ...
if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
$a++; // Increase pointer...
$a++; // Increase pointer...
$buf.=substr($str,$a,1); // ... and add the next char.
$buf .= substr($str, $a, 1); // ... and add the next char.
} else break;
} else {
break;
}
}
}
$outStr.='&#'.$this->utf8CharToUnumber($buf,1).';';
$outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
} else $outStr.=chr($this->noCharByteVal); // No char exists (MIDDLE of MB sequence!)
} else $outStr.=$chr; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
} else {
$outStr .= chr($this->noCharByteVal);
} // No char exists (MIDDLE of MB sequence!)
} else {
$outStr .= $chr;
} // ... otherwise it's just ASCII 0-127 and one byte. Transparent
}
return $outStr;
......
* @param boolean If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
* @return string Output string
*/
function entities_to_utf8($str, $alsoStdHtmlEnt = 0) {
	// Replaces numeric entities (&#123; / &#x7b;) in $str by the character
	// returned from UnumberToChar(). If $alsoStdHtmlEnt is set, named HTML
	// entities found in PHP's translation table are replaced as well.
	// Anything else that looks like an entity is passed through unchanged.

	// Lookup table "entity => iso-8859-1 character"; stays empty unless named
	// entities were requested, so it is always defined below.
	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
		// The values are re-encoded from iso-8859-1 further down, so the table
		// has to be delivered in that charset. From PHP 5.4 on the default is
		// UTF-8, which would lead to double-encoded output; the encoding
		// parameter needed to override that exists since PHP 5.3.4.
		if (version_compare(PHP_VERSION, '5.3.4', '>=')) {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'ISO-8859-1'));
		} else {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES));
		}
	}

	// Surround the inner part of every entity with a unique token and split on
	// it: odd indexes of $parts then hold entity names, even ones plain text.
	$token = md5(microtime());
	$parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '${2}' . $token, $str));

	foreach ($parts as $k => $v) {
		if ($k % 2) {
			if (substr($v, 0, 1) == '#') { // Dec or hex entities:
				if (substr($v, 1, 1) == 'x') {
					$parts[$k] = $this->UnumberToChar(hexdec(substr($v, 2)));
				} else {
					$parts[$k] = $this->UnumberToChar(substr($v, 1));
				}
			} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities:
				$parts[$k] = $this->utf8_encode($trans_tbl['&' . $v . ';'], 'iso-8859-1');
			} else { // No conversion:
				$parts[$k] = '&' . $v . ';';
			}
		}
	}

	return implode('', $parts);
}
/**
......
* @param boolean If set, then instead of integer numbers the real UTF-8 char is returned.
* @return array Output array with the char numbers
*/
function utf8_to_numberarray($str,$convEntities=0,$retChar=0) {
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
// If entities must be registered as well...:
if ($convEntities) {
if ($convEntities) {
$str = $this->entities_to_utf8($str,1);
$str = $this->entities_to_utf8($str, 1);
}
// Do conversion:
$strLen = strlen($str);
$outArr=array();
$outArr = array();
$buf='';
$buf = '';
for ($a=0;$a<$strLen;$a++) { // Traverse each char in UTF-8 string.
for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
$chr=substr($str,$a,1);
$chr = substr($str, $a, 1);
$ord=ord($chr);
$ord = ord($chr);
if ($ord>127) { // This means multibyte! (first byte!)
if ($ord > 127) { // This means multibyte! (first byte!)
if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
$buf=$chr; // Add first byte
$buf = $chr; // Add first byte
for ($b=0;$b<8;$b++) { // for each byte in multibyte string...
for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
$ord = $ord << 1; // Shift it left and ...
$ord = $ord << 1; // Shift it left and ...
if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
$a++; // Increase pointer...
$a++; // Increase pointer...
$buf.=substr($str,$a,1); // ... and add the next char.
$buf .= substr($str, $a, 1); // ... and add the next char.
} else break;
} else {
break;
}
}
}
$outArr[]=$retChar?$buf:$this->utf8CharToUnumber($buf);
$outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
} else $outArr[]=$retChar?chr($this->noCharByteVal):$this->noCharByteVal; // No char exists (MIDDLE of MB sequence!)
} else $outArr[]=$retChar?chr($ord):$ord; // ... otherwise it's just ASCII 0-127 and one byte. Transparent
} else {
$outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
} // No char exists (MIDDLE of MB sequence!)
} else {
$outArr[] = $retChar ? chr($ord) : $ord;
} // ... otherwise it's just ASCII 0-127 and one byte. Transparent
}
return $outArr;
......
* The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
*
* bytes | bits | representation
* 1 | 7 | 0vvvvvvv
* 1 | 7 | 0vvvvvvv
* 2 | 11 | 110vvvvv 10vvvvvv
* 2 | 11 | 110vvvvv 10vvvvvv
* 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
* 3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv
* 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
* 4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
* 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
* 5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
* 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
* 6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
*
* @param integer UNICODE integer
* @return string UTF-8 multibyte character string
* @see utf8CharToUnumber()
*/
function UnumberToChar($cbyte) {
	// Encodes a single Unicode number as a UTF-8 byte sequence: the lead byte
	// carries the length marker and the highest value bits, each following
	// byte has the form 10vvvvvv and carries the next six value bits.

	// A negative number is not a character; answer with the "no character"
	// byte instead of handing a negative value to chr().
	if ($cbyte < 0) {
		return chr($this->noCharByteVal);
	}

	// 7 bit values are plain ASCII and are stored as one byte.
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

	// Pick the lead byte marker and the number of continuation bytes.
	if ($cbyte < 0x800) {
		$leadMarker = 0xC0;
		$trailCount = 1;
	} elseif ($cbyte < 0x10000) {
		$leadMarker = 0xE0;
		$trailCount = 2;
	} elseif ($cbyte < 0x200000) {
		$leadMarker = 0xF0;
		$trailCount = 3;
	} elseif ($cbyte < 0x4000000) {
		$leadMarker = 0xF8;
		$trailCount = 4;
	} elseif ($cbyte < 0x80000000) {
		$leadMarker = 0xFC;
		$trailCount = 5;
	} else { // Cannot express a 32-bit character in UTF-8
		return chr($this->noCharByteVal);
	}

	// Lead byte: marker plus the bits above all continuation payloads.
	$str = chr($leadMarker | ($cbyte >> (6 * $trailCount)));

	// Continuation bytes, most significant six bits first.
	for ($shift = 6 * ($trailCount - 1); $shift >= 0; $shift -= 6) {
		$str .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
	}

	return $str;
}
......
* @return integer UNICODE integer
* @see UnumberToChar()
*/
function utf8CharToUnumber($str, $hex = 0) {
	// Decodes the first UTF-8 character of $str into its Unicode number.
	// With $hex set the number is returned as a string of the form "x<hex>",
	// otherwise as an integer.

	$lead = ord(substr($str, 0, 1)); // First byte

	// A lead byte of a multibyte sequence has its two highest bits set.
	// 0xFE and 0xFF can never start a sequence, so they are returned as raw
	// byte values, just like stray continuation bytes and plain ASCII.
	if (($lead & 0xC0) == 0xC0 && $lead < 0xFE) {
		// The number of set bits following the first one equals the number
		// of continuation bytes (1 to 5).
		$trailCount = 1;
		while ($lead & (0x40 >> $trailCount)) {
			$trailCount++;
		}

		// Value bits of the lead byte: everything below the length marker.
		$int = $lead & (0x3F >> $trailCount);

		// Every continuation byte contributes its lower six bits. Bytes
		// missing from a truncated sequence count as zero.
		for ($i = 1; $i <= $trailCount; $i++) {
			$int = ($int << 6) | (ord(substr($str, $i, 1)) & 0x3F);
		}
	} else {
		$int = $lead;
	}

	return $hex ? 'x' . dechex($int) : $int;
}
/********************************************
*
* Init functions
......
* @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
* @access private
*/
function initCharset($charset) {
	// Loads the conversion table for $charset into $this->parsedCharsets.
	// Returns 1 if the table was already loaded, 2 if it was loaded now (from
	// the cache file or by parsing the .tbl file), FALSE if no conversion
	// table exists for the charset.

	// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';

	// Without a conversion table there is nothing to load:
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
		return FALSE;
	}

	// Cache file for charsets:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	if ($cacheFile && @is_file($cacheFile)) {
		// NOTE(review): unserialize() is applied to a file in typo3temp/; this is only safe as long as that directory is not writable by untrusted parties.
		$cached = unserialize(t3lib_div::getUrl($cacheFile));
		if (is_array($cached)) {
			$this->parsedCharsets[$charset] = $cached;
			return 2;
		}
		// An unreadable or corrupt cache file is ignored; the table is parsed again below and the cache rewritten.
	}

	// Parse conversion table into lines:
	$lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);

	// Initialize the internal variable holding the conv. table:
	$this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());

	// Pattern of the "whitespaced" table syntax:
	$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';

	// Traverse the lines:
	$detectedType = '';
	foreach ($lines as $value) {
		// Comment lines or blanks are ignored.
		if (!trim($value) || substr($value, 0, 1) == '#') {
			continue;
		}

		// Detect type if not done yet: (Done on first real line)
		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		if (!$detectedType) {
			$detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
		}

		if ($detectedType == 'ms-token') {
			$fields = preg_split('/[=:]/', $value, 3);
			$hexbyte = $fields[0];
			$utf8 = isset($fields[1]) ? $fields[1] : '';
		} else {
			$regA = array();
			if (!preg_match($whitespacedPattern, $value, $regA)) {
				continue; // Line does not follow the table syntax, it carries no mapping
			}
			$hexbyte = $regA[1];
			$utf8 = 'U+' . $regA[2];
		}

		// Only bytes above the ASCII range need a mapping:
		$decval = hexdec(trim($hexbyte));
		if ($decval > 127) {
			$utf8decval = hexdec(substr(trim($utf8), 2)); // Strip the "U+" prefix
			$utf8Char = $this->UnumberToChar($utf8decval);
			$this->parsedCharsets[$charset]['local'][$decval] = $utf8Char;
			$this->parsedCharsets[$charset]['utf8'][$utf8Char] = $decval;
		}
	}

	if ($cacheFile) {
		t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
	}

	return 2;
}
/**
* This function initializes all UTF-8 character data tables.
......
* @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
* @access private
*/
function initUnicodeData($mode=null) {
function initUnicodeData($mode = NULL) {
// cache files
$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
// Only process if the tables are not yet loaded
switch($mode) {
switch ($mode) {
case 'case':
if (is_array($this->caseFolding['utf-8'])) return 1;
if (is_array($this->caseFolding['utf-8'])) {
return 1;
}
// Use cached version if possible
if ($cacheFileCase && @is_file($cacheFileCase)) {
if ($cacheFileCase && @is_file($cacheFileCase)) {
$this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
return 2;
}
break;
case 'ascii':
if (is_array($this->toASCII['utf-8'])) return 1;
if (is_array($this->toASCII['utf-8'])) {
return 1;
}
// Use cached version if possible
if ($cacheFileASCII && @is_file($cacheFileASCII)) {
if ($cacheFileASCII && @is_file($cacheFileASCII)) {
$this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
return 2;
}
......
}
// process main Unicode data file
$unicodeDataFile = PATH_t3lib.'unidata/UnicodeData.txt';
$unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) return false;
if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
return FALSE;
}
$fh = fopen($unicodeDataFile,'rb');
$fh = fopen($unicodeDataFile, 'rb');
if (!$fh) return false;
if (!$fh) {
return FALSE;
}
// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
......
$utf8CaseFolding['toLower'] = array();
$utf8CaseFolding['toTitle'] = array();
$decomposition = array(); // array of temp. decompositions
$decomposition = array(); // array of temp. decompositions
$mark = array(); // array of chars that are marks (eg. composing accents)
$mark = array(); // array of chars that are marks (eg. composing accents)
$number = array(); // array of chars that are numbers (eg. digits)
$number = array(); // array of chars that are numbers (eg. digits)
$omit = array(); // array of chars to be omitted (eg. Russian hard sign)
$omit = array(); // array of chars to be omitted (eg. Russian hard sign)
while (!feof($fh)) {
while (!feof($fh)) {
$line = fgets($fh,4096);
$line = fgets($fh, 4096);
// has a lot of info
list($char,$name,$cat,,,$decomp,,,$num,,,,$upper,$lower,$title,) = explode(';', rtrim($line));
list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title,) = explode(';', rtrim($line));
$ord = hexdec($char);
if ($ord > 0xFFFF) break; // only process the BMP
if ($ord > 0xFFFF) {
break;
} // only process the BMP
$utf8_char = $this->UnumberToChar($ord);
if ($upper) $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
if ($lower) $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
if ($upper) {
$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
}
if ($lower) {
$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
}
// store "title" only when different from "upper" (only a few)
if ($title && $title != $upper) $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
if ($title && $title != $upper) {
$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
}
switch ($cat{0}) {
switch ($cat{0}) {
case 'M': // mark (accent, umlaut, ...)
case 'M': // mark (accent, umlaut, ...)
$mark["U+$char"] = 1;
break;
case 'N': // numeric value
case 'N': // numeric value
if ($ord > 0x80 && $num != '') $number["U+$char"] = $num;
if ($ord > 0x80 && $num != '') {
$number["U+$char"] = $num;
}
}
}
// accented Latin letters without "official" decomposition
$match = array();
if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/',$name,$match) && !$decomp) {
if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
$c = ord($match[2]);
if ($match[1] == 'SMALL') $c += 32;
if ($match[1] == 'SMALL') {
$c += 32;
}
$decomposition["U+$char"] = array(dechex($c));
continue;
}
$match = array();
if (preg_match('/(<.*>)? *(.+)/',$decomp,$match)) {
if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
switch($match[1]) {
switch ($match[1]) {
case '<circle>': // add parenthesis as circle replacement, eg (1)
case '<circle>': // add parenthesis as circle replacement, eg (1)
$match[2] = '0028 '.$match[2].' 0029';
$match[2] = '0028 ' . $match[2] . ' 0029';
break;
case '<square>': // add square brackets as square replacement, eg [1]
case '<square>': // add square brackets as square replacement, eg [1]
$match[2] = '005B '.$match[2].' 005D';
$match[2] = '005B ' . $match[2] . ' 005D';
break;
case '<compat>': // ignore multi char decompositions that start with a space
case '<compat>': // ignore multi char decompositions that start with a space
if (preg_match('/^0020 /',$match[2])) continue 2;
if (preg_match('/^0020 /', $match[2])) {
continue 2;
}
break;
// ignore Arabic and vertical layout presentation decomposition
......
fclose($fh);
// process additional Unicode data for casing (allow folded characters to expand into a sequence)
$specialCasingFile = PATH_t3lib.'unidata/SpecialCasing.txt';
$specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
$fh = fopen($specialCasingFile,'rb');
$fh = fopen($specialCasingFile, 'rb');
if ($fh) {
if ($fh) {
while (!feof($fh)) {
while (!feof($fh)) {
$line = fgets($fh,4096);
$line = fgets($fh, 4096);
if ($line{0} != '#' && trim($line) != '') {
if ($line{0} != '#' && trim($line) != '') {
list($char,$lower,$title,$upper,$cond) = t3lib_div::trimExplode(';', $line);
list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
if ($cond == '' || $cond{0} == '#') {
if ($cond == '' || $cond{0} == '#') {
$utf8_char = $this->UnumberToChar(hexdec($char));
if ($char != $lower) {
if ($char != $lower) {
$arr = explode(' ', $lower);
for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
for ($i = 0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
$utf8CaseFolding['toLower'][$utf8_char] = implode('',$arr);
$utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
}
if ($char != $title && $title != $upper) {
if ($char != $title && $title != $upper) {
$arr = explode(' ', $title);
for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
for ($i = 0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
$utf8CaseFolding['toTitle'][$utf8_char] = implode('',$arr);
$utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
}
if ($char != $upper) {
if ($char != $upper) {
$arr = explode(' ', $upper);
$arr = explode(' ', $upper);
for ($i=0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
for ($i = 0; isset($arr[$i]); $i++) $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
$utf8CaseFolding['toUpper'][$utf8_char] = implode('',$arr);
$utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
}
}
}
......
}
// process custom decompositions
$customTranslitFile = PATH_t3lib.'unidata/Translit.txt';
$customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
$fh = fopen($customTranslitFile,'rb');
$fh = fopen($customTranslitFile, 'rb');
if ($fh) {
if ($fh) {
while (!feof($fh)) {
while (!feof($fh)) {
$line = fgets($fh,4096);
$line = fgets($fh, 4096);
if ($line{0} != '#' && trim($line) != '') {
if ($line{0} != '#' && trim($line) != '') {
list($char,$translit) = t3lib_div::trimExplode(';', $line);
list($char, $translit) = t3lib_div::trimExplode(';', $line);
if (!$translit) $omit["U+$char"] = 1;
if (!$translit) {
$omit["U+$char"] = 1;
}
$decomposition["U+$char"] = explode(' ', $translit);
}
......
}
// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
foreach($decomposition as $from => $to) {
foreach ($decomposition as $from => $to) {
$code_decomp = array();
while ($code_value = array_shift($to)) {
while ($code_value = array_shift($to)) {
if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
foreach(array_reverse($decomposition["U+$code_value"]) as $cv) {
foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
array_unshift($to, $cv);
}
} elseif (!isset($mark["U+$code_value"])) { // remove mark
} elseif (!isset($mark["U+$code_value"])) { // remove mark
array_push($code_decomp, $code_value);
}
}
if (count($code_decomp) || isset($omit[$from])) {
if (count($code_decomp) || isset($omit[$from])) {
$decomposition[$from] = $code_decomp;
} else {
unset($decomposition[$from]);
......
$this->toASCII['utf-8'] = array();
$ascii =& $this->toASCII['utf-8'];
foreach($decomposition as $from => $to) {
foreach ($decomposition as $from => $to) {
$code_decomp = array();
while ($code_value = array_shift($to)) {
while ($code_value = array_shift($to)) {
$ord = hexdec($code_value);
if ($ord > 127)
continue 2; // skip decompositions containing non-ASCII chars
if ($ord > 127) {
continue 2;
} // skip decompositions containing non-ASCII chars
else
{
array_push($code_decomp,chr($ord));
array_push($code_decomp, chr($ord));
}
}
}
$ascii[$this->UnumberToChar(hexdec($from))] = join('',$code_decomp);
$ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
}
// add numeric decompositions
foreach($number as $from => $to) {
foreach ($number as $from => $to) {
$utf8_char = $this->UnumberToChar(hexdec($from));
if (!isset($ascii[$utf8_char])) {
if (!isset($ascii[$utf8_char])) {
$ascii[$utf8_char] = $to;
}
}
if ($cacheFileCase) {
if ($cacheFileCase) {
t3lib_div::writeFileToTypo3tempDir($cacheFileCase,serialize($utf8CaseFolding));
t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
}
if ($cacheFileASCII) {
if ($cacheFileASCII) {
t3lib_div::writeFileToTypo3tempDir($cacheFileASCII,serialize($ascii));
t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
}
return 3;
......
* @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
* @access private
*/
function initCaseFolding($charset) {
function initCaseFolding($charset) {
// Only process if the case table is not yet loaded:
if (is_array($this->caseFolding[$charset])) return 1;
if (is_array($this->caseFolding[$charset])) {
return 1;
}
// Use cached version if possible
$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_'.$charset.'.tbl');
$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
if ($cacheFile && @is_file($cacheFile)) {
if ($cacheFile && @is_file($cacheFile)) {
$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
return 2;
}
// init UTF-8 conversion for this charset
if (!$this->initCharset($charset)) {
if (!$this->initCharset($charset)) {
return false;
... This diff was truncated because it exceeds the maximum size that can be displayed.
(16-16/93)