Issue #306 - The multibyte string library: Restructuring the functions - those for internal use are renamed and moved into the corresponding file.

skala
Ivan Tcholakov 15 years ago
parent 9376bb9d20
commit a3cb586fd0
  1. 879
      main/inc/lib/multibyte_string_functions.lib.php
  2. 318
      main/inc/lib/multibyte_string_functions_internal.lib.php
  3. 16
      tests/main/inc/lib/multibyte_string_functions.lib.test.php

File diff suppressed because it is too large Load Diff

@ -260,9 +260,9 @@ function _api_utf8_to_unicode($string) {
}
/**
* Takes an array of ints representing the Unicode characters and returns a UTF-8 string.
* @param array $codepoints An array of unicode code points representing a string.
* @return string Returns a UTF-8 string constructed using the given code points.
* Takes an array of codepoints (integer) representing Unicode characters and returns a UTF-8 string.
* @param array $codepoints An array of Unicode codepoints representing a string.
* @return string Returns a UTF-8 string constructed using the given codepoints.
*/
function _api_utf8_from_unicode($codepoints) {
return implode(array_map('_api_utf8_chr', $codepoints));
@ -525,6 +525,218 @@ function _api_get_collator_sort_flag($sort_flag = SORT_REGULAR) {
* ----------------------------------------------------------------------------
*/
/**
 * Provides the table of non-UTF-8 encodings known for every system language.
 *
 * The table is kept as a compact text definition and is parsed into an array
 * on the first call only; later calls return the already parsed array.
 *
 * @return array Returns (by reference) an array in the form
 * array('language1' => array('ENCODING1', 'ENCODING2', ...), ...).
 * Encoding identifiers are converted to upper case, language names are kept as written.
 * Note: The function api_get_non_utf8_encoding() returns the first encoding from this array that corresponds to the given language.
 */
function & _api_non_utf8_encodings() {
    // The following list may have some inconsistencies.
    // Place the most used encoding for your language in the first position.
    // If you are adding an encoding, check whether it is supported either by
    // the mbstring library or by the iconv library.
    // If you modify this list, please follow the given syntax exactly:
    //     language: ENCODING, ENCODING, ...;
    // The language names must be stripped of any suffixes, such as _unicode, _corporate, _org, etc.
    static $encodings = '
arabic: WINDOWS-1256, ISO-8859-6;
asturian: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
bosnian: WINDOWS-1250;
brazilian: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
bulgarian: WINDOWS-1251;
catalan: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
croatian: WINDOWS-1250;
czech: WINDOWS-1250, ISO-8859-2;
danish: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
dari: WINDOWS-1256;
dutch: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
english: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
euskera: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
esperanto: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
finnish: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
french: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
friulian: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
galician: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
georgian: GEORGIAN-ACADEMY, GEORGIAN-PS;
german: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
greek: WINDOWS-1253, ISO-8859-7;
hebrew: ISO-8859-8, WINDOWS-1255;
hungarian: WINDOWS-1250, ISO-8859-2;
indonesian: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
italian: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
japanese: EUC-JP, ISO-2022-JP, Shift-JIS;
korean: EUC-KR, ISO-2022-KR, CP949;
latvian: WINDOWS-1257, ISO-8859-13;
lithuanian: WINDOWS-1257, ISO-8859-13;
macedonian: WINDOWS-1251;
malay: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
norwegian: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
occitan: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
pashto: WINDOWS-1256;
persian: WINDOWS-1256;
polish: WINDOWS-1250, ISO-8859-2;
portuguese: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
quechua_cusco: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
romanian: WINDOWS-1250, ISO-8859-2;
russian: KOI8-R, WINDOWS-1251;
serbian: ISO-8859-15, WINDOWS-1252, ISO-8859-1, WINDOWS-1251;
simpl_chinese: GB2312, WINDOWS-936;
slovak: WINDOWS-1250, ISO-8859-2;
slovenian: WINDOWS-1250, ISO-8859-2;
spanish: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
swahili: ISO-8859-1;
swedish: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
thai: WINDOWS-874, ISO-8859-11;
trad_chinese: BIG-5, EUC-TW;
turkce: WINDOWS-1254, ISO-8859-9;
ukrainian: KOI8-U;
vietnamese: WINDOWS-1258, VISCII, TCVN;
yoruba: ISO-8859-15, WINDOWS-1252, ISO-8859-1;
';

    // Already parsed by a previous call.
    if (is_array($encodings)) {
        return $encodings;
    }

    // First call: turn "language: ENC1, ENC2;" records into the lookup array.
    $parsed = array();
    $records = explode(';', str_replace(' ', '', $encodings));
    foreach ($records as $record) {
        // Removes the line breaks that surround every record.
        $record = trim($record);
        if (empty($record)) {
            continue;
        }
        $fields = explode(':', $record);
        $parsed[$fields[0]] = explode(',', strtoupper($fields[1]));
    }

    // Replace the text definition with the parsed table for all later calls.
    $encodings = $parsed;
    return $encodings;
}
/**
 * Sets/Gets the internal character encoding of the common string functions within the PHP mbstring extension.
 *
 * The value is also remembered by this library. A requested encoding is remembered
 * even when the mbstring extension does not support it.
 *
 * @param string $encoding (optional) When this parameter is given, the function sets the internal encoding.
 * @return mixed When $encoding is not given, returns the remembered internal encoding
 * (read from mbstring on first use, or 'ISO-8859-15' when mbstring is not installed).
 * When $encoding is given, returns the result of mb_internal_encoding($encoding),
 * or FALSE when mbstring does not support the encoding.
 * Note: This function is used in the global initialization script for setting the internal encoding to the platform's character set.
 * @link http://php.net/manual/en/function.mb-internal-encoding
 */
function _api_mb_internal_encoding($encoding = null) {
    static $current = null;

    // Setter: remember the request first, then pass it to mbstring if possible.
    if (!empty($encoding)) {
        $current = $encoding;
        return _api_mb_supports($encoding) ? @mb_internal_encoding($encoding) : false;
    }

    // Getter: determine the value once and reuse it afterwards.
    if (is_null($current)) {
        $current = MBSTRING_INSTALLED ? @mb_internal_encoding() : 'ISO-8859-15';
    }
    return $current;
}
/**
 * Sets/Gets the internal character encoding of the regular expression functions (ereg-like) within the PHP mbstring extension.
 *
 * The value is also remembered by this library. A requested encoding is remembered
 * even when the mbstring extension does not support it.
 *
 * @param string $encoding (optional) When this parameter is given, the function sets the internal encoding.
 * @return mixed When $encoding is not given, returns the remembered regex encoding
 * (read from mbstring on first use, or 'ISO-8859-15' when mbstring is not installed).
 * When $encoding is given, returns the result of mb_regex_encoding($encoding),
 * or FALSE when mbstring does not support the encoding.
 * Note: This function is used in the global initialization script for setting the internal encoding to the platform's character set.
 * @link http://php.net/manual/en/function.mb-regex-encoding
 */
function _api_mb_regex_encoding($encoding = null) {
    static $current = null;

    // Setter: remember the request first, then pass it to mbstring if possible.
    if (!empty($encoding)) {
        $current = $encoding;
        return _api_mb_supports($encoding) ? @mb_regex_encoding($encoding) : false;
    }

    // Getter: determine the value once and reuse it afterwards.
    if (is_null($current)) {
        $current = MBSTRING_INSTALLED ? @mb_regex_encoding() : 'ISO-8859-15';
    }
    return $current;
}
/**
 * Retrieves the specified internal encoding configuration variable within the PHP iconv extension.
 *
 * This is the read-only form of _api_iconv_set_encoding(): the call is delegated
 * to that function without an encoding to be set.
 *
 * @param string $type The parameter $type could be: 'iconv_internal_encoding', 'iconv_input_encoding', or 'iconv_output_encoding'.
 * @return mixed The function returns the requested encoding or FALSE on error.
 * @link http://php.net/manual/en/function.iconv-get-encoding
 */
function _api_iconv_get_encoding($type) {
    $current = _api_iconv_set_encoding($type, null);
    return $current;
}
/**
 * Sets (or, when no encoding is given, gets) the specified internal encoding
 * configuration variable within the PHP iconv extension.
 *
 * Every value that has been read or successfully set is remembered by this function.
 *
 * @param string $type The parameter $type could be: 'iconv_internal_encoding', 'iconv_input_encoding', or 'iconv_output_encoding'.
 * @param string $encoding (optional) The desired encoding to be set.
 * @return mixed When $encoding is given: TRUE on success, FALSE on error.
 * When $encoding is not given: the current encoding for the given type.
 * FALSE is returned when iconv is not installed or $type is not one of the three known names.
 * Note: This function is used in the global initialization script for setting these three internal encodings to the platform's character set.
 * @link http://php.net/manual/en/function.iconv-set-encoding
 */
function _api_iconv_set_encoding($type, $encoding = null) {
    // One remembered value per configuration variable.
    static $remembered = array(
        'internal' => null,
        'input' => null,
        'output' => null,
    );

    if (!ICONV_INSTALLED) {
        return false;
    }

    // Pick the slot that belongs to the requested configuration variable.
    switch ($type) {
        case 'iconv_internal_encoding':
            $slot = 'internal';
            break;
        case 'iconv_input_encoding':
            $slot = 'input';
            break;
        case 'iconv_output_encoding':
            $slot = 'output';
            break;
        default:
            return false;
    }

    // Getter: ask iconv only once, then answer from the remembered value.
    if (empty($encoding)) {
        if (is_null($remembered[$slot])) {
            $remembered[$slot] = @iconv_get_encoding($type);
        }
        return $remembered[$slot];
    }

    // Setter: the new value is remembered only after iconv has accepted it.
    if (_api_iconv_supports($encoding) && @iconv_set_encoding($type, $encoding)) {
        $remembered[$slot] = $encoding;
        return true;
    }
    return false;
}
// Checks whether a given encoding defines single-byte characters.
// The result might not be accurate for encodings that are unknown to this library.
function _api_is_single_byte_encoding($encoding) {
@ -536,6 +748,48 @@ function _api_is_single_byte_encoding($encoding) {
return $checked[$encoding];
}
/**
 * Checks whether the specified encoding is supported by the PHP mbstring extension.
 *
 * Both the result for each encoding and the normalized list of mbstring encodings
 * are cached, so mb_list_encodings() is called at most once.
 *
 * @param string $encoding The specified encoding.
 * @return bool Returns TRUE when the specified encoding is supported, FALSE otherwise.
 * FALSE is always returned when the mbstring extension is not installed.
 */
function _api_mb_supports($encoding) {
    static $supported = array();
    static $mb_encodings = null;

    $encoding = api_refine_encoding_id($encoding);
    if (!isset($supported[$encoding])) {
        // The list does not depend on the tested encoding, so it is built only once.
        if (is_null($mb_encodings)) {
            $mb_encodings = MBSTRING_INSTALLED
                ? array_map('api_refine_encoding_id', mb_list_encodings())
                : array();
        }
        $supported[$encoding] = in_array($encoding, $mb_encodings);
    }
    return $supported[$encoding];
}
/**
 * Checks whether the specified encoding is supported by the PHP iconv extension.
 *
 * The check is done by asking iconv for the length of a string of ASCII
 * characters (codes 32 to 127) in the tested encoding. Results are cached per encoding.
 *
 * @param string $encoding The specified encoding.
 * @return bool Returns TRUE when the specified encoding is supported, FALSE otherwise.
 * FALSE is always returned when the iconv extension is not installed.
 */
function _api_iconv_supports($encoding) {
    static $supported = array();

    $encoding = api_refine_encoding_id($encoding);
    if (isset($supported[$encoding])) {
        return $supported[$encoding];
    }

    if (!ICONV_INSTALLED) {
        $supported[$encoding] = false;
        return false;
    }

    // A failing iconv_strlen() call (warning suppressed) marks the encoding as unsupported.
    $probe = implode('', array_map('chr', range(32, 127)));
    $supported[$encoding] = (bool) @iconv_strlen($probe, $encoding);
    return $supported[$encoding];
}
// This function checks whether the function _api_convert_encoding() (the php-
// implementation) is able to convert from/to a given encoding.
function _api_convert_encoding_supports($encoding) {
@ -546,6 +800,64 @@ function _api_convert_encoding_supports($encoding) {
return $supports[encoding];
}
/**
 * Checks whether the specified encoding is supported by the html-entity related functions.
 *
 * The encoding is compared, after normalization with api_refine_encoding_id(),
 * against the list of character sets documented for htmlentities().
 * Both the normalized list and the result for each encoding are cached.
 *
 * @param string $encoding The specified encoding.
 * @return bool Returns TRUE when the specified encoding is supported, FALSE otherwise.
 */
function _api_html_entity_supports($encoding) {
    static $supported = array();
    static $html_entity_encodings = null;

    $encoding = api_refine_encoding_id($encoding);
    if (!isset($supported[$encoding])) {
        if (is_null($html_entity_encodings)) {
            // See http://php.net/manual/en/function.htmlentities.php
            // The list must be a flat array of names. Wrapping it in an extra array()
            // would make array_map() process one nested array instead of the names,
            // and no encoding would ever be found.
            $html_entity_encodings = array_map('api_refine_encoding_id', array(
                'ISO-8859-1', 'ISO8859-1',
                'ISO-8859-15', 'ISO8859-15',
                'UTF-8',
                'cp866', 'ibm866', '866',
                'cp1251', 'Windows-1251', 'win-1251', '1251',
                'cp1252', 'Windows-1252', '1252',
                'KOI8-R', 'koi8-ru', 'koi8r',
                'BIG5', '950',
                'GB2312', '936',
                'BIG5-HKSCS',
                'Shift_JIS', 'SJIS', '932',
                'EUC-JP', 'EUCJP',
            ));
        }
        $supported[$encoding] = in_array($encoding, $html_entity_encodings);
    }
    return $supported[$encoding] ? true : false;
}
/**
* ----------------------------------------------------------------------------
* Appendix to "Language management functions"
* ----------------------------------------------------------------------------
*/
/**
 * Returns the languages that can use Latin 1 encoding.
 *
 * A language is selected when api_is_latin1() accepts the first (preferred)
 * encoding listed for it by _api_non_utf8_encodings(). The list is built once
 * and reused by later calls.
 *
 * @return array The array of languages that can use Latin 1 encoding (ISO-8859-15, ISO-8859-1, WINDOWS-1252, ...).
 * Note: The returned language identifiers are the keys of the encodings table, i.e. names without suffixes.
 */
function _api_get_latin1_compatible_languages() {
    static $latin1_languages;

    if (isset($latin1_languages)) {
        return $latin1_languages;
    }

    $latin1_languages = array();
    foreach (_api_non_utf8_encodings() as $language => $candidates) {
        // Only the first encoding of a language is examined.
        if (api_is_latin1($candidates[0])) {
            $latin1_languages[] = $language;
        }
    }
    return $latin1_languages;
}
/**
* ----------------------------------------------------------------------------

@ -626,7 +626,7 @@ class TestMultibyte_String_Functions extends UnitTestCase {
}
public function testApiNonUtf8Encodings(){
$res = & api_non_utf8_encodings();
$res = & _api_non_utf8_encodings();
$this->assertTrue($res);
$this->assertTrue(is_array($res));
//var_dump($res);
@ -692,7 +692,7 @@ class TestMultibyte_String_Functions extends UnitTestCase {
public function testapi_mb_internal_encoding(){
$encoding = null;
$res = api_mb_internal_encoding($encoding);
$res = _api_mb_internal_encoding($encoding);
$this->assertTrue(is_string($res));
$this->assertTrue($res);
//var_dump($res);
@ -700,7 +700,7 @@ class TestMultibyte_String_Functions extends UnitTestCase {
public function testapi_mb_regex_encoding(){
$encoding = null;
$res = api_mb_regex_encoding($encoding);
$res = _api_mb_regex_encoding($encoding);
$this->assertTrue(is_string($res));
$this->assertTrue($res);
//var_dump($res);
@ -708,7 +708,7 @@ class TestMultibyte_String_Functions extends UnitTestCase {
public function testapi_iconv_get_encoding($type){
$type='UTF-8';
$res = api_iconv_get_encoding($type);
$res = _api_iconv_get_encoding($type);
if(!is_string($res)) :
$this->assertTrue(is_bool($res));
$this->assertTrue($res=== true || $res === false);
@ -719,7 +719,7 @@ class TestMultibyte_String_Functions extends UnitTestCase {
public function testApiIconvSetEncoding(){
$type='UTF-8';
$encoding = null;
$res = api_iconv_set_encoding($type, $encoding);
$res = _api_iconv_set_encoding($type, $encoding);
$this->assertTrue(is_bool($res));
$this->assertTrue($res === true|| $res === false);
//var_dump($res);
@ -736,7 +736,7 @@ class TestMultibyte_String_Functions extends UnitTestCase {
public function testapi_mb_supports(){
$encoding='UTF-8';
$res = api_mb_supports($encoding);
$res = _api_mb_supports($encoding);
$this->assertTrue(is_bool($res));
$this->assertTrue($res === true || $res === false);
//var_dump($res);
@ -744,7 +744,7 @@ class TestMultibyte_String_Functions extends UnitTestCase {
public function testapi_iconv_supports(){
$encoding='UTF-8';
$res = api_iconv_supports($encoding);
$res = _api_iconv_supports($encoding);
$this->assertTrue(is_bool($res));
$this->assertTrue($res === true || $res === false);
$this->assertTrue($res);
@ -753,7 +753,7 @@ class TestMultibyte_String_Functions extends UnitTestCase {
public function testapi_html_entity_supports(){
$encoding='UTF-8';
$res = api_html_entity_supports($encoding);
$res = _api_html_entity_supports($encoding);
$this->assertTrue(is_bool($res));
$this->assertTrue($res === true || $res === false);
//var_dump($res);

Loading…
Cancel
Save