From b188da6f326ad6686dc97fe46fe5f8d52e093d7f Mon Sep 17 00:00:00 2001 From: Ivan Tcholakov Date: Thu, 20 Aug 2009 04:41:25 +0300 Subject: [PATCH] Issue #306 - The multibute string library: Fixing my newly made mistakes, logic upgrades, optimizations for speed. --- .../lib/multibyte_string_functions.lib.php | 113 +++++++++++++----- ...ultibyte_string_functions_internal.lib.php | 80 ++++++------- 2 files changed, 120 insertions(+), 73 deletions(-) diff --git a/main/inc/lib/multibyte_string_functions.lib.php b/main/inc/lib/multibyte_string_functions.lib.php index 9798da6ce4..861b2b007d 100644 --- a/main/inc/lib/multibyte_string_functions.lib.php +++ b/main/inc/lib/multibyte_string_functions.lib.php @@ -425,7 +425,7 @@ function api_str_split($string, $split_length = 1, $encoding = null) { $result[] = @mb_substr($string, $i, $split_length, $encoding); } } - elseif (api_iconv_supports($encoding) || api_is_utf8($encoding)) { + elseif (api_is_encoding_supported($encoding)) { for ($i = 0, $length = api_strlen($string, $encoding); $i < $length; $i += $split_length) { $result[] = api_substr($string, $i, $split_length, $encoding); } @@ -454,11 +454,12 @@ function api_stripos($haystack, $needle, $offset = 0, $encoding = null) { if (api_mb_supports($encoding)) { return @mb_stripos($haystack, $needle, $offset, $encoding); } - elseif (MBSTRING_INSTALLED && (api_iconv_supports($encoding) || _api_convert_encoding_supports($encoding))) { - return api_utf8_decode(@mb_stripos(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $offset, 'UTF-8'), $encoding); - } - elseif (api_iconv_supports($encoding) || _api_convert_encoding_supports($encoding)) { - api_strpos(api_strtolower($haystack, $encoding), api_strtolower($needle, $encoding), $offset, $encoding); + elseif (api_is_encoding_supported($encoding)) { + if (MBSTRING_INSTALLED) { + return api_utf8_decode(@mb_stripos(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $offset, 'UTF-8'), $encoding); + } else { + return api_strpos(api_strtolower($haystack, $encoding), api_strtolower($needle, $encoding), $offset, $encoding); + } } return stripos($haystack, $needle, $offset); } @@ -467,27 +468,40 @@ function api_stripos($haystack, $needle, $offset = 0, $encoding = null) { * Finds first occurrence of a string within another, case insensitive. * @param string $haystack The string from which to get the first occurrence. * @param string @needle The string to be found. - * @param bool $part (optional) Determines which portion of $haystack this function returns. The default value is FALSE. + * @param bool $before_needle (optional) Determines which portion of $haystack this function returns. The default value is FALSE. * @param string $encoding (optional) The used internally by this function character encoding. If it is omitted, the platform character set will be used by default. * @return mixed Returns the portion of $haystack, or FALSE if $needle is not found. * Notes: - * If $part is set to TRUE, the function returns all of $haystack from the beginning to the first occurrence of $needle. - * If $part is set to FALSE, the function returns all of $haystack from the first occurrence of $needle to the end. + * If $before_needle is set to TRUE, the function returns all of $haystack from the beginning to the first occurrence of $needle. + * If $before_needle is set to FALSE, the function returns all of $haystack from the first occurrence of $needle to the end. * This function is aimed at replacing the functions stristr() and mb_stristr() for human-language strings. * @link http://php.net/manual/en/function.stristr * @link http://php.net/manual/en/function.mb-stristr */ -function api_stristr($haystack, $needle, $part = false, $encoding = null) { +function api_stristr($haystack, $needle, $before_needle = false, $encoding = null) { if (empty($encoding)) { $encoding = api_mb_internal_encoding(); } if (api_mb_supports($encoding)) { - return @mb_stristr($haystack, $needle, $part, $encoding); + return @mb_stristr($haystack, $needle, $before_needle, $encoding); } - elseif (MBSTRING_INSTALLED && (api_iconv_supports($encoding) || _api_convert_encoding_supports($encoding))) { - return api_utf8_decode(@mb_stristr(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $part, 'UTF-8')); + elseif (MBSTRING_INSTALLED && api_is_encoding_supported($encoding)) { + return api_utf8_decode(@mb_stristr(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $before_needle, 'UTF-8')); + } + elseif (api_is_encoding_supported($encoding)) { + $result = api_strstr(api_strtolower($haystack, $encoding), api_strtolower($needle, $encoding), $before_needle, $encoding); + if ($result === false) { + return false; + } + if ($before_needle) { + return api_substr($haystack, 0, api_strlen($result, $encoding), $encoding); + } + return api_substr($haystack, api_strlen($haystack, $encoding) - api_strlen($result, $encoding), null, $encoding); + } + if (PHP_VERSION < 5.3) { + return stristr($haystack, $needle); } - return stristr($haystack, $needle, $part); + return stristr($haystack, $needle, $before_needle); } /** @@ -574,17 +588,17 @@ function api_strpos($haystack, $needle, $offset = 0, $encoding = null) { * Finds the last occurrence of a character in a string. * @param string $haystack The string from which to get the last occurrence. * @param string $needle The string which first character is to be found. - * @param bool $part (optional) Determines which portion of $haystack this function returns. The default value is FALSE. + * @param bool $before_needle (optional) Determines which portion of $haystack this function returns. The default value is FALSE. * @param string $encoding (optional) The used internally by this function character encoding. If it is omitted, the platform character set will be used by default. * @return mixed Returns the portion of $haystack, or FALSE if the first character from $needle is not found. * Notes: - * If $part is set to TRUE, the function returns all of $haystack from the beginning to the first occurrence. - * If $part is set to FALSE, the function returns all of $haystack from the first occurrence to the end. + * If $before_needle is set to TRUE, the function returns all of $haystack from the beginning to the first occurrence. + * If $before_needle is set to FALSE, the function returns all of $haystack from the first occurrence to the end. * This function is aimed at replacing the functions strrchr() and mb_strrchr() for human-language strings. * @link http://php.net/manual/en/function.strrchr * @link http://php.net/manual/en/function.mb-strrchr */ -function api_strrchr($haystack, $needle, $part = false, $encoding = null) { +function api_strrchr($haystack, $needle, $before_needle = false, $encoding = null) { if (empty($encoding)) { $encoding = api_mb_internal_encoding(); } @@ -592,10 +606,10 @@ function api_strrchr($haystack, $needle, $part = false, $encoding = null) { return strrchr($haystack, $needle); } elseif (api_mb_supports($encoding)) { - return @mb_strrchr($haystack, $needle, $part, $encoding); + return @mb_strrchr($haystack, $needle, $before_needle, $encoding); } elseif (MBSTRING_INSTALLED && (api_iconv_supports($encoding) || _api_convert_encoding_supports($encoding))) { - return api_utf8_decode(@mb_strrchr(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $part, 'UTF-8'), $encoding); + return api_utf8_decode(@mb_strrchr(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $before_needle, 'UTF-8'), $encoding); } return strrchr($haystack, $needle); } @@ -651,30 +665,65 @@ function api_strrpos($haystack, $needle, $offset = 0, $encoding = null) { * Finds first occurrence of a string within another. * @param string $haystack The string from which to get the first occurrence. * @param string @needle The string to be found. - * @param bool $part (optional) Determines which portion of $haystack this function returns. The default value is FALSE. + * @param bool $before_needle (optional) Determines which portion of $haystack this function returns. The default value is FALSE. * @param string $encoding (optional) The used internally by this function character encoding. If it is omitted, the platform character set will be used by default. * @return mixed Returns the portion of $haystack, or FALSE if $needle is not found. * Notes: - * If $part is set to TRUE, the function returns all of $haystack from the beginning to the first occurrence of $needle. - * If $part is set to FALSE, the function returns all of $haystack from the first occurrence of $needle to the end. + * If $before_needle is set to TRUE, the function returns all of $haystack from the beginning to the first occurrence of $needle. + * If $before_needle is set to FALSE, the function returns all of $haystack from the first occurrence of $needle to the end. * This function is aimed at replacing the functions strstr() and mb_strstr() for human-language strings. * @link http://php.net/manual/en/function.strstr * @link http://php.net/manual/en/function.mb-strstr */ -function api_strstr($haystack, $needle, $part = false, $encoding = null) { +function api_strstr($haystack, $needle, $before_needle = false, $encoding = null) { if (empty($encoding)) { $encoding = api_mb_internal_encoding(); } + if (!is_string($needle)) { + $needle = (int)$needle; + if (api_is_utf8($encoding)) { + $needle = _api_utf8_chr($needle); + } else { + $needle = chr($needle); + } + } if (_api_is_single_byte_encoding($encoding)) { - return strstr($haystack, $needle, $part); + // Adding the missing parameter $before_needle to the original function strstr(), PHP_VERSION < 5.3 + if (!$before_needle) { + return strstr($haystack, $needle); + } + if (PHP_VERSION < 5.3) { + $result = explode($needle, $haystack, 2); + if ($result === false || count($result) < 2) { + return false; + } + return $result[0]; + } + return strstr($haystack, $needle, $before_needle); } if (api_mb_supports($encoding)) { - return @mb_strstr($haystack, $needle, $part, $encoding); + return @mb_strstr($haystack, $needle, $before_needle, $encoding); } - elseif (MBSTRING_INSTALLED && api_iconv_supports($encoding)) { - return api_utf8_decode(@mb_strstr(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $part, 'UTF-8'), $encoding); + elseif (MBSTRING_INSTALLED && api_is_encoding_supported($encoding)) { + $result = @mb_strstr(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $before_needle, 'UTF-8'); + if ($result !== false) { + return api_utf8_decode($result, $encoding); + } else { + return false; + } } - return strstr($haystack, $needle, $part); + // Adding the missing parameter $before_needle to the original function strstr(), PHP_VERSION < 5.3 + if (!$before_needle) { + return strstr($haystack, $needle); + } + if (PHP_VERSION < 5.3) { + $result = explode($needle, $haystack, 2); + if ($result === false || count($result) < 2) { + return false; + } + return $result[0]; + } + return strstr($haystack, $needle, $before_needle); } /** @@ -2461,7 +2510,11 @@ function api_iconv_set_encoding($type, $encoding = null) { * @return bool Returns TRUE when the specified encoding is supported, FALSE othewise. */ function api_is_encoding_supported($encoding) { - return api_mb_supports($encoding) || api_iconv_supports($encoding) || _api_convert_encoding_supports($encoding); + static $supported = array(); + if (!isset($supported[$encoding])) { + $supported[$encoding] = api_mb_supports($encoding) || api_iconv_supports($encoding) || _api_convert_encoding_supports($encoding); + } + return $supported[$encoding]; } /** diff --git a/main/inc/lib/multibyte_string_functions_internal.lib.php b/main/inc/lib/multibyte_string_functions_internal.lib.php index d782ca516e..f125b39c0b 100644 --- a/main/inc/lib/multibyte_string_functions_internal.lib.php +++ b/main/inc/lib/multibyte_string_functions_internal.lib.php @@ -21,7 +21,6 @@ function _api_convert_encoding($string, $to_encoding, $from_encoding) { static $character_map = array(); static $utf8_like = array('UTF-8', 'US-ASCII'); - static $unknown = 63; // '?' if (empty($string)) { return $string; } @@ -56,7 +55,7 @@ function _api_convert_encoding($string, $to_encoding, $from_encoding) { if (isset($character_map[$from]['local'][$ord])) { $codepoints[] = $character_map[$from]['local'][$ord]; } else { - $codepoints[] = $unknown; + $codepoints[] = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER is the general substitute character in the Unicode Standard. } } else { $codepoints[] = $ord; @@ -66,13 +65,12 @@ function _api_convert_encoding($string, $to_encoding, $from_encoding) { $codepoints = _api_utf8_to_unicode($string); } if ($to != 'UTF-8') { - $unknown_char = chr($unknown); foreach ($codepoints as $i => &$codepoint) { if ($codepoint > 127) { if (isset($character_map[$from]['local'][$codepoint])) { $codepoint = chr($character_map[$from]['local'][$codepoint]); } else { - $codepoint = $unknown_char; + $codepoint = '?'; // Unknown character. } } else { $codepoint = chr($codepoint); @@ -138,16 +136,12 @@ function &_api_parse_character_map($name) { * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates * are not allowed. * @param string $string The UTF-8 encoded string. - * @param string $unknown (optional) A US-ASCII character to represent invalid bytes. * @return array Returns an array of unicode code points. * @author Henri Sivonen, mailto:hsivonen@iki.fi * @link http://hsivonen.iki.fi/php-utf8/ * @author Ivan Tcholakov, 2009, modifications for the Dokeos LMS. */ -function _api_utf8_to_unicode($string, $unknown = '?') { - if (!empty($unknown)) { - $unknown = ord($unknown[0]); - } +function _api_utf8_to_unicode($string) { $state = 0; // cached expected number of octets after the current octet // until the beginning of the next UTF8 character sequence $codepoint = 0; // cached Unicode character @@ -204,9 +198,7 @@ function _api_utf8_to_unicode($string, $unknown = '?') { $state = 0; $codepoint = 0; $bytes = 1; - if (!empty($unknown)) { - $result[] = $unknown; - } + $result[] = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER is the general substitute character in the Unicode Standard. continue ; } } else { @@ -234,9 +226,7 @@ function _api_utf8_to_unicode($string, $unknown = '?') { $state = 0; $codepoint = 0; $bytes = 1; - if (!empty($unknown)) { - $result[] = $unknown; - } + $result[] = 0xFFFD; continue ; } if (0xFEFF != $codepoint) { @@ -254,9 +244,7 @@ function _api_utf8_to_unicode($string, $unknown = '?') { $state = 0; $codepoint = 0; $bytes = 1; - if (!empty($unknown)) { - $result[] = $unknown; - } + $result[] = 0xFFFD; } } } @@ -264,33 +252,28 @@ function _api_utf8_to_unicode($string, $unknown = '?') { } /** - * Takes an array of ints representing the Unicode characters and returns - * a UTF-8 string. Astral planes are supported ie. the ints in the - * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates - * are not allowed. - * @param array $array An array of unicode code points representing a string. - * @param string $unknown (optional) A US-ASCII character to represent invalid bytes. + * Takes an array of ints representing the Unicode characters and returns a UTF-8 string. + * @param array $codepoints An array of unicode code points representing a string. * @return string Returns a UTF-8 string constructed using the given code points. - * @author Henri Sivonen, mailto:hsivonen@iki.fi - * @link http://hsivonen.iki.fi/php-utf8/ - * @author Ivan Tcholakov, 2009, modifications for the Dokeos LMS. - * @see _api_utf8_from_unicodepoint() */ -function _api_utf8_from_unicode($array, $unknown = '?') { - foreach ($array as $i => &$codepoint) { - $codepoint = _api_utf8_from_unicodepoint($codepoint, $unknown); - } - return implode($array); +function _api_utf8_from_unicode($codepoints) { + return implode(array_map('_api_utf8_chr', $codepoints)); } /** - * Takes an integer value and returns its correspondent representing the Unicode character. + * Takes an integer value (codepoint) and returns its correspondent representing the Unicode character. + * Astral planes are supported, ie the intger input can be > 0xFFFF. Occurrances of the BOM are ignored. + * Surrogates are not allowed. * @param array $array An array of unicode code points representing a string - * @param string $unknown (optional) A US-ASCII character to represent invalid bytes. * @return string Returns the corresponding UTF-8 character. + * @author Henri Sivonen, mailto:hsivonen@iki.fi + * @link http://hsivonen.iki.fi/php-utf8/ + * @author Ivan Tcholakov, 2009, modifications for the Dokeos LMS. * @see _api_utf8_from_unicode() + * This is a UTF-8 aware version of the function chr(). + * @link http://php.net/manual/en/function.chr.php */ -function _api_utf8_from_unicodepoint($codepoint, $unknown = '?') { +function _api_utf8_chr($codepoint) { // ASCII range (including control chars) if ( ($codepoint >= 0) && ($codepoint <= 0x007f) ) { $result = chr($codepoint); @@ -304,7 +287,7 @@ function _api_utf8_from_unicodepoint($codepoint, $unknown = '?') { // Test for illegal surrogates } else if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) { // found a surrogate - $result = $unknown; + $result = _api_utf8_chr(0xFFFD); // U+FFFD REPLACEMENT CHARACTER is the general substitute character in the Unicode Standard. // 3 byte sequence } else if ($codepoint <= 0xffff) { $result = chr(0xe0 | ($codepoint >> 12)) . chr(0x80 | (($codepoint >> 6) & 0x003f)) . chr(0x80 | ($codepoint & 0x003f)); @@ -313,11 +296,27 @@ function _api_utf8_from_unicodepoint($codepoint, $unknown = '?') { $result = chr(0xf0 | ($codepoint >> 18)) . chr(0x80 | (($codepoint >> 12) & 0x3f)) . chr(0x80 | (($codepoint >> 6) & 0x3f)) . chr(0x80 | ($codepoint & 0x3f)); } else { // out of range - $result = $unknown; + $result = _api_utf8_chr(0xFFFD); } return $result; } +/** + * Takes the first UTF-8 character in a string and returns its codepoint (integer). + * @param string $utf8_character The UTF-8 encoded character. + * @return int Returns: the codepoint; or 0xFFFD (unknown character) when the input string is empty. + * This is a UTF-8 aware version of the function ord(). + * @link http://php.net/manual/en/function.ord.php + * Note about a difference with the original funtion ord(): ord('') returns 0. + */ +function _api_utf8_ord($utf8_character) { + if (empty($utf8_character)) { + return 0xFFFD; + } + $codepoints = _api_utf8_to_unicode($utf8_character); + return $codepoints[0]; +} + /** * ---------------------------------------------------------------------------- @@ -329,7 +328,6 @@ function _api_utf8_from_unicodepoint($codepoint, $unknown = '?') { function _api_utf8_get_letter_case_properties($codepoint, $type = 'lower') { static $config = array(); static $range = array(); - if (!isset($range[$codepoint])) { if ($codepoint > 128 && $codepoint < 256) { $range[$codepoint] = '0080_00ff'; // Latin-1 Supplement @@ -368,7 +366,6 @@ function _api_utf8_get_letter_case_properties($codepoint, $type = 'lower') { } else { $range[$codepoint] = false; } - if ($range[$codepoint] === false) { return null; } @@ -379,14 +376,11 @@ function _api_utf8_get_letter_case_properties($codepoint, $type = 'lower') { } } } - if ($range[$codepoint] === false || !isset($config[$range[$codepoint]])) { return null; } - $result = array(); $count = count($config[$range[$codepoint]]); - for ($i = 0; $i < $count; $i++) { if ($type === 'lower' && $config[$range[$codepoint]][$i][$type][0] === $codepoint) { $result[] = $config[$range[$codepoint]][$i];