From b188da6f326ad6686dc97fe46fe5f8d52e093d7f Mon Sep 17 00:00:00 2001
From: Ivan Tcholakov <ivantcholakov@gmail.com>
Date: Thu, 20 Aug 2009 04:41:25 +0300
Subject: [PATCH] Issue #306 - The multibute string library: Fixing my newly
 made mistakes, logic upgrades, optimizations for speed.

---
 .../lib/multibyte_string_functions.lib.php    | 113 +++++++++++++-----
 ...ultibyte_string_functions_internal.lib.php |  80 ++++++-------
 2 files changed, 120 insertions(+), 73 deletions(-)

diff --git a/main/inc/lib/multibyte_string_functions.lib.php b/main/inc/lib/multibyte_string_functions.lib.php
index 9798da6ce4..861b2b007d 100644
--- a/main/inc/lib/multibyte_string_functions.lib.php
+++ b/main/inc/lib/multibyte_string_functions.lib.php
@@ -425,7 +425,7 @@ function api_str_split($string, $split_length = 1, $encoding = null) {
 			$result[] = @mb_substr($string, $i, $split_length, $encoding);
 		}
 	}
-	elseif (api_iconv_supports($encoding) || api_is_utf8($encoding)) {
+	elseif (api_is_encoding_supported($encoding)) {
 		for ($i = 0, $length = api_strlen($string, $encoding); $i < $length; $i += $split_length) {
 			$result[] = api_substr($string, $i, $split_length, $encoding);
 		}
@@ -454,11 +454,12 @@ function api_stripos($haystack, $needle, $offset = 0, $encoding = null) {
 	if (api_mb_supports($encoding)) {
 		return @mb_stripos($haystack, $needle, $offset, $encoding);
 	}
-	elseif (MBSTRING_INSTALLED && (api_iconv_supports($encoding) || _api_convert_encoding_supports($encoding))) {
-		return api_utf8_decode(@mb_stripos(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $offset, 'UTF-8'), $encoding);
-	}
-	elseif (api_iconv_supports($encoding) || _api_convert_encoding_supports($encoding)) {
-		api_strpos(api_strtolower($haystack, $encoding), api_strtolower($needle, $encoding), $offset, $encoding);
+	elseif (api_is_encoding_supported($encoding)) {
+		if (MBSTRING_INSTALLED) {
+			return api_utf8_decode(@mb_stripos(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $offset, 'UTF-8'), $encoding);
+		} else {
+			return api_strpos(api_strtolower($haystack, $encoding), api_strtolower($needle, $encoding), $offset, $encoding);
+		}
 	}
 	return stripos($haystack, $needle, $offset);
 }
@@ -467,27 +468,40 @@ function api_stripos($haystack, $needle, $offset = 0, $encoding = null) {
  * Finds first occurrence of a string within another, case insensitive.
  * @param string $haystack				The string from which to get the first occurrence.
  * @param string @needle				The string to be found.
- * @param bool $part (optional)			Determines which portion of $haystack this function returns. The default value is FALSE.
+ * @param bool $before_needle (optional)	Determines which portion of $haystack this function returns. The default value is FALSE.
  * @param string $encoding (optional)	The used internally by this function character encoding. If it is omitted, the platform character set will be used by default.
  * @return mixed						Returns the portion of $haystack, or FALSE if $needle is not found.
  * Notes:
- * If $part is set to TRUE, the function returns all of $haystack from the beginning to the first occurrence of $needle.
- * If $part is set to FALSE, the function returns all of $haystack from the first occurrence of $needle to the end.
+ * If $before_needle is set to TRUE, the function returns all of $haystack from the beginning to the first occurrence of $needle.
+ * If $before_needle is set to FALSE, the function returns all of $haystack from the first occurrence of $needle to the end.
  * This function is aimed at replacing the functions stristr() and mb_stristr() for human-language strings.
  * @link http://php.net/manual/en/function.stristr
  * @link http://php.net/manual/en/function.mb-stristr
  */
-function api_stristr($haystack, $needle, $part = false, $encoding = null) {
+function api_stristr($haystack, $needle, $before_needle = false, $encoding = null) {
 	if (empty($encoding)) {
 		$encoding = api_mb_internal_encoding();
 	}
 	if (api_mb_supports($encoding)) {
-		return @mb_stristr($haystack, $needle, $part, $encoding);
+		return @mb_stristr($haystack, $needle, $before_needle, $encoding);
 	}
-	elseif (MBSTRING_INSTALLED && (api_iconv_supports($encoding) || _api_convert_encoding_supports($encoding))) {
-		return api_utf8_decode(@mb_stristr(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $part, 'UTF-8'));
+	elseif (MBSTRING_INSTALLED && api_is_encoding_supported($encoding)) {
+		return api_utf8_decode(@mb_stristr(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $before_needle, 'UTF-8'));
+	}
+	elseif (api_is_encoding_supported($encoding)) {
+		$result = api_strstr(api_strtolower($haystack, $encoding), api_strtolower($needle, $encoding), $before_needle, $encoding);
+		if ($result === false) {
+			return false;
+		}
+		if ($before_needle) {
+			return api_substr($haystack, 0, api_strlen($result, $encoding), $encoding);
+		}
+		return api_substr($haystack, api_strlen($haystack, $encoding) - api_strlen($result, $encoding), null, $encoding);
+	}
+	if (PHP_VERSION < 5.3) {
+		return stristr($haystack, $needle);
 	}
-	return stristr($haystack, $needle, $part);
+	return stristr($haystack, $needle, $before_needle);
 }
 
 /**
@@ -574,17 +588,17 @@ function api_strpos($haystack, $needle, $offset = 0, $encoding = null) {
  * Finds the last occurrence of a character in a string.
  * @param string $haystack				The string from which to get the last occurrence.
  * @param string $needle				The string which first character is to be found.
- * @param bool $part (optional)			Determines which portion of $haystack this function returns. The default value is FALSE.
+ * @param bool $before_needle (optional)	Determines which portion of $haystack this function returns. The default value is FALSE.
  * @param string $encoding (optional)	The used internally by this function character encoding. If it is omitted, the platform character set will be used by default.
  * @return mixed						Returns the portion of $haystack, or FALSE if the first character from $needle is not found.
  * Notes:
- * If $part is set to TRUE, the function returns all of $haystack from the beginning to the first occurrence.
- * If $part is set to FALSE, the function returns all of $haystack from the first occurrence to the end.
+ * If $before_needle is set to TRUE, the function returns all of $haystack from the beginning to the first occurrence.
+ * If $before_needle is set to FALSE, the function returns all of $haystack from the first occurrence to the end.
  * This function is aimed at replacing the functions strrchr() and mb_strrchr() for human-language strings.
  * @link http://php.net/manual/en/function.strrchr
  * @link http://php.net/manual/en/function.mb-strrchr
  */
-function api_strrchr($haystack, $needle, $part = false, $encoding = null) {
+function api_strrchr($haystack, $needle, $before_needle = false, $encoding = null) {
 	if (empty($encoding)) {
 		$encoding = api_mb_internal_encoding();
 	}
@@ -592,10 +606,10 @@ function api_strrchr($haystack, $needle, $part = false, $encoding = null) {
 		return strrchr($haystack, $needle);
 	}
 	elseif (api_mb_supports($encoding)) {
-		return @mb_strrchr($haystack, $needle, $part, $encoding);
+		return @mb_strrchr($haystack, $needle, $before_needle, $encoding);
 	}
 	elseif (MBSTRING_INSTALLED && (api_iconv_supports($encoding) || _api_convert_encoding_supports($encoding))) {
-		return api_utf8_decode(@mb_strrchr(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $part, 'UTF-8'), $encoding);
+		return api_utf8_decode(@mb_strrchr(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $before_needle, 'UTF-8'), $encoding);
 	}
 	return strrchr($haystack, $needle);
 }
@@ -651,30 +665,65 @@ function api_strrpos($haystack, $needle, $offset = 0, $encoding = null) {
  * Finds first occurrence of a string within another.
  * @param string $haystack				The string from which to get the first occurrence.
  * @param string @needle				The string to be found.
- * @param bool $part (optional)			Determines which portion of $haystack this function returns. The default value is FALSE.
+ * @param bool $before_needle (optional)	Determines which portion of $haystack this function returns. The default value is FALSE.
  * @param string $encoding (optional)	The used internally by this function character encoding. If it is omitted, the platform character set will be used by default.
  * @return mixed						Returns the portion of $haystack, or FALSE if $needle is not found.
  * Notes:
- * If $part is set to TRUE, the function returns all of $haystack from the beginning to the first occurrence of $needle.
- * If $part is set to FALSE, the function returns all of $haystack from the first occurrence of $needle to the end.
+ * If $before_needle is set to TRUE, the function returns all of $haystack from the beginning to the first occurrence of $needle.
+ * If $before_needle is set to FALSE, the function returns all of $haystack from the first occurrence of $needle to the end.
  * This function is aimed at replacing the functions strstr() and mb_strstr() for human-language strings.
  * @link http://php.net/manual/en/function.strstr
  * @link http://php.net/manual/en/function.mb-strstr
  */
-function api_strstr($haystack, $needle, $part = false, $encoding = null) {
+function api_strstr($haystack, $needle, $before_needle = false, $encoding = null) {
 	if (empty($encoding)) {
 		$encoding = api_mb_internal_encoding();
 	}
+	if (!is_string($needle)) {
+		$needle = (int)$needle;
+		if (api_is_utf8($encoding)) {
+			$needle = _api_utf8_chr($needle);
+		} else {
+			$needle = chr($needle);
+		}
+	}
 	if (_api_is_single_byte_encoding($encoding)) {
-		return strstr($haystack, $needle, $part);
+		// Adding the missing parameter $before_needle to the original function strstr(), PHP_VERSION < 5.3
+		if (!$before_needle) {
+			return strstr($haystack, $needle);
+		}
+		if (PHP_VERSION < 5.3) {
+			$result = explode($needle, $haystack, 2);
+			if ($result === false || count($result) < 2) {
+				return false;
+			}
+			return $result[0];
+		}
+		return strstr($haystack, $needle, $before_needle);
 	}
 	if (api_mb_supports($encoding)) {
-		return @mb_strstr($haystack, $needle, $part, $encoding);
+		return @mb_strstr($haystack, $needle, $before_needle, $encoding);
 	}
-	elseif (MBSTRING_INSTALLED && api_iconv_supports($encoding)) {
-		return api_utf8_decode(@mb_strstr(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $part, 'UTF-8'), $encoding);
+	elseif (MBSTRING_INSTALLED && api_is_encoding_supported($encoding)) {
+		$result = @mb_strstr(api_utf8_encode($haystack, $encoding), api_utf8_encode($needle, $encoding), $before_needle, 'UTF-8');
+		if ($result !== false) {
+			return api_utf8_decode($result, $encoding);
+		} else {
+			return false;
+		}
 	}
-	return strstr($haystack, $needle, $part);
+	// Adding the missing parameter $before_needle to the original function strstr(), PHP_VERSION < 5.3
+	if (!$before_needle) {
+		return strstr($haystack, $needle);
+	}
+	if (PHP_VERSION < 5.3) {
+		$result = explode($needle, $haystack, 2);
+		if ($result === false || count($result) < 2) {
+			return false;
+		}
+		return $result[0];
+	}
+	return strstr($haystack, $needle, $before_needle);
 }
 
 /**
@@ -2461,7 +2510,11 @@ function api_iconv_set_encoding($type, $encoding = null) {
  * @return bool				Returns TRUE when the specified encoding is supported, FALSE othewise.
  */
 function api_is_encoding_supported($encoding) {
-	return api_mb_supports($encoding) || api_iconv_supports($encoding) || _api_convert_encoding_supports($encoding);
+	static $supported = array();
+	if (!isset($supported[$encoding])) {
+		$supported[$encoding] = api_mb_supports($encoding) || api_iconv_supports($encoding) || _api_convert_encoding_supports($encoding);
+	}
+	return $supported[$encoding];
 }
 
 /**
diff --git a/main/inc/lib/multibyte_string_functions_internal.lib.php b/main/inc/lib/multibyte_string_functions_internal.lib.php
index d782ca516e..f125b39c0b 100644
--- a/main/inc/lib/multibyte_string_functions_internal.lib.php
+++ b/main/inc/lib/multibyte_string_functions_internal.lib.php
@@ -21,7 +21,6 @@
 function _api_convert_encoding($string, $to_encoding, $from_encoding) {
 	static $character_map = array();
 	static $utf8_like = array('UTF-8', 'US-ASCII');
-	static $unknown = 63; // '?'
 	if (empty($string)) {
 		return $string;
 	}
@@ -56,7 +55,7 @@ function _api_convert_encoding($string, $to_encoding, $from_encoding) {
 				if (isset($character_map[$from]['local'][$ord])) {
 					$codepoints[] = $character_map[$from]['local'][$ord];
 				} else {
-					$codepoints[] = $unknown;
+					$codepoints[] = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER is the general substitute character in the Unicode Standard.
 				}
 			} else {
 				$codepoints[] = $ord;
@@ -66,13 +65,12 @@ function _api_convert_encoding($string, $to_encoding, $from_encoding) {
 		$codepoints = _api_utf8_to_unicode($string);
 	}
 	if ($to != 'UTF-8') {
-		$unknown_char = chr($unknown);
 		foreach ($codepoints as $i => &$codepoint) {
 			if ($codepoint > 127) {
 				if (isset($character_map[$from]['local'][$codepoint])) {
 					$codepoint = chr($character_map[$from]['local'][$codepoint]);
 				} else {
-					$codepoint = $unknown_char;
+					$codepoint = '?'; // Unknown character.
 				}
 			} else {
 				$codepoint = chr($codepoint);
@@ -138,16 +136,12 @@ function &_api_parse_character_map($name) {
  * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
  * are not allowed.
  * @param string $string				The UTF-8 encoded string.
- * @param string $unknown (optional)	A US-ASCII character to represent invalid bytes.
  * @return array						Returns an array of unicode code points.
  * @author Henri Sivonen, mailto:hsivonen@iki.fi
  * @link http://hsivonen.iki.fi/php-utf8/
  * @author Ivan Tcholakov, 2009, modifications for the Dokeos LMS.
 */
-function _api_utf8_to_unicode($string, $unknown = '?') {
-	if (!empty($unknown)) {
-		$unknown = ord($unknown[0]);
-	}
+function _api_utf8_to_unicode($string) {
 	$state = 0;			// cached expected number of octets after the current octet
 						// until the beginning of the next UTF8 character sequence
 	$codepoint  = 0;	// cached Unicode character
@@ -204,9 +198,7 @@ function _api_utf8_to_unicode($string, $unknown = '?') {
 				$state = 0;
 				$codepoint = 0;
 				$bytes = 1;
-				if (!empty($unknown)) {
-					$result[] = $unknown;
-				}
+				$result[] = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER is the general substitute character in the Unicode Standard.
 				continue ;
 			}
 		} else {
@@ -234,9 +226,7 @@ function _api_utf8_to_unicode($string, $unknown = '?') {
 						$state = 0;
 						$codepoint = 0;
 						$bytes = 1;
-						if (!empty($unknown)) {
-							$result[] = $unknown;
-						}
+						$result[] = 0xFFFD;
 						continue ;
 					}
 					if (0xFEFF != $codepoint) {
@@ -254,9 +244,7 @@ function _api_utf8_to_unicode($string, $unknown = '?') {
 				$state = 0;
 				$codepoint = 0;
 				$bytes = 1;
-				if (!empty($unknown)) {
-					$result[] = $unknown;
-				}
+				$result[] = 0xFFFD;
 			}
 		}
 	}
@@ -264,33 +252,28 @@ function _api_utf8_to_unicode($string, $unknown = '?') {
 }
 
 /**
- * Takes an array of ints representing the Unicode characters and returns 
- * a UTF-8 string. Astral planes are supported ie. the ints in the
- * input can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
- * are not allowed.
- * @param array $array					An array of unicode code points representing a string.
- * @param string $unknown (optional)	A US-ASCII character to represent invalid bytes.
+ * Takes an array of ints representing the Unicode characters and returns a UTF-8 string.
+ * @param array $codepoints				An array of unicode code points representing a string.
  * @return string						Returns a UTF-8 string constructed using the given code points.
- * @author Henri Sivonen, mailto:hsivonen@iki.fi
- * @link http://hsivonen.iki.fi/php-utf8/
- * @author Ivan Tcholakov, 2009, modifications for the Dokeos LMS.
- * @see _api_utf8_from_unicodepoint()
 */
-function _api_utf8_from_unicode($array, $unknown = '?') {
-	foreach ($array as $i => &$codepoint) {
-		$codepoint = _api_utf8_from_unicodepoint($codepoint, $unknown);
-	}
-	return implode($array);
+function _api_utf8_from_unicode($codepoints) {
+	return implode(array_map('_api_utf8_chr', $codepoints));
 }
 
 /**
- * Takes an integer value and returns its correspondent representing the Unicode character.
+ * Takes an integer value (codepoint) and returns its correspondent representing the Unicode character.
+ * Astral planes are supported, ie the intger input can be > 0xFFFF. Occurrances of the BOM are ignored.
+ * Surrogates are not allowed.
  * @param array $array					An array of unicode code points representing a string
- * @param string $unknown (optional)	A US-ASCII character to represent invalid bytes.
  * @return string						Returns the corresponding  UTF-8 character.
+ * @author Henri Sivonen, mailto:hsivonen@iki.fi
+ * @link http://hsivonen.iki.fi/php-utf8/
+ * @author Ivan Tcholakov, 2009, modifications for the Dokeos LMS.
  * @see _api_utf8_from_unicode()
+ * This is a UTF-8 aware version of the function chr().
+ * @link http://php.net/manual/en/function.chr.php
  */
-function _api_utf8_from_unicodepoint($codepoint, $unknown = '?') {
+function _api_utf8_chr($codepoint) {
 	// ASCII range (including control chars)
 	if ( ($codepoint >= 0) && ($codepoint <= 0x007f) ) {
 		$result = chr($codepoint);
@@ -304,7 +287,7 @@ function _api_utf8_from_unicodepoint($codepoint, $unknown = '?') {
 	// Test for illegal surrogates
 	} else if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
 		// found a surrogate
-		$result = $unknown;
+		$result = _api_utf8_chr(0xFFFD); // U+FFFD REPLACEMENT CHARACTER is the general substitute character in the Unicode Standard.
 	// 3 byte sequence
 	} else if ($codepoint <= 0xffff) {
 		$result = chr(0xe0 | ($codepoint >> 12)) . chr(0x80 | (($codepoint >> 6) & 0x003f)) . chr(0x80 | ($codepoint & 0x003f));
@@ -313,11 +296,27 @@ function _api_utf8_from_unicodepoint($codepoint, $unknown = '?') {
 		$result = chr(0xf0 | ($codepoint >> 18)) . chr(0x80 | (($codepoint >> 12) & 0x3f)) . chr(0x80 | (($codepoint >> 6) & 0x3f)) . chr(0x80 | ($codepoint & 0x3f));
 	} else {
  		// out of range
-		$result = $unknown;
+		$result = _api_utf8_chr(0xFFFD);
 	}
 	return $result;
 }
 
+/**
+ * Takes the first UTF-8 character in a string and returns its codepoint (integer).
+ * @param string $utf8_character	The UTF-8 encoded character.
+ * @return int						Returns: the codepoint; or 0xFFFD (unknown character) when the input string is empty.
+ * This is a UTF-8 aware version of the function ord().
+ * @link http://php.net/manual/en/function.ord.php
+ * Note about a difference with the original funtion ord(): ord('') returns 0.
+ */
+function _api_utf8_ord($utf8_character) {
+	if (empty($utf8_character)) {
+		return 0xFFFD;
+	}
+	$codepoints = _api_utf8_to_unicode($utf8_character);
+	return $codepoints[0];
+}
+
 
 /**
  * ----------------------------------------------------------------------------
@@ -329,7 +328,6 @@ function _api_utf8_from_unicodepoint($codepoint, $unknown = '?') {
 function _api_utf8_get_letter_case_properties($codepoint, $type = 'lower') {
 	static $config = array();
 	static $range = array();
-
 	if (!isset($range[$codepoint])) {
 		if ($codepoint > 128 && $codepoint < 256)  {
 			$range[$codepoint] = '0080_00ff'; // Latin-1 Supplement
@@ -368,7 +366,6 @@ function _api_utf8_get_letter_case_properties($codepoint, $type = 'lower') {
 		} else {
 			$range[$codepoint] = false;
 		}
-
 		if ($range[$codepoint] === false) {
 			return null;
 		}
@@ -379,14 +376,11 @@ function _api_utf8_get_letter_case_properties($codepoint, $type = 'lower') {
 			}
 		}
 	}
-
 	if ($range[$codepoint] === false || !isset($config[$range[$codepoint]])) {
 		return null;
 	}
-
 	$result = array();
 	$count = count($config[$range[$codepoint]]);
-
 	for ($i = 0; $i < $count; $i++) {
 		if ($type === 'lower' && $config[$range[$codepoint]][$i][$type][0] === $codepoint) {
 			$result[] = $config[$range[$codepoint]][$i];