From d477a7ce9d7534b5f76e1629a8fdf8f0302ea241 Mon Sep 17 00:00:00 2001 From: Ivan Tcholakov Date: Sat, 12 Sep 2009 09:00:49 +0300 Subject: [PATCH] Issue #306 - The multibyte string library: The function api_detect_xml_encoding() has been renamed as api_detect_encoding_xml(). New functions have been added: api_convert_encoding_xml(), api_utf8_encode_xml(), api_utf8_decode_xml(). --- .../lib/multibyte_string_functions.lib.php | 44 ++++- ...ultibyte_string_functions_internal.lib.php | 38 ++++ .../multibyte_string_functions.lib.test.php | 179 ++++++++++++------ 3 files changed, 193 insertions(+), 68 deletions(-) diff --git a/main/inc/lib/multibyte_string_functions.lib.php b/main/inc/lib/multibyte_string_functions.lib.php index e6bfe9688d..a201b6df9f 100644 --- a/main/inc/lib/multibyte_string_functions.lib.php +++ b/main/inc/lib/multibyte_string_functions.lib.php @@ -8,8 +8,9 @@ * @author: Ivan Tcholakov, ivantcholakov@gmail.com * October 2008 - initial implementation. * May 2009 - refactoring and minor corrections have been implemented. - * August 2009 - PCRE-related functions have been added, - * dependancy on mbstring extension has been removed. + * August 2009 - PCRE-related functions have been added, + * dependancy on mbstring extension has been removed. + * September 2009 - Encoding conversion functions for XML have been added. * @package dokeos.library * ============================================================================== */ @@ -541,6 +542,39 @@ function api_transliterate($string, $unknown = '?', $from_encoding = null) { return $result; } +/** + * Converts character encoding of a xml-formatted text. If inside the text the encoding is declared, it is modified accordingly. + * @param string $string The text being converted. + * @param string $to_encoding The encoding that text is being converted to. + * @param string $from_encoding (optional) The encoding that text is being converted from. If it is omited, it is tried to be detected then. + * @return string Returns the converted xml-text. + */ +function api_convert_encoding_xml($string, $to_encoding, $from_encoding = null) { + return _api_convert_encoding_xml($string, $to_encoding, $from_encoding); +} + +/** + * Converts character encoding of a xml-formatted text into UTF-8. If inside the text the encoding is declared, it is set to UTF-8. + * @param string $string The text being converted. + * @param string $from_encoding (optional) The encoding that text is being converted from. If it is omited, it is tried to be detected then. + * @return string Returns the converted xml-text. + */ +function api_utf8_encode_xml($string, $from_encoding = null) { + return _api_convert_encoding_xml($string, 'UTF-8', $from_encoding); +} + +/** + * Converts character encoding of a xml-formatted text from UTF-8 into a specified encoding. If inside the text the encoding is declared, it is modified accordingly. + * @param string $string The text being converted. + * @param string $to_encoding (optional) The encoding that text is being converted to. If it is omited, the platform character set is assumed. + * @return string Returns the converted xml-text. + */ +function api_utf8_decode_xml($string, $to_encoding = null) { + if (empty($to_encoding)) { + $to_encoding = _api_mb_internal_encoding(); + } + return _api_convert_encoding_xml($string, $to_encoding, 'UTF-8'); +} /** * ---------------------------------------------------------------------------- @@ -2549,11 +2583,9 @@ function api_get_non_utf8_encoding($language = null) { * @param string $string The input xml-formatted text. * @param string $default_encoding This is the default encoding to be returned if there is no way the xml-text's encoding to be detected. If it not spesified, the system encoding is assumed then. * @return string Returns the detected encoding. - * Note: The regular expression string has been published by Steve Minutillo. - * @link http://minutillo.com/steve/weblog/2004/6/17/php-xml-and-character-encodings-a-tale-of-sadness-rage-and-data-loss/ */ -function api_detect_xml_encoding(&$string, $default_encoding = null) { - if (preg_match('//m', $string, $matches)) { +function api_detect_encoding_xml(&$string, $default_encoding = null) { + if (preg_match(_PCRE_XML_ENCODING, $string, $matches)) { return api_refine_encoding_id($matches[1]); } if (api_is_valid_utf8($string)) { diff --git a/main/inc/lib/multibyte_string_functions_internal.lib.php b/main/inc/lib/multibyte_string_functions_internal.lib.php index acfecc5549..e56422f9e3 100644 --- a/main/inc/lib/multibyte_string_functions_internal.lib.php +++ b/main/inc/lib/multibyte_string_functions_internal.lib.php @@ -14,6 +14,18 @@ */ +/** + * ---------------------------------------------------------------------------- + * Internal constants + * ---------------------------------------------------------------------------- + */ + +// A regular expression for accessing declared encoding within xml-formatted text. +// Published by Steve Minutillo, +// http://minutillo.com/steve/weblog/2004/6/17/php-xml-and-character-encodings-a-tale-of-sadness-rage-and-data-loss/ +define('_PCRE_XML_ENCODING', '//m'); + + /** * ---------------------------------------------------------------------------- * Global variables used by some callback functions @@ -359,6 +371,32 @@ function _api_html_entity_from_unicode($codepoint) { return '&#'.$codepoint.';'; } +/** + * Converts character encoding of a xml-formatted text. If inside the text the encoding is declared, it is modified accordingly. + * @param string $string The text being converted. + * @param string $to_encoding The encoding that text is being converted to. + * @param string $from_encoding (optional) The encoding that text is being converted from. If the value is empty, it is tried to be detected then. + * @return string Returns the converted xml-text. + */ +function _api_convert_encoding_xml(&$string, $to_encoding, $from_encoding) { + if (empty($from_encoding)) { + $from_encoding = api_detect_encoding_xml($string); + } + global $_api_encoding; + $_api_encoding = api_refine_encoding_id($to_encoding); + return api_convert_encoding(preg_replace_callback(_PCRE_XML_ENCODING, '_api_convert_encoding_xml_callback', $string), $to_encoding, $from_encoding); +} + +/** + * A callback for serving the function _api_convert_encoding_xml(). + * @param array $matches Input array of matches corresponding to the xml-declaration. + * @return string Returns the xml-declaration with modified encoding. + */ +function _api_convert_encoding_xml_callback($matches) { + global $_api_encoding; + return str_replace($matches[1], $_api_encoding, $matches[0]); +} + /** * ---------------------------------------------------------------------------- diff --git a/tests/main/inc/lib/multibyte_string_functions.lib.test.php b/tests/main/inc/lib/multibyte_string_functions.lib.test.php index 0c04365beb..ffc83c1826 100644 --- a/tests/main/inc/lib/multibyte_string_functions.lib.test.php +++ b/tests/main/inc/lib/multibyte_string_functions.lib.test.php @@ -5,7 +5,7 @@ * a common purpose library for supporting multibyte string * aware functions. Only the public API is tested here. * @author Ricardo Rodriguez Salazar, 2009. - * @author Ivan Tcholakov, August 2009. + * @author Ivan Tcholakov, September 2009. * For licensing terms, see /dokeos_license.txt * * Notes: @@ -789,69 +789,28 @@ class TestMultibyte_String_Functions extends UnitTestCase { //var_dump($res); } - public function test_api_detect_xml_encoding() { - $xml1 = << - - - username1 - xxx - xxx - xxx - xxx@xx.xx - xxx - xxx - student|teacher - - - - - xxx - xxx - xxx - xxx - xxx - - - - xxx - xxx - xxx - xxx - xxx - xxx - - coursecode1 - coach1 - username1 - username2 - - - - - xxx - xxx - xxx - xxx - xxx - xxx - - coursecode1 - coach1 - username1 - username2 - - - -EOM; + public function test_api_detect_encoding_xml() { + $xml1 = ' + + + username1 + xxx + xxx + xxx + xxx@xx.xx + xxx + xxx + student + + '; // US-ASCII $xml2 = ''.$xml1; $xml3 = ''.$xml1; - $xml4 = str_replace('xxx', 'x'.chr(192).'x', $xml1); // A non-UTF-8 character has been inserted. - $res1 = api_detect_xml_encoding($xml1); - $res2 = api_detect_xml_encoding($xml2); - $res3 = api_detect_xml_encoding($xml3); - $res4 = api_detect_xml_encoding($xml4); - $res5 = api_detect_xml_encoding($xml4, 'windows-1251'); + $xml4 = str_replace('xxx', 'x'.chr(192).'x', $xml1); // A non-UTF-8 character has been inserted. + $res1 = api_detect_encoding_xml($xml1); + $res2 = api_detect_encoding_xml($xml2); + $res3 = api_detect_encoding_xml($xml3); + $res4 = api_detect_encoding_xml($xml4); + $res5 = api_detect_encoding_xml($xml4, 'windows-1251'); $this->assertTrue( $res1 === 'UTF-8' && $res2 === 'ISO-8859-15' @@ -866,6 +825,102 @@ EOM; //var_dump($res5); } + public function test_api_convert_encoding_xml() { + $xml = ' + + + + username1 + xxx + Иван + xxx + xxx@xx.xx + xxx + xxx + student + + '; // UTF-8 + $res1 = api_convert_encoding_xml($xml, 'WINDOWS-1251', 'UTF-8'); + $res2 = api_convert_encoding_xml($xml, 'WINDOWS-1251'); + $res3 = api_convert_encoding_xml($res1, 'UTF-8', 'WINDOWS-1251'); + $res4 = api_convert_encoding_xml($res2, 'UTF-8'); + $this->assertTrue( + $res3 === $xml + && $res4 === $xml + ); + //var_dump(preg_replace(array('/\r?\n/m', '/\t/m'), array('
', '    '), htmlspecialchars($res1))); + //var_dump(preg_replace(array('/\r?\n/m', '/\t/m'), array('
', '    '), htmlspecialchars($res2))); + //var_dump(preg_replace(array('/\r?\n/m', '/\t/m'), array('
', '    '), htmlspecialchars($res3))); + //var_dump(preg_replace(array('/\r?\n/m', '/\t/m'), array('
', '    '), htmlspecialchars($res4))); + } + + public function test_api_utf8_encode_xml() { + $xml1 = ' + + + + username1 + xxx + Иван + xxx + xxx@xx.xx + xxx + xxx + student + + '; // UTF-8 + $xml2 = ' + + + + username1 + xxx + '.chr(200).chr(226).chr(224).chr(237).' + xxx + xxx@xx.xx + xxx + xxx + student + + '; // WINDOWS-1251 + $res1 = api_utf8_encode_xml($xml2); + $this->assertTrue($res1 === $xml1); + //var_dump(preg_replace(array('/\r?\n/m', '/\t/m'), array('
', '    '), htmlspecialchars($res1))); + } + + public function test_api_utf8_decode_xml() { + $xml1 = ' + + + + username1 + xxx + Иван + xxx + xxx@xx.xx + xxx + xxx + student + + '; // UTF-8 + $xml2 = ' + + + + username1 + xxx + '.chr(200).chr(226).chr(224).chr(237).' + xxx + xxx@xx.xx + xxx + xxx + student + + '; // WINDOWS-1251 + $res1 = api_utf8_decode_xml($xml1, 'WINDOWS-1251'); + $this->assertTrue($res1 === $xml2); + //var_dump(preg_replace(array('/\r?\n/m', '/\t/m'), array('
', '    '), htmlspecialchars($res1))); + } /** * ----------------------------------------------------------------------------