Issue #306 - The multibyte string library: The function api_detect_xml_encoding() has been renamed to api_detect_encoding_xml(). New functions have been added: api_convert_encoding_xml(), api_utf8_encode_xml(), api_utf8_decode_xml().

skala
Ivan Tcholakov 15 years ago
parent 88fffff15f
commit d477a7ce9d
  1. 44
      main/inc/lib/multibyte_string_functions.lib.php
  2. 38
      main/inc/lib/multibyte_string_functions_internal.lib.php
  3. 179
      tests/main/inc/lib/multibyte_string_functions.lib.test.php

@ -8,8 +8,9 @@
* @author: Ivan Tcholakov, ivantcholakov@gmail.com
* October 2008 - initial implementation.
* May 2009 - refactoring and minor corrections have been implemented.
* August 2009 - PCRE-related functions have been added,
* dependency on mbstring extension has been removed.
* August 2009 - PCRE-related functions have been added,
* dependency on mbstring extension has been removed.
* September 2009 - Encoding conversion functions for XML have been added.
* @package dokeos.library
* ==============================================================================
*/
@ -541,6 +542,39 @@ function api_transliterate($string, $unknown = '?', $from_encoding = null) {
return $result;
}
/**
 * Converts an xml-formatted text from one character encoding into another.
 * This is the public entry point; the work is delegated to _api_convert_encoding_xml().
 * If the encoding is declared inside the text, the declaration is modified accordingly.
 * @param string $string					The xml-formatted text to be converted.
 * @param string $to_encoding				The encoding that the text is converted to.
 * @param string $from_encoding (optional)	The encoding that the text is converted from. If it is omitted, detection of the encoding is attempted.
 * @return string							Returns the converted xml-text.
 */
function api_convert_encoding_xml($string, $to_encoding, $from_encoding = null) {
    $converted_xml = _api_convert_encoding_xml($string, $to_encoding, $from_encoding);
    return $converted_xml;
}
/**
 * Converts an xml-formatted text into UTF-8.
 * If the encoding is declared inside the text, the declaration is set to UTF-8.
 * @param string $string					The xml-formatted text to be converted.
 * @param string $from_encoding (optional)	The encoding that the text is converted from. If it is omitted, detection of the encoding is attempted.
 * @return string							Returns the converted xml-text.
 */
function api_utf8_encode_xml($string, $from_encoding = null) {
    // The target encoding is fixed; only the source encoding may vary.
    $utf8_xml = _api_convert_encoding_xml($string, 'UTF-8', $from_encoding);
    return $utf8_xml;
}
/**
 * Converts an xml-formatted text from UTF-8 into a specified encoding.
 * If the encoding is declared inside the text, the declaration is modified accordingly.
 * @param string $string					The xml-formatted text to be converted, assumed to be UTF-8.
 * @param string $to_encoding (optional)	The encoding that the text is converted to. If it is omitted or empty, the value returned by _api_mb_internal_encoding() is used (documented as the platform character set).
 * @return string							Returns the converted xml-text.
 */
function api_utf8_decode_xml($string, $to_encoding = null) {
    // Fall back to the internal encoding only when the caller gave no usable target.
    $target_encoding = empty($to_encoding) ? _api_mb_internal_encoding() : $to_encoding;
    return _api_convert_encoding_xml($string, $target_encoding, 'UTF-8');
}
/**
* ----------------------------------------------------------------------------
@ -2549,11 +2583,9 @@ function api_get_non_utf8_encoding($language = null) {
* @param string $string The input xml-formatted text.
* @param string $default_encoding The default encoding to be returned if the xml-text's encoding cannot be detected. If it is not specified, the system encoding is assumed.
* @return string Returns the detected encoding.
* Note: The regular expression string has been published by Steve Minutillo.
* @link http://minutillo.com/steve/weblog/2004/6/17/php-xml-and-character-encodings-a-tale-of-sadness-rage-and-data-loss/
*/
function api_detect_xml_encoding(&$string, $default_encoding = null) {
if (preg_match('/<?xml.*encoding=[\'"](.*?)[\'"].*?>/m', $string, $matches)) {
function api_detect_encoding_xml(&$string, $default_encoding = null) {
if (preg_match(_PCRE_XML_ENCODING, $string, $matches)) {
return api_refine_encoding_id($matches[1]);
}
if (api_is_valid_utf8($string)) {

@ -14,6 +14,18 @@
*/
/**
* ----------------------------------------------------------------------------
* Internal constants
* ----------------------------------------------------------------------------
*/
// A regular expression for accessing the declared encoding within xml-formatted text.
// Capture group 1 holds the declared encoding name; the whole match is the xml-declaration tag.
// Based on the expression published by Steve Minutillo,
// http://minutillo.com/steve/weblog/2004/6/17/php-xml-and-character-encodings-a-tale-of-sadness-rage-and-data-loss/
// Differences from the published expression:
// - The question mark of "<?xml" is escaped. Unescaped, it made the "<" optional, so any
//   text like 'xml:lang="en" encoding="x">' was taken for an xml-declaration.
// - The match is confined to a single tag ([^>] instead of a greedy dot), so an "encoding"
//   attribute of a later element on the same line is never captured.
// - Whitespace around "=" is accepted, as the XML grammar allows it.
// - The /m modifier has been dropped; it had no effect because the pattern has no anchors.
define('_PCRE_XML_ENCODING', '/<\?xml\s[^>]*?encoding\s*=\s*[\'"]([^\'"<>]*)[\'"][^>]*>/');
/**
* ----------------------------------------------------------------------------
* Global variables used by some callback functions
@ -359,6 +371,32 @@ function _api_html_entity_from_unicode($codepoint) {
return '&#'.$codepoint.';';
}
/**
 * Converts character encoding of a xml-formatted text. If inside the text the encoding is declared, it is modified accordingly.
 * Only the first xml-declaration found is rewritten; this is the same declaration that the
 * encoding detection reads, so detection and rewriting always agree.
 * @param string $string			The text being converted. It is received by reference, but it is not modified here.
 * @param string $to_encoding		The encoding that text is being converted to.
 * @param string $from_encoding		(optional) The encoding that text is being converted from. If the value is empty, detection of the encoding is attempted.
 * @return string					Returns the converted xml-text.
 */
function _api_convert_encoding_xml(&$string, $to_encoding, $from_encoding) {
    // The callback has no other way to learn the target encoding, so it is handed over through a global.
    global $_api_encoding;
    if (empty($from_encoding)) {
        $from_encoding = api_detect_encoding_xml($string);
    }
    $_api_encoding = api_refine_encoding_id($to_encoding);
    // Limit 1: an xml-declaration may occur only once, later look-alikes must stay untouched.
    $xml = preg_replace_callback(_PCRE_XML_ENCODING, '_api_convert_encoding_xml_callback', $string, 1);
    if (is_null($xml)) {
        // A PCRE failure yields null; convert the text with its declaration unchanged instead of losing it.
        $xml = $string;
    }
    return api_convert_encoding($xml, $to_encoding, $from_encoding);
}
/**
 * A callback serving the function _api_convert_encoding_xml().
 * The new encoding name is read from the global variable $_api_encoding.
 * Note: every occurrence of the previously declared encoding name inside the matched
 * declaration is replaced, not only the value of the encoding attribute.
 * @param array $matches	Input array of matches: index 0 is the whole xml-declaration, index 1 is the declared encoding name.
 * @return string			Returns the xml-declaration with modified encoding.
 */
function _api_convert_encoding_xml_callback($matches) {
    global $_api_encoding;
    list($declaration, $declared_encoding) = $matches;
    return str_replace($declared_encoding, $_api_encoding, $declaration);
}
/**
* ----------------------------------------------------------------------------

@ -5,7 +5,7 @@
* a common purpose library for supporting multibyte string
* aware functions. Only the public API is tested here.
* @author Ricardo Rodriguez Salazar, 2009.
* @author Ivan Tcholakov, August 2009.
* @author Ivan Tcholakov, September 2009.
* For licensing terms, see /dokeos_license.txt
*
* Notes:
@ -789,69 +789,28 @@ class TestMultibyte_String_Functions extends UnitTestCase {
//var_dump($res);
}
public function test_api_detect_xml_encoding() {
$xml1 = <<<EOM
<Sessions>
<Users>
<User>
<Username>username1</Username>
<Lastname>xxx</Lastname>
<Firstname>xxx</Firstname>
<Password>xxx</Password>
<Email>xxx@xx.xx</Email>
<OfficialCode>xxx</OfficialCode>
<Phone>xxx</Phone>
<Status>student|teacher</Status>
</User>
</Users>
<Courses>
<Course>
<CourseCode>xxx</CourseCode>
<CourseTeacher>xxx</CourseTeacher>
<CourseLanguage>xxx</CourseLanguage>
<CourseTitle>xxx</CourseTitle>
<CourseDescription>xxx</CourseDescription>
</Course>
</Courses>
<Session>
<SessionName>xxx</SessionName>
<Coach>xxx</Coach>
<DateStart>xxx</DateStart>
<DateEnd>xxx</DateEnd>
<User>xxx</User>
<User>xxx</User>
<Course>
<CourseCode>coursecode1</CourseCode>
<Coach>coach1</Coach>
<User>username1</User>
<User>username2</User>
</Course>
</Session>
<Session>
<SessionName>xxx</SessionName>
<Coach>xxx</Coach>
<DateStart>xxx</DateStart>
<DateEnd>xxx</DateEnd>
<User>xxx</User>
<User>xxx</User>
<Course>
<CourseCode>coursecode1</CourseCode>
<Coach>coach1</Coach>
<User>username1</User>
<User>username2</User>
</Course>
</Session>
</Sessions>
EOM;
public function test_api_detect_encoding_xml() {
$xml1 = '
<Users>
<User>
<Username>username1</Username>
<Lastname>xxx</Lastname>
<Firstname>xxx</Firstname>
<Password>xxx</Password>
<Email>xxx@xx.xx</Email>
<OfficialCode>xxx</OfficialCode>
<Phone>xxx</Phone>
<Status>student</Status>
</User>
</Users>'; // US-ASCII
$xml2 = '<?xml version="1.0" encoding="ISO-8859-15"?>'.$xml1;
$xml3 = '<?xml version="1.0" encoding="utf-8"?>'.$xml1;
$xml4 = str_replace('<Coach>xxx</Coach>', '<Coach>x'.chr(192).'x</Coach>', $xml1); // A non-UTF-8 character has been inserted.
$res1 = api_detect_xml_encoding($xml1);
$res2 = api_detect_xml_encoding($xml2);
$res3 = api_detect_xml_encoding($xml3);
$res4 = api_detect_xml_encoding($xml4);
$res5 = api_detect_xml_encoding($xml4, 'windows-1251');
$xml4 = str_replace('<Lastname>xxx</Lastname>', '<Lastname>x'.chr(192).'x</Lastname>', $xml1); // A non-UTF-8 character has been inserted.
$res1 = api_detect_encoding_xml($xml1);
$res2 = api_detect_encoding_xml($xml2);
$res3 = api_detect_encoding_xml($xml3);
$res4 = api_detect_encoding_xml($xml4);
$res5 = api_detect_encoding_xml($xml4, 'windows-1251');
$this->assertTrue(
$res1 === 'UTF-8'
&& $res2 === 'ISO-8859-15'
@ -866,6 +825,102 @@ EOM;
//var_dump($res5);
}
/**
 * Round-trip test for api_convert_encoding_xml(): a UTF-8 document is converted
 * to WINDOWS-1251 and back, once with explicit source encodings and once with
 * the source encoding left for detection. Both round trips must give back the original text.
 */
public function test_api_convert_encoding_xml() {
    $xml = '
<?xml version="1.0" encoding="UTF-8"?>
<Users>
<User>
<Username>username1</Username>
<Lastname>xxx</Lastname>
<Firstname>Иван</Firstname>
<Password>xxx</Password>
<Email>xxx@xx.xx</Email>
<OfficialCode>xxx</OfficialCode>
<Phone>xxx</Phone>
<Status>student</Status>
</User>
</Users>'; // UTF-8
    // Forward conversion: explicit source encoding, then detected source encoding.
    $res1 = api_convert_encoding_xml($xml, 'WINDOWS-1251', 'UTF-8');
    $res2 = api_convert_encoding_xml($xml, 'WINDOWS-1251');
    // Backward conversion: explicit source encoding, then detected source encoding.
    $res3 = api_convert_encoding_xml($res1, 'UTF-8', 'WINDOWS-1251');
    $res4 = api_convert_encoding_xml($res2, 'UTF-8');
    // Separate assertions, so that a failure points at the round trip that broke.
    $this->assertIdentical($res3, $xml);
    $this->assertIdentical($res4, $xml);
    //var_dump(preg_replace(array('/\r?\n/m', '/\t/m'), array('<br />', '&nbsp;&nbsp;&nbsp;&nbsp;'), htmlspecialchars($res1)));
    //var_dump(preg_replace(array('/\r?\n/m', '/\t/m'), array('<br />', '&nbsp;&nbsp;&nbsp;&nbsp;'), htmlspecialchars($res2)));
    //var_dump(preg_replace(array('/\r?\n/m', '/\t/m'), array('<br />', '&nbsp;&nbsp;&nbsp;&nbsp;'), htmlspecialchars($res3)));
    //var_dump(preg_replace(array('/\r?\n/m', '/\t/m'), array('<br />', '&nbsp;&nbsp;&nbsp;&nbsp;'), htmlspecialchars($res4)));
}
/**
 * Tests api_utf8_encode_xml(): a WINDOWS-1251 document with a matching declaration,
 * converted without naming the source encoding, must equal the UTF-8 fixture,
 * including the rewritten encoding declaration.
 */
public function test_api_utf8_encode_xml() {
    $xml1 = '
<?xml version="1.0" encoding="UTF-8"?>
<Users>
<User>
<Username>username1</Username>
<Lastname>xxx</Lastname>
<Firstname>Иван</Firstname>
<Password>xxx</Password>
<Email>xxx@xx.xx</Email>
<OfficialCode>xxx</OfficialCode>
<Phone>xxx</Phone>
<Status>student</Status>
</User>
</Users>'; // UTF-8
    // The first name is given byte by byte, so the fixture does not depend on the encoding of this file.
    $xml2 = '
<?xml version="1.0" encoding="WINDOWS-1251"?>
<Users>
<User>
<Username>username1</Username>
<Lastname>xxx</Lastname>
<Firstname>'.chr(200).chr(226).chr(224).chr(237).'</Firstname>
<Password>xxx</Password>
<Email>xxx@xx.xx</Email>
<OfficialCode>xxx</OfficialCode>
<Phone>xxx</Phone>
<Status>student</Status>
</User>
</Users>'; // WINDOWS-1251
    $res1 = api_utf8_encode_xml($xml2);
    // assertIdentical() compares with === and reports both values on failure.
    $this->assertIdentical($res1, $xml1);
    //var_dump(preg_replace(array('/\r?\n/m', '/\t/m'), array('<br />', '&nbsp;&nbsp;&nbsp;&nbsp;'), htmlspecialchars($res1)));
}
/**
 * Tests api_utf8_decode_xml(): a UTF-8 document converted to WINDOWS-1251 must equal
 * the WINDOWS-1251 fixture, including the rewritten encoding declaration.
 */
public function test_api_utf8_decode_xml() {
    $xml1 = '
<?xml version="1.0" encoding="UTF-8"?>
<Users>
<User>
<Username>username1</Username>
<Lastname>xxx</Lastname>
<Firstname>Иван</Firstname>
<Password>xxx</Password>
<Email>xxx@xx.xx</Email>
<OfficialCode>xxx</OfficialCode>
<Phone>xxx</Phone>
<Status>student</Status>
</User>
</Users>'; // UTF-8
    // The first name is given byte by byte, so the fixture does not depend on the encoding of this file.
    $xml2 = '
<?xml version="1.0" encoding="WINDOWS-1251"?>
<Users>
<User>
<Username>username1</Username>
<Lastname>xxx</Lastname>
<Firstname>'.chr(200).chr(226).chr(224).chr(237).'</Firstname>
<Password>xxx</Password>
<Email>xxx@xx.xx</Email>
<OfficialCode>xxx</OfficialCode>
<Phone>xxx</Phone>
<Status>student</Status>
</User>
</Users>'; // WINDOWS-1251
    $res1 = api_utf8_decode_xml($xml1, 'WINDOWS-1251');
    // assertIdentical() compares with === and reports both values on failure.
    $this->assertIdentical($res1, $xml2);
    //var_dump(preg_replace(array('/\r?\n/m', '/\t/m'), array('<br />', '&nbsp;&nbsp;&nbsp;&nbsp;'), htmlspecialchars($res1)));
}
/**
* ----------------------------------------------------------------------------

Loading…
Cancel
Save