Feature #272 - api_detect_encoding() has been reworked so that "broken" UTF-8 texts are successfully detected as UTF-8. A test for this case has been added.

skala
Ivan Tcholakov 15 years ago
parent 8f19fb9fbc
commit 18a568d139
  1. 30
      main/inc/lib/internationalization.lib.php
  2. 11
      tests/main/inc/lib/internationalization.lib.test.php

@ -3481,13 +3481,20 @@ function api_get_valid_encodings() {
return array_merge(array('UTF-8'), $result1, $result2, $result3); return array_merge(array('UTF-8'), $result1, $result2, $result3);
} }
function api_detect_encoding($string) { /**
* Detects encoding of plain text.
* @param string $string The input text.
* @param string $language (optional) The language of the input text, provided if it is known.
* @return string Returns the detected encoding.
*/
function api_detect_encoding($string, $language = null) {
// Testing against valid UTF-8 first.
if (api_is_valid_utf8($string)) { if (api_is_valid_utf8($string)) {
return 'UTF-8'; return 'UTF-8';
} }
// "Broken" UTF-8 texts are to be detected as UTF-8.
$result = null; $result = null;
$delta_points_min = LANGUAGE_DETECT_MAX_DELTA; $delta_points_min = LANGUAGE_DETECT_MAX_DELTA;
// Testing non-UTF-8 encodings.
$encodings = api_get_valid_encodings(); $encodings = api_get_valid_encodings();
foreach ($encodings as & $encoding) { foreach ($encodings as & $encoding) {
if (api_is_encoding_supported($encoding) && !api_is_utf8($encoding)) { if (api_is_encoding_supported($encoding) && !api_is_utf8($encoding)) {
@ -3507,6 +3514,25 @@ function api_detect_encoding($string) {
} }
} }
} }
// "Broken" UTF-8 texts are to be detected as UTF-8.
// This functionality is enabled when language of the text is known.
$language = api_purify_language_id((string)$language);
if (!empty($language)) {
$encoding = 'UTF-8';
$result_array = & _api_compare_n_grams(_api_generate_n_grams(api_substr($string, 0, LANGUAGE_DETECT_MAX_LENGTH, $encoding), $encoding), $encoding);
if (!empty($result_array)) {
list($key, $delta_points) = each($result_array);
if ($delta_points < $delta_points_min) {
$pos = strpos($key, ':');
$result_encoding = api_refine_encoding_id(substr($key, $pos + 1));
$result_language = substr($key, 0, $pos);
if ($language == $result_language && api_is_utf8($result_encoding)) {
$delta_points_min = $delta_points;
$result = $encoding;
}
}
}
}
return $result; return $result;
} }

@ -1348,6 +1348,17 @@ class TestInternationalization extends UnitTestCase {
} }
*/ */
/**
 * The second function for testing api_detect_encoding().
 *
 * Passes the language of the text as the second argument and checks that
 * both a valid UTF-8 string and the same string with a stray byte appended
 * are reported as UTF-8.
 */
public function test_api_detect_encoding_2() {
    $language = 'bulgarian';
    $string_utf8 = 'Това е тест на български език'; // Bulgarian language, UTF-8
    // chr(198) is byte 0xC6, a UTF-8 lead byte of a two-byte sequence. Appended
    // without a continuation byte it makes the string intentionally invalid UTF-8.
    $string_utf8_broken = $string_utf8.chr(198);
    $res1 = api_detect_encoding($string_utf8, $language);
    $res2 = api_detect_encoding($string_utf8_broken, $language);
    // Two separate assertions, so that a failure report tells which case is broken.
    $this->assertTrue(api_is_utf8($res1), 'Valid UTF-8 text should be detected as UTF-8.');
    $this->assertTrue(api_is_utf8($res2), 'Broken UTF-8 text should be detected as UTF-8 when its language is known.');
}
public function test_api_str_getcsv() { public function test_api_str_getcsv() {
$strings = array('FirstName;LastName;Email', 'John;Doe;john.doe@mail.com', '"Иван";\\Чолаков;ivan@mail.com'); $strings = array('FirstName;LastName;Email', 'John;Doe;john.doe@mail.com', '"Иван";\\Чолаков;ivan@mail.com');
$expected_results = array(array('FirstName', 'LastName', 'Email'), array('John', 'Doe', 'john.doe@mail.com'), array('Иван', 'Чолаков', 'ivan@mail.com')); $expected_results = array(array('FirstName', 'LastName', 'Email'), array('John', 'Doe', 'john.doe@mail.com'), array('Иван', 'Чолаков', 'ivan@mail.com'));

Loading…
Cancel
Save