, 2009, 2010
 * @author More authors, mentioned in the correpsonding fragments of this source
 *
 * Note: All functions and data structures here are not to be used directly.
 * See the file internationalization.lib.php which contains the "public" API.
 * @package chamilo.library
 */

/**
 * Global variables used by some callback functions
 */
// Both globals start out as null; nothing in the visible part of this file
// reads or writes them afterwards.
// NOTE(review): presumably they are filled in by the public API before the
// callbacks run - confirm against internationalization.lib.php.
$_api_encoding = null;
$_api_collator = null;

/**
 * Appendix to "Language recognition"
 * Based on the publication:
 * W. B. Cavnar and J. M. Trenkle. N-gram-based text categorization.
 * Proceedings of SDAIR-94, 3rd Annual Symposium on Document Analysis
 * and Information Retrieval, 1994.
 * @link http://citeseer.ist.psu.edu/cache/papers/cs/810/http:zSzzSzwww.info.unicaen.frzSz~giguetzSzclassifzSzcavnar_trenkle_ngram.pdf/n-gram-based-text.pdf
 */

/**
 * Appendix to "Date and time formats"
 */

/**
 * Returns an array of translated week days and months, short and normal names.
 * @param string $language (optional) Language identifier. If it is omitted, the current interface language is assumed.
 * @return array Returns a multidimensional array with translated week days and months.
 * The array has the keys 'days_short', 'days_long', 'months_short' and 'months_long';
 * days are listed from Sunday to Saturday, months from January to December.
*/
function &_api_get_day_month_names($language = null) {
    // One entry per language; the function hands out a reference into this cache.
    static $names_by_language = array();

    if (empty($language)) {
        $language = api_get_interface_language();
    }

    // Already translated for this language: nothing more to do.
    if (isset($names_by_language[$language])) {
        return $names_by_language[$language];
    }

    $day_keys = array('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday');
    $month_keys = array('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December');

    // The language variables are named "<Name>Short" and "<Name>Long".
    foreach ($day_keys as $day_key) {
        $names_by_language[$language]['days_short'][] = get_lang($day_key.'Short', '', $language);
        $names_by_language[$language]['days_long'][] = get_lang($day_key.'Long', '', $language);
    }
    foreach ($month_keys as $month_key) {
        $names_by_language[$language]['months_short'][] = get_lang($month_key.'Short', '', $language);
        $names_by_language[$language]['months_long'][] = get_lang($month_key.'Long', '', $language);
    }

    return $names_by_language[$language];
}

/**
 * Returns the person name convention for a given language.
 * @param string $language The input language.
 * @param string $type The type of the requested convention.
 * It may be 'format' for name order convention or 'sort_by' for name sorting convention.
* @return mixed Depending of the requested type,
* the returned result may be string or boolean; null is returned on error;
* For 'format' the result is a pattern built from the placeholders %t, %f and %l
* (upper-case variants %T, %F, %L are produced from upper-case keywords);
* for 'sort_by' the result is FALSE only when the convention says 'last_name', TRUE otherwise.
* Languages without an entry get the defaults '%t %f %l' and TRUE.
*/
function _api_get_person_name_convention($language, $type) {
    // Normalised conventions, built on the first call and reused afterwards.
    static $conventions;
    $language = api_purify_language_id($language);
    if (!isset($conventions)) {
        $file = dirname(__FILE__).'/internationalization_database/name_order_conventions.php';
        if (file_exists($file)) {
            // NOTE(review): the data file is assumed to return an array keyed by language - confirm.
            $conventions = include ($file);
        } else {
            // Minimal built-in table used when the data file is missing.
            $conventions = array(
                'english' => array(
                    'format' => 'title first_name last_name',
                    'sort_by' => 'first_name'
                )
            );
        }
        // Overwrite classic conventions
        $customConventions = api_get_configuration_value('name_order_conventions');
        if (!empty($customConventions)) {
            foreach ($customConventions as $key => $data) {
                $conventions[$key] = $data;
            }
        }
        $search1 = array('FIRST_NAME', 'LAST_NAME', 'TITLE');
        $replacement1 = array('%F', '%L', '%T');
        $search2 = array('first_name', 'last_name', 'title');
        $replacement2 = array('%f', '%l', '%t');
        foreach (array_keys($conventions) as $key) {
            // A malformed (non-array) entry is treated as an empty convention,
            // which the steps below turn into the defaults.
            if (!is_array($conventions[$key])) {
                $conventions[$key] = array();
            }
            // Entries may lack one of the two fields (e.g. a partial custom
            // convention); read them defensively to avoid undefined-key warnings.
            $format = isset($conventions[$key]['format']) ? $conventions[$key]['format'] : '';
            $sort_by = isset($conventions[$key]['sort_by']) ? $conventions[$key]['sort_by'] : '';
            // Upper-case keywords first (case-sensitive), then the remaining
            // keywords in any letter case.
            $format = str_replace($search1, $replacement1, $format);
            $format = str_ireplace($search2, $replacement2, $format);
            // Make sure every placeholder is preceded by a space, then tidy up and validate.
            $conventions[$key]['format'] = _api_validate_person_name_format(_api_clean_person_name(str_replace('%', ' %', $format)));
            $conventions[$key]['sort_by'] = strtolower($sort_by) != 'last_name';
        }
    }
    switch ($type) {
        case 'format':
            // isset() keeps unknown languages from raising a warning; they get the default.
            return (isset($conventions[$language]['format']) && is_string($conventions[$language]['format']))
                ? $conventions[$language]['format']
                : '%t %f %l';
        case 'sort_by':
            return (isset($conventions[$language]['sort_by']) && is_bool($conventions[$language]['sort_by']))
                ? $conventions[$language]['sort_by']
                : true;
    }
    // Unknown $type.
    return null;
}

/**
 * Replaces non-valid formats for person names with the default (English) format.
 * @param string $format The input format to be verified.
 * @return string Returns the same format if it is valid, otherwise returns a valid English format.
*/
function _api_validate_person_name_format($format) {
    // A usable format must mention both the first name (%f) and the last name (%l);
    // the check is case-insensitive, so %F and %L qualify as well.
    if (empty($format) || stripos($format, '%f') === false || stripos($format, '%l') === false) {
        return '%t %f %l';
    }
    return $format;
}

/**
 * Removes leading, trailing and duplicate whitespace and/or commas in a full person name.
 * Cleaning is needed for the cases when not all parts of the name are available or when the name is constructed using a "dirty" pattern.
 * @param string $person_name The input person name.
 * @return string Returns cleaned person name.
 */
function _api_clean_person_name($person_name) {
    // The patterns are applied one after another, in this order:
    //  1. collapse every run of whitespace into a single space;
    //  2. turn ", ," (left by an empty name part) into ", ";
    //  3. collapse repeated commas;
    //  4. strip ALL leading spaces/commas;
    //  5. strip ALL trailing spaces/commas.
    // Steps 4 and 5 use "+" so that a leftover such as ", John" or "Smith, "
    // is cleaned completely rather than by a single character.
    return preg_replace(
        array('/\s+/', '/, ,/', '/,+/', '/^[ ,]+/', '/[ ,]+$/'),
        array(' ', ', ', ',', '', ''),
        $person_name
    );
}

/**
 * Appendix to "Multibyte string conversion functions"
 */

/**
 * Converts a given string from one character encoding to another.
 * This is a thin wrapper around mb_convert_encoding() from the mbstring extension.
 * @param string $string The string being converted. It is taken by reference but is not modified here.
 * @param string $to_encoding The encoding that $string is being converted to.
 * @param string $from_encoding The encoding that $string is being converted from.
 * @return string Returns the converted string.
 */
function _api_convert_encoding(&$string, $to_encoding, $from_encoding) {
    return mb_convert_encoding($string, $to_encoding, $from_encoding);
}

/**
 * Determines the name of the conversion table that corresponds to a given encoding.
 * It is able to deal with some aliases of the encoding.
 * @param string $encoding The given encoding identifier, for example 'WINDOWS-1252'.
 * @return string Returns the name of the corresponding conversion table, for the same example - 'CP1252'.
*/
function _api_get_character_map_name($encoding) {
    // The selector table is loaded from disk once and kept for later calls.
    static $character_map_selector;
    if (!isset($character_map_selector)) {
        $file = dirname(__FILE__).'/internationalization_database/conversion/character_map_selector.php';
        if (file_exists($file)) {
            $character_map_selector = include ($file);
        } else {
            // Without the data file no encoding has a conversion table.
            $character_map_selector = array();
        }
    }
    // An empty string means "no conversion table for this encoding".
    return isset($character_map_selector[$encoding]) ? $character_map_selector[$encoding] : '';
}

/**
 * Takes an UTF-8 string and returns an array of integer values representing the Unicode characters.
 * Astral planes are supported ie. the ints in the output can be > 0xFFFF. Occurrences of the BOM are ignored.
 * Surrogates are not allowed.
 * Every ill-formed sequence (illegal lead octet, non-shortest form, surrogate, value above 0x10FFFF,
 * interrupted or truncated sequence) is reported as U+FFFD. The octet that interrupts an unfinished
 * sequence is decoded again on its own, so no valid input following an error is lost.
 * @param string $string The UTF-8 encoded string. It is taken by reference but is not modified.
 * @return array Returns an array of unicode code points.
 * @author Henri Sivonen, mailto:hsivonen@iki.fi
 * @link http://hsivonen.iki.fi/php-utf8/
 * @author Ivan Tcholakov, August 2009, adaptation for the Dokeos LMS.
 */
function _api_utf8_to_unicode(&$string) {
    $str = (string)$string;
    $state = 0;         // cached expected number of octets after the current octet
                        // until the beginning of the next UTF8 character sequence
    $codepoint = 0;     // cached Unicode character
    $bytes = 1;         // cached expected number of octets in the current sequence
    $result = array();
    $len = api_byte_count($str);
    for ($i = 0; $i < $len; $i++) {
        $byte = ord($str[$i]);
        if ($state == 0) {
            // When state is zero we expect either a US-ASCII character or a multi-octet sequence.
            if (0 == (0x80 & ($byte))) {
                // US-ASCII, pass straight through.
                $result[] = $byte;
                $bytes = 1;
            } else if (0xC0 == (0xE0 & ($byte))) {
                // First octet of 2 octet sequence
                $codepoint = ($byte & 0x1F) << 6;
                $state = 1;
                $bytes = 2;
            } else if (0xE0 == (0xF0 & ($byte))) {
                // First octet of 3 octet sequence
                $codepoint = ($byte & 0x0F) << 12;
                $state = 2;
                $bytes = 3;
            } else if (0xF0 == (0xF8 & ($byte))) {
                // First octet of 4 octet sequence
                $codepoint = ($byte & 0x07) << 18;
                $state = 3;
                $bytes = 4;
            } else if (0xF8 == (0xFC & ($byte))) {
                // First octet of 5 octet sequence.
                // This is illegal because the encoded codepoint must be either
                // (a) not the shortest form or
                // (b) outside the Unicode range of 0-0x10FFFF.
                // Rather than trying to resynchronize, we will carry on until the end
                // of the sequence and let the later error handling code catch it.
                $codepoint = ($byte & 0x03) << 24;
                $state = 4;
                $bytes = 5;
            } else if (0xFC == (0xFE & ($byte))) {
                // First octet of 6 octet sequence, see comments for 5 octet sequence.
                $codepoint = ($byte & 1) << 30;
                $state = 5;
                $bytes = 6;
            } else {
                // Current octet is neither in the US-ASCII range nor a legal first octet of a multi-octet sequence.
                $state = 0;
                $codepoint = 0;
                $bytes = 1;
                $result[] = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER is the general substitute character in the Unicode Standard.
                continue;
            }
        } else {
            // When state is non-zero, we expect a continuation of the multi-octet sequence
            if (0x80 == (0xC0 & ($byte))) {
                // Legal continuation.
                $shift = ($state - 1) * 6;
                $codepoint |= ($byte & 0x0000003F) << $shift;
                // End of the multi-octet sequence. $codepoint now contains the final Unicode codepoint to be output
                if (0 == --$state) {
                    // Check for illegal sequences and codepoints.
                    // From Unicode 3.1, non-shortest form is illegal
                    if (((2 == $bytes) && ($codepoint < 0x0080)) ||
                        ((3 == $bytes) && ($codepoint < 0x0800)) ||
                        ((4 == $bytes) && ($codepoint < 0x10000)) ||
                        (4 < $bytes) ||
                        // From Unicode 3.2, surrogate characters are illegal
                        (($codepoint & 0xFFFFF800) == 0xD800) ||
                        // Codepoints outside the Unicode range are illegal
                        ($codepoint > 0x10FFFF)) {
                        $state = 0;
                        $codepoint = 0;
                        $bytes = 1;
                        $result[] = 0xFFFD;
                        continue;
                    }
                    if (0xFEFF != $codepoint) {
                        // BOM is legal but we don't want to output it
                        $result[] = $codepoint;
                    }
                    // Initialize UTF8 cache
                    $state = 0;
                    $codepoint = 0;
                    $bytes = 1;
                }
            } else {
                // ((0xC0 & (*in) != 0x80) && (state != 0))
                // Incomplete multi-octet sequence.
                $state = 0;
                $codepoint = 0;
                $bytes = 1;
                $result[] = 0xFFFD;
                // The current octet is not part of the broken sequence: step back so
                // that the next iteration decodes it as the start of a new character.
                // This cannot loop forever, because in state 0 every octet is consumed.
                $i--;
            }
        }
    }
    if ($state != 0) {
        // The input ended in the middle of a multi-octet sequence.
        $result[] = 0xFFFD;
    }
    return $result;
}

/**
 * Takes an array of Unicode codepoints and returns a UTF-8 string.
 * @param array $codepoints An array of Unicode codepoints representing a string.
 * @return string Returns a UTF-8 string constructed using the given codepoints.
 */
function _api_utf8_from_unicode($codepoints) {
    // Each code point is encoded on its own and the pieces are concatenated.
    return implode(array_map('_api_utf8_chr', $codepoints));
}

/**
 * Takes a codepoint and returns its correspondent UTF-8 encoded character.
 * Astral planes are supported, ie the integer input can be > 0xFFFF. Occurrences of the BOM are ignored.
 * Surrogates are not allowed.
 * @param int $codepoint The Unicode codepoint.
 * @return string Returns the corresponding UTF-8 character.
 * @author Henri Sivonen, mailto:hsivonen@iki.fi
 * @link http://hsivonen.iki.fi/php-utf8/
 * @author Ivan Tcholakov, 2009, modifications for the Dokeos LMS.
 * @see _api_utf8_from_unicode()
 * This is a UTF-8 aware version of the function chr().
* @link http://php.net/manual/en/function.chr.php
* Values that are not valid code points (negative numbers, surrogates,
* values above 0x10FFFF) yield U+FFFD; the BOM (0xFEFF) yields an empty string.
*/
function _api_utf8_chr($codepoint) {
    // Negative values are not code points. Without this guard they would fall
    // into the 2 byte branch below and produce an invalid byte sequence.
    if ($codepoint < 0) {
        return _api_utf8_chr(0xFFFD); // U+FFFD REPLACEMENT CHARACTER is the general substitute character in the Unicode Standard.
    }
    // ASCII range (including control chars)
    if ($codepoint <= 0x007f) {
        return chr($codepoint);
    }
    // 2 byte sequence
    if ($codepoint <= 0x07ff) {
        return chr(0xc0 | ($codepoint >> 6)) . chr(0x80 | ($codepoint & 0x003f));
    }
    // Byte order mark (skip)
    if ($codepoint == 0xFEFF) {
        // nop -- zap the BOM
        return '';
    }
    // Test for illegal surrogates
    if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF) {
        // found a surrogate
        return _api_utf8_chr(0xFFFD);
    }
    // 3 byte sequence
    if ($codepoint <= 0xffff) {
        return chr(0xe0 | ($codepoint >> 12))
            . chr(0x80 | (($codepoint >> 6) & 0x003f))
            . chr(0x80 | ($codepoint & 0x003f));
    }
    // 4 byte sequence
    if ($codepoint <= 0x10ffff) {
        return chr(0xf0 | ($codepoint >> 18))
            . chr(0x80 | (($codepoint >> 12) & 0x3f))
            . chr(0x80 | (($codepoint >> 6) & 0x3f))
            . chr(0x80 | ($codepoint & 0x3f));
    }
    // out of range
    return _api_utf8_chr(0xFFFD);
}

/**
 * Appendix to "String comparison"
 */

/**
 * A reverse function from php-core function strnatcmp(), performs string comparison in reverse natural (alpha-numerical) order.
 * @param string $string1 The first string.
 * @param string $string2 The second string.
 * @return int Returns 0 if $string1 = $string2; >0 if $string1 < $string2; <0 if $string1 > $string2.
 */
function _api_strnatrcmp($string1, $string2) {
    // Swapping the arguments reverses the order produced by strnatcmp().
    return strnatcmp($string2, $string1);
}

/**
 * ICU locales (accessible through intl extension).
 */

/**
 * Appendix to "Encoding management functions"
 */

/**
 * Sets/Gets internal character encoding of the common string functions within the PHP mbstring extension.
 * @param string $encoding (optional) When this parameter is given, the function sets the internal encoding.
* @return string When $encoding parameter is not given, the function returns the internal encoding.
* When $encoding is given, the result of mb_internal_encoding($encoding) is returned as is.
* Note: This function is used in the global initialization script for setting the internal encoding to the platform's character set.
* @link http://php.net/manual/en/function.mb-internal-encoding
*/
function _api_mb_internal_encoding($encoding = null) {
    if ($encoding === null) {
        // Getter mode. The argument must be left out entirely: before PHP 8.0 an
        // explicit null is converted to an empty encoding name and the call fails.
        return mb_internal_encoding();
    }
    return mb_internal_encoding($encoding);
}

/**
 * Checks whether the specified encoding is supported by the PHP mbstring extension.
 * @param string $encoding The specified encoding.
 * @return bool Returns TRUE when the specified encoding is supported, FALSE otherwise.
 */
function _api_mb_supports($encoding) {
    // Answers are remembered per encoding name.
    static $supported = array();
    if (!isset($supported[$encoding])) {
        if (MBSTRING_INSTALLED) {
            // The name is compared against everything mbstring reports via mb_list_encodings().
            $supported[$encoding] = api_equal_encodings($encoding, mb_list_encodings(), true);
        } else {
            $supported[$encoding] = false;
        }
    }
    return $supported[$encoding];
}

/**
 * Checks whether the specified encoding is supported by the PHP iconv extension.
 * @param string $encoding The specified encoding.
 * @return bool Returns TRUE when the specified encoding is supported, FALSE otherwise.
 */
function _api_iconv_supports($encoding) {
    // Answers are remembered per encoding name.
    static $supported = array();
    if (!isset($supported[$encoding])) {
        if (ICONV_INSTALLED) {
            $enc = api_refine_encoding_id($encoding);
            if ($enc != 'HTML-ENTITIES') {
                // Probe iconv with the characters 32..127: a non-zero length
                // means that iconv accepted the encoding name.
                $test_string = '';
                for ($i = 32; $i < 128; $i++) {
                    $test_string .= chr($i);
                }
                // The call is silenced with @ because the result alone decides the answer.
                $supported[$encoding] = (@iconv_strlen($test_string, $enc)) ? true : false;
            } else {
                // 'HTML-ENTITIES' is never reported as supported by iconv.
                $supported[$encoding] = false;
            }
        } else {
            $supported[$encoding] = false;
        }
    }
    return $supported[$encoding];
}

// This function checks whether the function _api_convert_encoding() (the php-
// implementation) is able to convert from/to a given encoding.
function _api_convert_encoding_supports($encoding) {
    // Memoised answers, keyed by the encoding name exactly as it was passed in.
    static $answers = array();

    if (isset($answers[$encoding])) {
        return $answers[$encoding];
    }

    // An encoding is convertible when a character map is registered for its refined name.
    $map_name = _api_get_character_map_name(api_refine_encoding_id($encoding));
    $answers[$encoding] = ($map_name != '');

    return $answers[$encoding];
}

/**
 * Tells whether the html-entity related functions can work with the given encoding.
 * @param string $encoding The encoding to be checked.
 * @return bool TRUE when the encoding matches one of the listed encodings, FALSE otherwise.
 */
function _api_html_entity_supports($encoding) {
    // Memoised answers, keyed by the encoding name exactly as it was passed in.
    static $answers = array();

    if (isset($answers[$encoding])) {
        return $answers[$encoding];
    }

    // See http://php.net/manual/en/function.htmlentities.php
    $entity_aware_encodings = array(
        'ISO-8859-1',
        'ISO-8859-15',
        'UTF-8',
        'CP866',
        'CP1251',
        'CP1252',
        'KOI8-R',
        'BIG5', '950',
        'GB2312', '936',
        'BIG5-HKSCS',
        'Shift_JIS', 'SJIS', '932',
        'EUC-JP', 'EUCJP'
    );
    $answers[$encoding] = api_equal_encodings($encoding, $entity_aware_encodings);

    return $answers[$encoding];
}