diff --git a/main/glossary/index.php b/main/glossary/index.php index e774e76f5d..c7c05db623 100644 --- a/main/glossary/index.php +++ b/main/glossary/index.php @@ -80,7 +80,7 @@ if (isset($_GET['action']) && $_GET['action'] == 'export') { $list[] = array ($line[0], $line[1]); } $filename = 'glossary_course_'.api_get_course_id(); - Export::export_table_csv($list,$filename); + Export::export_table_csv_utf8($list, $filename); } Display::display_header($tool_name); @@ -214,18 +214,17 @@ if (api_is_allowed_to_edit(null, true)) { } } } - $data = Import::csv_to_array($_FILES['file']['tmp_name']); - - if (!empty($data)) { - $good = 0; - $bad = 0; - foreach($data as $item) { - if (GlossaryManager::save_glossary(array('glossary_title' => $item['term'], 'glossary_comment' => $item['definition']), false)) - $good++; - else - $bad++; - } - } + //$data = Import::csv_to_array($_FILES['file']['tmp_name']); + $data = Import::csv_reader($_FILES['file']['tmp_name']); + $good = 0; + $bad = 0; + foreach($data as $item) { + if (GlossaryManager::save_glossary(array('glossary_title' => $item['term'], 'glossary_comment' => $item['definition']), false)) + $good++; + else + $bad++; + } + Display::display_confirmation_message (get_lang ("TermsImported") . ':' . 
$good); if ($bad) diff --git a/main/inc/lib/autoload.class.php b/main/inc/lib/autoload.class.php index 4e229d43a3..5ed846305e 100644 --- a/main/inc/lib/autoload.class.php +++ b/main/inc/lib/autoload.class.php @@ -102,6 +102,7 @@ class Autoload $result['ClosureCompiler'] = '/main/inc/lib/closure_compiler.class.php'; $result['CodeUtilities'] = '/main/inc/lib/code_utilities.class.php'; $result['ConditionalLogin'] = '/main/inc/lib/conditional_login.class.php'; + $result['Converter'] = '/main/inc/lib/system/text/converter.class.php'; $result['Course'] = '/main/coursecopy/classes/Course.class.php'; $result['CourseArchiver'] = '/main/coursecopy/classes/CourseArchiver.class.php'; $result['CourseBuilder'] = '/main/coursecopy/classes/CourseBuilder.class.php'; @@ -116,6 +117,8 @@ class Autoload $result['CourseRestorer'] = '/main/coursecopy/classes/CourseRestorer.class.php'; $result['CourseSelectForm'] = '/main/coursecopy/classes/CourseSelectForm.class.php'; $result['CourseSession'] = '/main/coursecopy/classes/CourseSession.class.php'; + $result['CsvReader'] = '/main/inc/lib/system/io/csv_reader.class.php'; + $result['CsvWriter'] = '/main/inc/lib/system/io/csv_writer.class.php'; $result['CustomPages'] = '/main/inc/lib/custompages.lib.php'; $result['DashboardManager'] = '/main/inc/lib/dashboard.lib.php'; $result['DataForm'] = '/main/gradebook/lib/fe/dataform.class.php'; @@ -128,6 +131,8 @@ class Autoload $result['DokeosIndexer'] = '/main/inc/lib/search/DokeosIndexer.class.php'; $result['DropboxLink'] = '/main/gradebook/lib/be/dropboxlink.class.php'; $result['DummyCourseCreator'] = '/main/coursecopy/classes/DummyCourseCreator.class.php'; + $result['Encoding'] = '/main/inc/lib/system/text/encoding.class.php'; + $result['EncodingConverter'] = '/main/inc/lib/system/text/encoding_converter.class.php'; $result['EntityGenerator'] = '/main/inc/lib/tools/entity_generator.class.php'; $result['EvalForm'] = '/main/gradebook/lib/fe/evalform.class.php'; $result['EvalLink'] = 
'/main/gradebook/lib/be/evallink.class.php'; @@ -138,6 +143,8 @@ class Autoload $result['ExerciseResult'] = '/main/exercice/exercise_result.class.php'; $result['ExerciseShowFunctions'] = '/main/inc/lib/exercise_show_functions.lib.php'; $result['FileManager'] = '/main/inc/lib/fileManage.lib.php'; + $result['FileReader'] = '/main/inc/lib/system/io/file_reader.class.php'; + $result['FileWriter'] = '/main/inc/lib/system/io/file_writer.class.php'; $result['FillBlanks'] = '/main/exercice/fill_blanks.class.php'; $result['FlatViewDataGenerator'] = '/main/gradebook/lib/flatview_data_generator.class.php'; $result['FlatViewTable'] = '/main/gradebook/lib/fe/flatviewtable.class.php'; @@ -260,6 +267,7 @@ class Autoload $result['MyHorBar'] = '/main/inc/lib/pchart/MyHorBar.class.php'; $result['MySpace'] = '/main/mySpace/myspace.lib.php'; $result['Nanogong'] = '/main/inc/lib/nanogong.lib.php'; + $result['NewMediaForm'] = '/main/media/lib/new_media_form.class.php'; $result['NotebookManager'] = '/main/inc/lib/notebook.lib.php'; $result['Notification'] = '/main/inc/lib/notification.lib.php'; $result['OLE'] = '/main/inc/lib/pear/OLE/OLE.php'; @@ -390,6 +398,9 @@ class Autoload $result['UserManager'] = '/main/inc/lib/usermanager.lib.php'; $result['UserStore'] = '/main/auth/shibboleth/app/model/user.class.php'; $result['UserTable'] = '/main/gradebook/lib/fe/usertable.class.php'; + $result['Utf8'] = '/main/inc/lib/system/text/utf8.class.php'; + $result['Utf8Decoder'] = '/main/inc/lib/system/text/utf8_decoder.class.php'; + $result['Utf8Encoder'] = '/main/inc/lib/system/text/utf8_encoder.class.php'; $result['Wiki'] = '/main/coursecopy/classes/wiki.class.php'; $result['XapianIndexer'] = '/main/inc/lib/search/xapian/XapianIndexer.class.php'; $result['ZombieManager'] = '/main/inc/lib/zombie/zombie_manager.class.php'; @@ -449,6 +460,7 @@ class Autoload $result['xhtdoc'] = '/main/inc/lib/xht.lib.php'; $result['xmddoc'] = '/main/inc/lib/xmd.lib.php'; + return $result; } diff --git 
a/main/inc/lib/chamilo.class.php b/main/inc/lib/chamilo.class.php index 72b61644a4..9569344ed1 100644 --- a/main/inc/lib/chamilo.class.php +++ b/main/inc/lib/chamilo.class.php @@ -9,7 +9,7 @@ */ class Chamilo { - + public static function name() { //@todo: add version @@ -25,7 +25,6 @@ class Chamilo { return api_get_setting('server_type') == 'production'; } - /** * Returns a full url from local/absolute path and parameters. @@ -39,12 +38,12 @@ class Chamilo { return Uri::url($path, $params, $html); } - + public static function here($params = array(), $html = true) { return Uri::here($params, $html); } - + /** * Application web root */ @@ -62,12 +61,18 @@ class Chamilo { return api_get_path(SYS_PATH); } - + public static function root_courses() { return api_get_path(SYS_COURSE_PATH); } + public static function temp($ext = '') + { + $ext = $ext ? '.' . $ext : ''; + return api_get_path(SYS_ARCHIVE_PATH) . uniqid() . $ext; + } + public static function path($path = '') { $root = self::root(); diff --git a/main/inc/lib/export.lib.inc.php b/main/inc/lib/export.lib.inc.php index e6cb33b894..08cb3cdd49 100644 --- a/main/inc/lib/export.lib.inc.php +++ b/main/inc/lib/export.lib.inc.php @@ -25,9 +25,8 @@ class Export { } /** - * Export tabular data to CSV-file - * @param array $data - * @param string $filename + * + * @deprecated use export_table_csv_utf8 instead */ public static function export_table_csv ($data, $filename = 'export') { $file = api_get_path(SYS_ARCHIVE_PATH).uniqid('').'.csv'; @@ -48,6 +47,28 @@ class Export { DocumentManager :: file_send_for_download($file, true, $filename.'.csv'); return false; } + + /** + * Export tabular data to CSV-file + * @param array $data + * @param string $filename + */ + public static function export_table_csv_utf8 ($data, $filename = 'export') { + if(empty($data)){ + return false; + } + $path = Chamilo::temp(); + $converter = new Utf8Encoder(null, true); + $file = FileWriter::create($path, $converter); + $file = 
CsvWriter::create($file); + foreach ($data as $row) { + $file->put($row); + } + $file->close(); + DocumentManager :: file_send_for_download($path, true, $filename.'.csv'); + unlink($path); + return false; + } /** * Export tabular data to XLS-file diff --git a/main/inc/lib/import.lib.php b/main/inc/lib/import.lib.php old mode 100755 new mode 100644 index 9a634876a9..d9b02c79a4 --- a/main/inc/lib/import.lib.php +++ b/main/inc/lib/import.lib.php @@ -10,6 +10,11 @@ * @package chamilo.library */ class Import { + + static function csv_reader($path) + { + return new CsvReader($path); + } /** * Reads a CSV-file into an array. The first line of the CSV-file should contain the array-keys. @@ -27,6 +32,9 @@ class Import { * ... * @param string $filename The path to the CSV-file which should be imported. * @return array Returns an array (in the system encoding) that contains all data from the CSV-file. + * + * + * @deprecated use Import::csv_reader() instead */ function csv_to_array($filename) { $result = array(); diff --git a/main/inc/lib/internationalization.lib.php b/main/inc/lib/internationalization.lib.php index ac708bd07b..21c1ede19d 100644 --- a/main/inc/lib/internationalization.lib.php +++ b/main/inc/lib/internationalization.lib.php @@ -3676,171 +3676,11 @@ function api_detect_encoding($string, $language = null) { /** * Checks a string for UTF-8 validity. - * @param string $string The string to be tested/validated. - * @return bool Returns TRUE when the tested string is valid UTF-8 one, FALSE othewise. - * @link http://en.wikipedia.org/wiki/UTF-8 + * + * @deprecated Use Encoding::utf8()->is_valid() instead */ function api_is_valid_utf8(&$string) { - - //return @mb_detect_encoding($string, 'UTF-8', true) == 'UTF-8' ? true : false; - // Ivan Tcholakov, 05-OCT-2008: I do not trust mb_detect_encoding(). I have - // found a string with a single cyrillic letter (single byte), that is - // wrongly detected as UTF-8. Possibly, there would be problems with other - // languages too. 
An alternative implementation will be used. - - $str = (string)$string; - $len = api_byte_count($str); - $i = 0; - while ($i < $len) { - $byte1 = ord($str[$i++]); // Here the current character begins. Its size is - // determined by the senior bits in the first byte. - - if (($byte1 & 0x80) == 0x00) { // 0xxxxxxx - // & - // 10000000 - // -------- - // 00000000 - // This is s valid character and it contains a single byte. - } - - elseif (($byte1 & 0xE0) == 0xC0) { // 110xxxxx 10xxxxxx - // & & - // 11100000 11000000 - // -------- -------- - // 11000000 10000000 - // The character contains two bytes. - if ($i == $len) { - return false; // Here the string ends unexpectedly. - } - - if (!((ord($str[$i++]) & 0xC0) == 0x80)) - return false; // Invalid second byte, invalid string. - } - - elseif(($byte1 & 0xF0) == 0xE0) { // 1110xxxx 10xxxxxx 10xxxxxx - // & & & - // 11110000 11000000 11000000 - // -------- -------- -------- - // 11100000 10000000 10000000 - // This is a character of three bytes. - if ($i == $len) { - return false; // Unexpected end of the string. - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; // Invalid second byte. - } - if ($i == $len) { - return false; // Unexpected end of the string. - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; // Invalid third byte, invalid string. - } - } - - elseif(($byte1 & 0xF8) == 0xF0) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - // & & & & - // 11111000 11000000 11000000 11000000 - // -------- -------- -------- -------- - // 11110000 10000000 10000000 10000000 - // This is a character of four bytes. 
- if ($i == $len) { - return false; - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; - } - if ($i == $len) { - return false; - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; - } - if ($i == $len) { - return false; - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; - } - } - - elseif(($byte1 & 0xFC) == 0xF8) { // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - // & & & & & - // 11111100 11000000 11000000 11000000 11000000 - // -------- -------- -------- -------- -------- - // 11111000 10000000 10000000 10000000 10000000 - // This is a character of five bytes. - if ($i == $len) { - return false; - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; - } - if ($i == $len) { - return false; - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; - } - if ($i == $len) { - return false; - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; - } - if ($i == $len) { - return false; - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; - } - } - - elseif(($byte1 & 0xFE) == 0xFC) { // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - // & & & & & & - // 11111110 11000000 11000000 11000000 11000000 11000000 - // -------- -------- -------- -------- -------- -------- - // 11111100 10000000 10000000 10000000 10000000 10000000 - // This is a character of six bytes. - if ($i == $len) { - return false; - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; - } - if ($i == $len) { - return false; - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; - } - if ($i == $len) { - return false; - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; - } - if ($i == $len) { - return false; - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; - } - if ($i == $len) { - return false; - } - if (!((ord($str[$i++]) & 0xC0) == 0x80)) { - return false; - } - } - - else { - return false; // In any other case the character is invalid. 
- } - // Here the current character is valid, it - // matches to some of the cases above. - // The next character is to be examinated. - } - return true; // Empty strings are valid too. + return Encoding::utf8()->is_valid($string); } /** diff --git a/main/inc/lib/system/io/csv_reader.class.php b/main/inc/lib/system/io/csv_reader.class.php new file mode 100644 index 0000000000..5a0ffd9265 --- /dev/null +++ b/main/inc/lib/system/io/csv_reader.class.php @@ -0,0 +1,170 @@ +$value){ + * echo "$key : $value"; + * } + * } + * + * + * + * @copyright (c) 2012 University of Geneva + * @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html + * @author Laurent Opprecht + */ +class CsvReader implements Iterator +{ + + /** + * + * @param string|FileReader $stream + * @param string $delimiter + * @param string $enclosure + * @return CsvReader + */ + static function create($stream, $delimiter = ';', $enclosure = '"') + { + return new self($stream, $delimiter, $enclosure); + } + + protected $stream = null; + protected $headers = array(); + protected $delimiter = ''; + protected $enclosure = ''; + protected $current = false; + protected $index = -1; + + function __construct($stream, $delimiter = ';', $enclosure = '"') + { + $this->stream = $stream; + $this->delimiter = $delimiter ? substr($delimiter, 0, 1) : ';'; + $this->enclosure = $enclosure ? 
substr($enclosure, 0, 1) : '"'; + } + + function get_delimiter() + { + return $this->delimiter; + } + + function get_enclosure() + { + return $this->enclosure; + } + + function headers() + { + return $this->headers; + } + + /** + * @return FileReader + */ + function stream() + { + if (is_string($this->stream)) { + $this->stream = new FileReader($this->stream); + } + return $this->stream; + } + + protected function decode($line) + { + if (empty($line)) { + return array(); + } + $data = api_str_getcsv($line, $this->get_delimiter(), $this->get_enclosure()); + if ($this->headers) { + $result = array(); + foreach ($data as $index => $value) { + $key = isset($this->headers[$index]) ? $this->headers[$index] : false; + if ($key) { + $result[$key] = $value; + } else { + $result[] = $value; + } + } + } else { + $result = $data; + } + return $result; + } + + /** + * Returns the next non empty line + * + * @return boolean|string + */ + protected function next_line() + { + while (true) { + $line = $this->stream()->next(); + if ($line === false) { + return false; + } else if ($line) { + return $line; + } + } + return false; + } + + public function current() + { + return $this->current; + } + + public function key() + { + return $this->index; + } + + public function next() + { + if (empty($this->headers)) { + $line = $this->next_line(); + $this->headers = $this->decode($line); + } + $line = $this->next_line(); + if ($line) { + $this->current = $this->decode($line); + $this->index++; + } else { + $this->current = false; + } + return $this->current; + } + + public function rewind() + { + $this->stream()->rewind(); + $line = $this->stream()->current(); + if (empty($line)) { + $line = $this->next_line(); + } + $this->headers = $this->decode($line); + $this->index = -1; + $this->next(); + } + + public function valid() + { + return $this->current !== false; + } + + function __clone() + { + $this->stream()->rewind(); + $this->current = false; + $this->index = -1; + $this->headers = 
array(); + } + +} \ No newline at end of file diff --git a/main/inc/lib/system/io/csv_writer.class.php b/main/inc/lib/system/io/csv_writer.class.php new file mode 100644 index 0000000000..f822e468f4 --- /dev/null +++ b/main/inc/lib/system/io/csv_writer.class.php @@ -0,0 +1,93 @@ +put($headers); + * $writer->put($line_1); + * $writer->put($line_2); + * + * @copyright (c) 2012 University of Geneva + * @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html + * @author Laurent Opprecht + */ +class CsvWriter +{ + + /** + * + * @param string|object $stream + * @return FileWriter + */ + static function create($stream, $delimiter = ';', $enclosure = '"') + { + return new self($stream, $delimiter, $enclosure); + } + + protected $stream = null; + protected $delimiter = ''; + protected $enclosure = ''; + + function __construct($stream, $delimiter = ';', $enclosure = '"') + { + $this->stream = $stream; + $this->delimiter = $delimiter ? substr($delimiter, 0, 1) : ';';; + $this->enclosure = $enclosure ? substr($enclosure, 0, 1) : '"';; + } + + function get_delimiter() + { + return $this->delimiter; + } + + function get_enclosure() + { + return $this->enclosure; + } + + /** + * + * @return FileWriter + */ + protected function stream() + { + if (is_string($this->stream)) { + $this->stream = new FileWriter($this->stream); + } + return $this->stream; + } + + function write($items) + { + $this->put($items); + } + + function writeln($items) + { + $this->put($items); + } + + function put($items) + { + $enclosure = $this->enclosure; + $fields = array(); + foreach ($items as $item) { + $fields[] = $enclosure . str_replace($enclosure, $enclosure . $enclosure, $item) . 
$enclosure; + } + + $delimiter = $this->delimiter; + $line = implode($delimiter, $fields); + $this->stream()->writeln($line); + } + + function close() + { + if (is_object($this->stream)) { + $this->stream->close(); + } + $this->stream = null; + } + +} \ No newline at end of file diff --git a/main/inc/lib/system/io/file_reader.class.php b/main/inc/lib/system/io/file_reader.class.php new file mode 100644 index 0000000000..0dc9bf399d --- /dev/null +++ b/main/inc/lib/system/io/file_reader.class.php @@ -0,0 +1,182 @@ + + */ +class FileReader implements Iterator +{ + + const EOL = "\n"; + + /** + * + * @param string $path + * @return FileReader + */ + static function create($path, $converter = null) + { + return new self($path, $converter); + } + + /** + * Returns the file encoding + * + * @return Encoding + */ + static function detect_encoding($path) + { + $abstract = array(); + // We assume that 200 lines are enough for encoding detection. + // here we must get at the raw data so we don't use other functions + // it's not possible to read x chars as this would not be safe with utf + // (chars may be split in the middle) + $handle = fopen($path, 'r'); + + $i = 0; + while (($line = fgets($handle)) !== false && $i < 200) { + $i++; + $abstract[] = $line; + } + fclose($handle); + $abstract = implode($abstract); + return Encoding::detect_encoding($abstract); + } + + protected $path = ''; + protected $handle = null; + protected $current = false; + protected $index = -1; + protected $converter = null; + + function __construct($path, $converter = null) + { + if (empty($converter)) { + $encoding = self::detect_encoding($path); + $converter = $encoding->decoder(); + } + $this->path = $path; + $this->converter = $converter; + } + + /** + * + * @return Converter + */ + function get_converter() + { + return $this->converter; + } + + function handle() + { + if (is_null($this->handle)) { + $this->handle = fopen($this->path, 'r'); + } + return $this->handle; + } + + /** + * Read at 
most $count lines. + * + * @param int $count + * @return array + */ + function read_lines($count) + { + $result = array(); /* start from an empty list so a file with no lines yields array() instead of an undefined variable */ + $i = 0; + foreach ($this as $line) { + if ($i >= $count) { + return $result; + } + $i++; + $result[] = $line; + } + return $result; + } + + function read_line() + { + return $this->next(); + } + + function close() + { + if (is_resource($this->handle)) { + fclose($this->handle); + } + $this->handle = null; + } + + protected function convert($text) + { + return $this->converter->convert($text); + } + + public function current() + { + return $this->current; + } + + public function key() + { + return $this->index; + } + + public function next() + { + $handle = $this->handle(); + if($handle === false) + { + $this->current = false; + return false; + } + $line = fgets($handle); + if ($line !== false) { + $line = rtrim($line, "\r\n"); + $line = $this->convert($line); + $this->index++; + } + $this->current = $line; + return $this->current; + } + + public function rewind() + { + if (is_callable(array($this->converter, 'reset'))) { $this->converter->reset(); } /* only stateful converters such as Utf8Decoder define reset(); Converter and EncodingConverter do not */ + if ($handle = $this->handle()) { + rewind($handle); + } + $this->current = false; + $this->index = -1; + $this->next(); + } + + public function valid() + { + return $this->current !== false; + } + + function __clone() + { + $this->handle = null; + $this->current = false; + $this->index = -1; + if (is_callable(array($this->converter, 'reset'))) { $this->converter->reset(); } /* same guard as in rewind() */ + } + +} \ No newline at end of file diff --git a/main/inc/lib/system/io/file_writer.class.php b/main/inc/lib/system/io/file_writer.class.php new file mode 100644 index 0000000000..0a56aa0e72 --- /dev/null +++ b/main/inc/lib/system/io/file_writer.class.php @@ -0,0 +1,81 @@ + + */ +class FileWriter +{ + + /** + * + * @param string $path + * @param Converter $converter + * @return FileWriter + */ + static function create($path, $converter = null) + { + return new self($path, $converter); + } + + const EOL = "\n"; + + protected $path = ''; + protected $handle = null; + protected $converter = null; + + /** + * + * @param string $path + * @param 
Encoding $encoding + */ + function __construct($path, $converter = null) + { + $this->path = $path; + $this->converter = $converter ? $converter : Encoding::utf8()->encoder(); + } + + /** + * + * @return Converter + */ + function get_converter() + { + return $this->converter; + } + + protected function handle() + { + if (is_null($this->handle)) { + $this->handle = fopen($this->path, 'a+'); + } + return $this->handle; + } + + function write($text) + { + fwrite($this->handle(), $this->convert($text)); + } + + function writeln($text) + { + fwrite($this->handle(), $this->convert($text) . self::EOL); + } + + function close() + { + if (is_resource($this->handle)) { + fclose($this->handle); + } + $this->handle = null; + } + + protected function convert($text) + { + return $this->converter->convert($text); + } + +} \ No newline at end of file diff --git a/main/inc/lib/system/text/converter.class.php b/main/inc/lib/system/text/converter.class.php new file mode 100644 index 0000000000..3579d3f700 --- /dev/null +++ b/main/inc/lib/system/text/converter.class.php @@ -0,0 +1,33 @@ + + */ +class Converter +{ + + /** + * Identity converter. Returns the string with no transformations. + * + * @return Converter + */ + public static function identity() + { + static $result = null; + if(empty($result)) + { + $result = new self(); + } + return $result; + } + + + function convert($string) + { + return $string; + } +} \ No newline at end of file diff --git a/main/inc/lib/system/text/encoding.class.php b/main/inc/lib/system/text/encoding.class.php new file mode 100644 index 0000000000..f72c0eb6b5 --- /dev/null +++ b/main/inc/lib/system/text/encoding.class.php @@ -0,0 +1,158 @@ +decoder(); + * $decoder->convert('text'); + * + * The system encoding is the platform/system/default encoding. This defaults to + * UTF8 but can be changed: + * + * Encoding::system('name'); + * + * Note that Encoding returns to its name when converted to a string. 
As such it + * can be used in places where a string is expected: + * + * $utf8 = Encoding::Utf8(); + * echo $utf8; + * + * @copyright (c) 2012 University of Geneva + * @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html + * @author Laurent Opprecht + */ +class Encoding +{ + + private static $system = null; + + /** + * Returns encoding for $name. + * + * @param string $name + * @return Encoding + */ + public static function get($name) + { + if (is_object($name)) { + return $name; + } else if (Encoding::utf8()->is($name)) { + return self::utf8(); + } else { + return new self($name); + } + } + + /** + * Returns the Utf8 encoding. + * + * @return Utf8 + */ + public static function utf8() + { + return Utf8::instance(); + } + + /** + * Returns/set the system/default encoding. + * + * @return Encoding + */ + public static function system($value = null) + { + if (is_object($value)) { + self::$system = $value; + } else if (is_string($value)) { + self::$system = self::get($value); + } + + return self::$system ? self::$system : self::utf8(); + } + + /** + * Detect encoding from an abstract. + * + * @param string $abstract + * @return Encoding + */ + public static function detect_encoding($abstract) + { + $encoding_name = api_detect_encoding($abstract); + return self::get($encoding_name); + } + + protected $name = ''; + + protected function __construct($name = '') + { + $this->name = $name; + } + + /** + * The name of the encoding + * + * @return string + */ + function name() + { + return $this->name; + } + + /** + * The Byte Order Mark. + * + * @see http://en.wikipedia.org/wiki/Byte_order_mark + * @return string + */ + function bom() + { + return ''; + } + + /** + * Returns a decoder that convert encoding to another encoding. + * + * @param string|Encoder $to Encoding to convert to, defaults to system encoding + * @return Converter + */ + public function decoder($to = null) + { + $from = $this; + $to = $to ? 
$to : Encoding::system(); + return EncodingConverter::create($from, $to); + } + + /** + * Returns an encoder that convert from another encoding to this encoding. + * + * @param string|Encoder $from Encoding to convert from, defaults to system encoding. + * @return Converter + */ + public function encoder($from = null) + { + $from = $from ? $from : Encoding::system(); + $to = $this; + return EncodingConverter::create($from, $to); + } + + function __toString() + { + return $this->name(); + } + +} \ No newline at end of file diff --git a/main/inc/lib/system/text/encoding_converter.class.php b/main/inc/lib/system/text/encoding_converter.class.php new file mode 100644 index 0000000000..be240dad85 --- /dev/null +++ b/main/inc/lib/system/text/encoding_converter.class.php @@ -0,0 +1,66 @@ +convert($text); + * + * Note that the create function returns an identity converter if from and to + * encodings are the same, which is why the constructor is protected. + * + * @copyright (c) 2012 University of Geneva + * @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html + * @author Laurent Opprecht + */ +class EncodingConverter extends Converter +{ + + /** + * Returns a converter from $from_encoding to $to_encoding (names are compared case-insensitively). + * @param string $from_encoding + * @param string $to_encoding + * + * @return Converter The shared identity converter when both names match, a new EncodingConverter otherwise + */ + public static function create($from_encoding, $to_encoding) + { + $from_encoding = (string) $from_encoding; + $to_encoding = (string) $to_encoding; + if (strtolower($from_encoding) == strtolower($to_encoding)) { + return Converter::identity(); + } else { + return new self($from_encoding, $to_encoding); /* hand the new converter back; it was previously built and dropped, so callers received null */ + } + } + + protected $from_encoding; + protected $to_encoding; + + protected function __construct($from_encoding, $to_encoding) + { + $this->from_encoding = $from_encoding; + $this->to_encoding = $to_encoding; + } + + function from_encoding() + { + return $this->from_encoding; + } + + function to_encoding() + { + return $this->to_encoding; + } + + function convert($string) + { + $from = $this->from_encoding; 
+ $to = $this->to_encoding; + if ($from == $to) { + return $string; + } + return api_convert_encoding($string, $to, $from); /* return the conversion result; it was previously discarded, so convert() yielded null on this path */ + } + +} \ No newline at end of file diff --git a/main/inc/lib/system/text/utf8.class.php b/main/inc/lib/system/text/utf8.class.php new file mode 100644 index 0000000000..8a7c1ee950 --- /dev/null +++ b/main/inc/lib/system/text/utf8.class.php @@ -0,0 +1,287 @@ + for the Univesity of Geneva + * @author More authors, mentioned in the correpsonding fragments of this source. + */ +class Utf8 extends Encoding +{ + + const PATTERN_NOT_VISIBLE_CHARS = '/[^[:print:]-]/'; //Visible characters and the space character + + /** + * @see http://en.wikipedia.org/wiki/Byte_order_mark + */ + const BOM = "\xEF\xBB\xBF"; + const NAME = 'UTF-8'; + + /** + * + * @return Utf8 + */ + public static function instance() + { + static $result = null; + if (empty($result)) { + $result = new self(); + } + return $result; + } + + /** + * Returns true if encoding is UTF8. + * + * @param string|Encoding $encoding + * @return bool + */ + function is($encoding) + { + $encoding = (string) $encoding; + return strtolower($encoding) == strtolower(self::NAME); + } + + protected function __construct() + { + parent::__construct(self::NAME); + } + + function name() + { + return self::NAME; + } + + function bom() + { + return self::BOM; + } + + /** + * Returns the hexa decimal representation of an utf8 string. Usefull to understand + * what is going on - not printable chars, rare patterns such as e' for é, etc. 
+     *
+     * Dumps a UTF-8 string as lowercase hexadecimal: the bytes of each
+     * character are written as one group followed by a single space
+     * (e.g. "A" followed by U+00E9 gives "41 c3a9 ").
+     *
+     * @param string $text UTF-8 encoded text
+     * @return string hexadecimal dump; empty string for empty input
+     */
+    public function to_hex($text)
+    {
+        $result = '';
+
+        // The encoding is passed explicitly to every mb_* call instead of
+        // calling mb_internal_encoding(), which would silently change the
+        // process-wide mbstring setting for all code running afterwards.
+        for ($i = 0, $n = mb_strlen($text, 'UTF-8'); $i < $n; $i++) {
+            $char = mb_substr($text, $i, 1, 'UTF-8');
+            // bin2hex() emits two lowercase hex digits per byte.
+            $result .= bin2hex($char) . ' ';
+        }
+        return $result;
+    }
+
+    /**
+     * Removes a leading byte order mark (self::BOM) from a string.
+     *
+     * @param string $text
+     * @return string $text without its leading BOM; $text unchanged when it
+     *                does not start with the BOM. Always a string, including
+     *                when $text consists of the BOM only.
+     */
+    public function trim($text)
+    {
+        $bom = self::BOM;
+        $bom_length = strlen($bom);
+
+        if (strncmp($text, $bom, $bom_length) !== 0) {
+            return $text;
+        }
+
+        // Before PHP 8.0 substr() returns false, not '', when the start offset
+        // equals the string length, i.e. when $text is exactly the BOM.
+        return (string) substr($text, $bom_length);
+    }
+
+    /**
+     * Checks a string for UTF-8 well-formedness at the byte-pattern level.
+     *
+     * Only the lead-byte / continuation-byte structure is verified. Overlong
+     * forms and surrogate code points are not rejected, and the legacy 5- and
+     * 6-byte sequences are accepted.
+     *
+     * @param string $string The string to be tested.
+     * @return bool Returns TRUE when the tested string is valid UTF-8, FALSE otherwise.
+     * @link http://en.wikipedia.org/wiki/UTF-8
+     * @author see internationalization.lib.php
+     */
+    public static function is_valid(&$string)
+    {
+        // Ivan Tcholakov, 05-OCT-2008: mb_detect_encoding() is not trusted here.
+        // A string with a single cyrillic letter (single byte) was wrongly
+        // detected as UTF-8, so the bytes are inspected by hand instead.
+
+        $str = (string) $string;
+        $len = api_byte_count($str);
+        $i = 0;
+        while ($i < $len) {
+            // Here the current character begins. Its size is determined by
+            // the senior bits of its first byte.
+            $byte1 = ord($str[$i++]);
+
+            if (($byte1 & 0x80) == 0x00) {          // 0xxxxxxx: single byte
+                $continuation_bytes = 0;
+            } elseif (($byte1 & 0xE0) == 0xC0) {    // 110xxxxx: two bytes
+                $continuation_bytes = 1;
+            } elseif (($byte1 & 0xF0) == 0xE0) {    // 1110xxxx: three bytes
+                $continuation_bytes = 2;
+            } elseif (($byte1 & 0xF8) == 0xF0) {    // 11110xxx: four bytes
+                $continuation_bytes = 3;
+            } elseif (($byte1 & 0xFC) == 0xF8) {    // 111110xx: five bytes
+                $continuation_bytes = 4;
+            } elseif (($byte1 & 0xFE) == 0xFC) {    // 1111110x: six bytes
+                $continuation_bytes = 5;
+            } else {
+                return false; // In any other case the lead byte is invalid.
+            }
+
+            // Every following byte of the character must look like 10xxxxxx.
+            for (; $continuation_bytes > 0; $continuation_bytes--) {
+                if ($i == $len) {
+                    return false; // Here the string ends unexpectedly.
+                }
+                if ((ord($str[$i++]) & 0xC0) != 0x80) {
+                    return false; // Invalid continuation byte, invalid string.
+                }
+            }
+            // Here the current character is valid.
+            // The next character is to be examined.
+        }
+        return true; // Empty strings are valid too.
+    }
+
+    /**
+     * Creates a converter from UTF-8 to another encoding.
+     *
+     * @param string|null $to target encoding; any empty value falls back to
+     *                        Encoding::system()
+     * @return Utf8Decoder
+     */
+    public function decoder($to = null)
+    {
+        $to = $to ? $to : Encoding::system();
+        return new Utf8Decoder($to);
+    }
+
+    /**
+     * Creates a converter from another encoding to UTF-8.
+     *
+     * @param string|null $from source encoding; any empty value falls back to
+     *                          Encoding::system()
+     * @return Utf8Encoder
+     */
+    public function encoder($from = null)
+    {
+        $from = $from ? $from : Encoding::system();
+        return new Utf8Encoder($from);
+    }
+
+}
\ No newline at end of file
diff --git a/main/inc/lib/system/text/utf8_decoder.class.php b/main/inc/lib/system/text/utf8_decoder.class.php
new file mode 100644
index 0000000000..04d5cd7857
--- /dev/null
+++ b/main/inc/lib/system/text/utf8_decoder.class.php
@@ -0,0 +1,54 @@
+<?php
+
+/**
+ * Converts UTF-8 text, fed in one or several chunks, to another encoding.
+ * A byte order mark at the start of the first chunk is removed.
+ */
+class Utf8Decoder extends Converter
+{
+
+    // False until the first chunk has been converted since the last reset().
+    protected $started = false;
+    protected $to_encoding;
+    // Object built by EncodingConverter::create(); does the actual conversion.
+    protected $encoding_converter;
+
+    /**
+     * @param string|null $to_encoding target encoding; any empty value falls
+     *                                 back to Encoding::system()
+     */
+    function __construct($to_encoding = null)
+    {
+        $this->to_encoding = $to_encoding ? $to_encoding : Encoding::system();
+        $this->encoding_converter = EncodingConverter::create(Utf8::NAME, $this->to_encoding);
+        $this->reset();
+    }
+
+    function from_encoding()
+    {
+        return Utf8::NAME;
+    }
+
+    function to_encoding()
+    {
+        return $this->to_encoding;
+    }
+
+    /**
+     * Marks the stream as not started, so the next chunk passed to convert()
+     * is again treated as the first one and has its BOM trimmed.
+     */
+    function reset()
+    {
+        $this->started = false;
+    }
+
+    /**
+     * Converts one chunk of UTF-8 text to the target encoding.
+     *
+     * @param string $string UTF-8 chunk
+     * @return mixed whatever the underlying encoding converter returns
+     *               (presumably the converted string - confirm against
+     *               EncodingConverter)
+     */
+    function convert($string)
+    {
+        if (!$this->started) {
+            // Only the very first chunk can carry a byte order mark.
+            $this->started = true;
+            $string = Utf8::instance()->trim($string);
+        }
+        return $this->encoding_converter->convert($string);
+    }
+
+}
\ No newline at end of file
diff --git a/main/inc/lib/system/text/utf8_encoder.class.php b/main/inc/lib/system/text/utf8_encoder.class.php
new file mode 100644
index 0000000000..aabe921cf6
--- /dev/null
+++ b/main/inc/lib/system/text/utf8_encoder.class.php
@@ -0,0 +1,72 @@
+<?php
+
+/**
+ * Converts text, fed in one or several chunks, from another encoding to
+ * UTF-8. Utf8::BOM is prepended to the first chunk, and HTML entities can
+ * optionally be decoded.
+ */
+class Utf8Encoder extends Converter
+{
+
+    // False until the first chunk has been converted since the last reset().
+    protected $started = false;
+    protected $from_encoding;
+    // Object built by EncodingConverter::create(); does the actual conversion.
+    protected $encoding_converter;
+    protected $convert_html_entities = false;
+
+    /**
+     * @param string|null $from_encoding source encoding; any empty value falls
+     *                                   back to Encoding::system()
+     * @param bool $convert_html_entities when true, convert() also decodes
+     *                                    HTML entities
+     */
+    function __construct($from_encoding = null , $convert_html_entities = false)
+    {
+        $this->from_encoding = $from_encoding ? $from_encoding : Encoding::system();
+        $this->encoding_converter = EncodingConverter::create($this->from_encoding, Utf8::NAME);
+        $this->convert_html_entities = $convert_html_entities;
+        $this->reset();
+    }
+
+    function from_encoding()
+    {
+        return $this->from_encoding;
+    }
+
+    function to_encoding()
+    {
+        return Utf8::NAME;
+    }
+
+    function get_convert_html_entities()
+    {
+        return $this->convert_html_entities;
+    }
+
+    /**
+     * Marks the stream as not started, so the next chunk passed to convert()
+     * is again treated as the first one and gets the BOM prepended.
+     */
+    function reset()
+    {
+        $this->started = false;
+    }
+
+    /**
+     * Converts one chunk of text from the source encoding to UTF-8.
+     *
+     * @param string $string chunk in the source encoding
+     * @return string converted chunk; the first chunk after construction or
+     *                reset() is prefixed with Utf8::BOM
+     */
+    function convert($string)
+    {
+        $string = $this->encoding_converter->convert($string);
+        if ($this->convert_html_entities) {
+            // Entities are decoded only after the text is UTF-8. Decoding
+            // first would insert UTF-8 bytes into text still in the source
+            // encoding, and the conversion above would then mangle them.
+            $string = html_entity_decode($string, ENT_COMPAT, Utf8::NAME);
+        }
+        if (!$this->started) {
+            $this->started = true;
+            $string = Utf8::BOM . $string;
+        }
+        return $string;
+    }
+
+}
\ No newline at end of file