#4758 glossaire csv export with international chars do not work

skala
Laurent Opprecht 14 years ago
parent 7b2e9fbbd7
commit f2c93cdd38
  1. 25
      main/glossary/index.php
  2. 12
      main/inc/lib/autoload.class.php
  3. 15
      main/inc/lib/chamilo.class.php
  4. 27
      main/inc/lib/export.lib.inc.php
  5. 8
      main/inc/lib/import.lib.php
  6. 166
      main/inc/lib/internationalization.lib.php
  7. 170
      main/inc/lib/system/io/csv_reader.class.php
  8. 93
      main/inc/lib/system/io/csv_writer.class.php
  9. 182
      main/inc/lib/system/io/file_reader.class.php
  10. 81
      main/inc/lib/system/io/file_writer.class.php
  11. 33
      main/inc/lib/system/text/converter.class.php
  12. 158
      main/inc/lib/system/text/encoding.class.php
  13. 66
      main/inc/lib/system/text/encoding_converter.class.php
  14. 287
      main/inc/lib/system/text/utf8.class.php
  15. 54
      main/inc/lib/system/text/utf8_decoder.class.php
  16. 72
      main/inc/lib/system/text/utf8_encoder.class.php

@ -80,7 +80,7 @@ if (isset($_GET['action']) && $_GET['action'] == 'export') {
$list[] = array ($line[0], $line[1]);
}
$filename = 'glossary_course_'.api_get_course_id();
Export::export_table_csv($list,$filename);
Export::export_table_csv_utf8($list, $filename);
}
Display::display_header($tool_name);
@ -214,18 +214,17 @@ if (api_is_allowed_to_edit(null, true)) {
}
}
}
$data = Import::csv_to_array($_FILES['file']['tmp_name']);
if (!empty($data)) {
$good = 0;
$bad = 0;
foreach($data as $item) {
if (GlossaryManager::save_glossary(array('glossary_title' => $item['term'], 'glossary_comment' => $item['definition']), false))
$good++;
else
$bad++;
}
}
//$data = Import::csv_to_array($_FILES['file']['tmp_name']);
$data = Import::csv_reader($_FILES['file']['tmp_name']);
$good = 0;
$bad = 0;
foreach($data as $item) {
if (GlossaryManager::save_glossary(array('glossary_title' => $item['term'], 'glossary_comment' => $item['definition']), false))
$good++;
else
$bad++;
}
Display::display_confirmation_message (get_lang ("TermsImported") . ':' . $good);
if ($bad)

@ -102,6 +102,7 @@ class Autoload
$result['ClosureCompiler'] = '/main/inc/lib/closure_compiler.class.php';
$result['CodeUtilities'] = '/main/inc/lib/code_utilities.class.php';
$result['ConditionalLogin'] = '/main/inc/lib/conditional_login.class.php';
$result['Converter'] = '/main/inc/lib/system/text/converter.class.php';
$result['Course'] = '/main/coursecopy/classes/Course.class.php';
$result['CourseArchiver'] = '/main/coursecopy/classes/CourseArchiver.class.php';
$result['CourseBuilder'] = '/main/coursecopy/classes/CourseBuilder.class.php';
@ -116,6 +117,8 @@ class Autoload
$result['CourseRestorer'] = '/main/coursecopy/classes/CourseRestorer.class.php';
$result['CourseSelectForm'] = '/main/coursecopy/classes/CourseSelectForm.class.php';
$result['CourseSession'] = '/main/coursecopy/classes/CourseSession.class.php';
$result['CsvReader'] = '/main/inc/lib/system/io/csv_reader.class.php';
$result['CsvWriter'] = '/main/inc/lib/system/io/csv_writer.class.php';
$result['CustomPages'] = '/main/inc/lib/custompages.lib.php';
$result['DashboardManager'] = '/main/inc/lib/dashboard.lib.php';
$result['DataForm'] = '/main/gradebook/lib/fe/dataform.class.php';
@ -128,6 +131,8 @@ class Autoload
$result['DokeosIndexer'] = '/main/inc/lib/search/DokeosIndexer.class.php';
$result['DropboxLink'] = '/main/gradebook/lib/be/dropboxlink.class.php';
$result['DummyCourseCreator'] = '/main/coursecopy/classes/DummyCourseCreator.class.php';
$result['Encoding'] = '/main/inc/lib/system/text/encoding.class.php';
$result['EncodingConverter'] = '/main/inc/lib/system/text/encoding_converter.class.php';
$result['EntityGenerator'] = '/main/inc/lib/tools/entity_generator.class.php';
$result['EvalForm'] = '/main/gradebook/lib/fe/evalform.class.php';
$result['EvalLink'] = '/main/gradebook/lib/be/evallink.class.php';
@ -138,6 +143,8 @@ class Autoload
$result['ExerciseResult'] = '/main/exercice/exercise_result.class.php';
$result['ExerciseShowFunctions'] = '/main/inc/lib/exercise_show_functions.lib.php';
$result['FileManager'] = '/main/inc/lib/fileManage.lib.php';
$result['FileReader'] = '/main/inc/lib/system/io/file_reader.class.php';
$result['FileWriter'] = '/main/inc/lib/system/io/file_writer.class.php';
$result['FillBlanks'] = '/main/exercice/fill_blanks.class.php';
$result['FlatViewDataGenerator'] = '/main/gradebook/lib/flatview_data_generator.class.php';
$result['FlatViewTable'] = '/main/gradebook/lib/fe/flatviewtable.class.php';
@ -260,6 +267,7 @@ class Autoload
$result['MyHorBar'] = '/main/inc/lib/pchart/MyHorBar.class.php';
$result['MySpace'] = '/main/mySpace/myspace.lib.php';
$result['Nanogong'] = '/main/inc/lib/nanogong.lib.php';
$result['NewMediaForm'] = '/main/media/lib/new_media_form.class.php';
$result['NotebookManager'] = '/main/inc/lib/notebook.lib.php';
$result['Notification'] = '/main/inc/lib/notification.lib.php';
$result['OLE'] = '/main/inc/lib/pear/OLE/OLE.php';
@ -390,6 +398,9 @@ class Autoload
$result['UserManager'] = '/main/inc/lib/usermanager.lib.php';
$result['UserStore'] = '/main/auth/shibboleth/app/model/user.class.php';
$result['UserTable'] = '/main/gradebook/lib/fe/usertable.class.php';
$result['Utf8'] = '/main/inc/lib/system/text/utf8.class.php';
$result['Utf8Decoder'] = '/main/inc/lib/system/text/utf8_decoder.class.php';
$result['Utf8Encoder'] = '/main/inc/lib/system/text/utf8_encoder.class.php';
$result['Wiki'] = '/main/coursecopy/classes/wiki.class.php';
$result['XapianIndexer'] = '/main/inc/lib/search/xapian/XapianIndexer.class.php';
$result['ZombieManager'] = '/main/inc/lib/zombie/zombie_manager.class.php';
@ -449,6 +460,7 @@ class Autoload
$result['xhtdoc'] = '/main/inc/lib/xht.lib.php';
$result['xmddoc'] = '/main/inc/lib/xmd.lib.php';
return $result;
}

@ -9,7 +9,7 @@
*/
class Chamilo
{
public static function name()
{
//@todo: add version
@ -25,7 +25,6 @@ class Chamilo
{
return api_get_setting('server_type') == 'production';
}
/**
* Returns a full url from local/absolute path and parameters.
@ -39,12 +38,12 @@ class Chamilo
{
return Uri::url($path, $params, $html);
}
public static function here($params = array(), $html = true)
{
return Uri::here($params, $html);
}
/**
* Application web root
*/
@ -62,12 +61,18 @@ class Chamilo
{
return api_get_path(SYS_PATH);
}
public static function root_courses()
{
return api_get_path(SYS_COURSE_PATH);
}
/**
 * Builds a unique path for a temporary file.
 *
 * The path is made of api_get_path(SYS_ARCHIVE_PATH), a uniqid() value and
 * the optional extension. Only the path string is produced; no file is created.
 *
 * @param string $ext optional file extension, without the leading dot
 * @return string
 */
public static function temp($ext = '')
{
    $suffix = '';
    if ($ext) {
        $suffix = '.' . $ext;
    }
    $directory = api_get_path(SYS_ARCHIVE_PATH);
    return $directory . uniqid() . $suffix;
}
public static function path($path = '')
{
$root = self::root();

@ -25,9 +25,8 @@ class Export {
}
/**
* Export tabular data to CSV-file
* @param array $data
* @param string $filename
*
* @deprecated use export_table_csv_utf8 instead
*/
public static function export_table_csv ($data, $filename = 'export') {
$file = api_get_path(SYS_ARCHIVE_PATH).uniqid('').'.csv';
@ -48,6 +47,28 @@ class Export {
DocumentManager :: file_send_for_download($file, true, $filename.'.csv');
return false;
}
/**
* Export tabular data to CSV-file
* @param array $data
* @param string $filename
*/
public static function export_table_csv_utf8 ($data, $filename = 'export') {
    // Nothing to export: stop before any temporary file gets created.
    if (empty($data)) {
        return false;
    }
    $path = Chamilo::temp();
    // Encoder from the system encoding (null) to UTF-8, with html entity
    // conversion switched on (second argument).
    $encoder = new Utf8Encoder(null, true);
    $writer = CsvWriter::create(FileWriter::create($path, $encoder));
    foreach ($data as $row) {
        $writer->put($row);
    }
    $writer->close();
    DocumentManager :: file_send_for_download($path, true, $filename.'.csv');
    // The temporary file is removed once it has been handed over for download.
    unlink($path);
    return false;
}
/**
* Export tabular data to XLS-file

@ -10,6 +10,11 @@
* @package chamilo.library
*/
class Import {
static function csv_reader($path)
{
    // Goes through the CsvReader factory, which applies the same default
    // delimiter (';') and enclosure ('"') as the constructor.
    return CsvReader::create($path);
}
/**
* Reads a CSV-file into an array. The first line of the CSV-file should contain the array-keys.
@ -27,6 +32,9 @@ class Import {
* ...
* @param string $filename The path to the CSV-file which should be imported.
* @return array Returns an array (in the system encoding) that contains all data from the CSV-file.
*
*
* @deprecated use csv_reader instead
*/
function csv_to_array($filename) {
$result = array();

@ -3676,171 +3676,11 @@ function api_detect_encoding($string, $language = null) {
/**
* Checks a string for UTF-8 validity.
* @param string $string The string to be tested/validated.
* @return bool Returns TRUE when the tested string is valid UTF-8 one, FALSE othewise.
* @link http://en.wikipedia.org/wiki/UTF-8
*
* @deprecated Use Encoding::utf8()->is_valid() instead
*/
function api_is_valid_utf8(&$string) {
    // Deprecated wrapper: the byte-level validation is implemented once, in
    // Utf8::is_valid(), and this function only delegates to it.
    return Encoding::utf8()->is_valid($string);
}
/**

@ -0,0 +1,170 @@
<?php
/**
* Read CSV data from a stream - string/FileReader.
*
* Returns data as associative arrays (headers are the keys of the array).
* Skip blank lines ?? is it such a good idea?
*
* Usage:
*
* $reader = CsvReader::create('path');
* foreach($reader as $items){
* foreach($items as $key=>$value){
* echo "$key : $value";
* }
* }
*
*
*
* @copyright (c) 2012 University of Geneva
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html
* @author Laurent Opprecht <laurent@opprecht.info>
*/
class CsvReader implements Iterator
{

    /**
     * Factory method.
     *
     * @param string|FileReader $stream     file path, or a line-oriented reader object
     * @param string            $delimiter  field delimiter, only its first character is used
     * @param string            $enclosure  field enclosure, only its first character is used
     * @return CsvReader
     */
    static function create($stream, $delimiter = ';', $enclosure = '"')
    {
        return new self($stream, $delimiter, $enclosure);
    }

    protected $stream = null;
    protected $headers = array();
    protected $delimiter = '';
    protected $enclosure = '';
    protected $current = false;
    protected $index = -1;

    /**
     * @param string|FileReader $stream     file path, or a line-oriented reader object
     * @param string            $delimiter  falls back to ';' when empty
     * @param string            $enclosure  falls back to '"' when empty
     */
    function __construct($stream, $delimiter = ';', $enclosure = '"')
    {
        $this->stream = $stream;
        $this->delimiter = $delimiter ? substr($delimiter, 0, 1) : ';';
        $this->enclosure = $enclosure ? substr($enclosure, 0, 1) : '"';
    }

    function get_delimiter()
    {
        return $this->delimiter;
    }

    function get_enclosure()
    {
        return $this->enclosure;
    }

    /**
     * Column names read from the first non blank line. Empty until the
     * reader has been rewound or advanced at least once.
     *
     * @return array
     */
    function headers()
    {
        return $this->headers;
    }

    /**
     * Returns the underlying line reader. A path given as a string is
     * turned into a FileReader on first use.
     *
     * @return FileReader
     */
    function stream()
    {
        if (is_string($this->stream)) {
            $this->stream = new FileReader($this->stream);
        }
        return $this->stream;
    }

    /**
     * Tells whether a value read from the stream carries no data.
     *
     * Only false (end of stream), null and the empty string are blank.
     * A line made of the single character "0" is real data and must not be
     * dropped, which is why empty() is not used here.
     *
     * @param mixed $line
     * @return bool
     */
    protected function is_blank($line)
    {
        return $line === false || $line === null || $line === '';
    }

    /**
     * Splits a line into fields. Once headers are known, fields are keyed by
     * the header found at the same position; fields without a usable header
     * are appended with a numeric key.
     *
     * @param string|false $line
     * @return array
     */
    protected function decode($line)
    {
        if ($this->is_blank($line)) {
            return array();
        }
        $data = api_str_getcsv($line, $this->get_delimiter(), $this->get_enclosure());
        if ($this->headers) {
            $result = array();
            foreach ($data as $index => $value) {
                $key = isset($this->headers[$index]) ? $this->headers[$index] : false;
                if ($key) {
                    $result[$key] = $value;
                } else {
                    $result[] = $value;
                }
            }
        } else {
            $result = $data;
        }
        return $result;
    }

    /**
     * Returns the next non blank line, or false at the end of the stream.
     *
     * @return boolean|string
     */
    protected function next_line()
    {
        do {
            $line = $this->stream()->next();
        } while ($line === '' || $line === null);
        return $line;
    }

    public function current()
    {
        return $this->current;
    }

    public function key()
    {
        return $this->index;
    }

    public function next()
    {
        // When iteration starts without rewind() the first line still has to
        // be consumed as the header row.
        if (empty($this->headers)) {
            $line = $this->next_line();
            $this->headers = $this->decode($line);
        }
        $line = $this->next_line();
        if ($line !== false) {
            $this->current = $this->decode($line);
            $this->index++;
        } else {
            $this->current = false;
        }
        return $this->current;
    }

    public function rewind()
    {
        $this->stream()->rewind();
        $line = $this->stream()->current();
        if ($this->is_blank($line)) {
            $line = $this->next_line();
        }
        // Forget headers from a previous pass: otherwise the header row would
        // itself be decoded as a keyed data row and break the key mapping.
        $this->headers = array();
        $this->headers = $this->decode($line);
        $this->index = -1;
        $this->next();
    }

    public function valid()
    {
        return $this->current !== false;
    }

    function __clone()
    {
        $this->stream()->rewind();
        $this->current = false;
        $this->index = -1;
        $this->headers = array();
    }

}

@ -0,0 +1,93 @@
<?php
/**
* Write array data to a stream in CSV format. Usage:
*
* $writer = CsvWriter::create('path');
*
* $writer->put($headers);
* $writer->put($line_1);
* $writer->put($line_2);
*
* @copyright (c) 2012 University of Geneva
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html
* @author Laurent Opprecht <laurent@opprecht.info>
*/
class CsvWriter
{

    /**
     * Factory method.
     *
     * @param string|object $stream     file path, or an object exposing writeln() and close()
     * @param string        $delimiter  field delimiter, only its first character is used
     * @param string        $enclosure  field enclosure, only its first character is used
     * @return CsvWriter
     */
    static function create($stream, $delimiter = ';', $enclosure = '"')
    {
        return new self($stream, $delimiter, $enclosure);
    }

    protected $stream = null;
    protected $delimiter = '';
    protected $enclosure = '';

    /**
     * @param string|object $stream     file path, or an object exposing writeln() and close()
     * @param string        $delimiter  falls back to ';' when empty
     * @param string        $enclosure  falls back to '"' when empty
     */
    function __construct($stream, $delimiter = ';', $enclosure = '"')
    {
        $this->stream = $stream;
        $this->delimiter = $delimiter ? substr($delimiter, 0, 1) : ';';
        $this->enclosure = $enclosure ? substr($enclosure, 0, 1) : '"';
    }

    function get_delimiter()
    {
        return $this->delimiter;
    }

    function get_enclosure()
    {
        return $this->enclosure;
    }

    /**
     * Returns the underlying writer. A path given as a string is turned into
     * a FileWriter on first use.
     *
     * @return FileWriter
     */
    protected function stream()
    {
        if (is_string($this->stream)) {
            $this->stream = new FileWriter($this->stream);
        }
        return $this->stream;
    }

    /**
     * Alias of put().
     *
     * @param array $items
     */
    function write($items)
    {
        $this->put($items);
    }

    /**
     * Alias of put().
     *
     * @param array $items
     */
    function writeln($items)
    {
        $this->put($items);
    }

    /**
     * Writes one record as a single line. Every field is wrapped in the
     * enclosure character and enclosure characters inside a field are doubled.
     *
     * @param array $items fields of the record
     */
    function put($items)
    {
        $enclosure = $this->enclosure;
        $fields = array();
        foreach ($items as $item) {
            // Cast first: null/numeric fields are written as their string
            // form instead of being handed to str_replace() as non-strings.
            $escaped = str_replace($enclosure, $enclosure . $enclosure, (string) $item);
            $fields[] = $enclosure . $escaped . $enclosure;
        }
        $line = implode($this->delimiter, $fields);
        $this->stream()->writeln($line);
    }

    /**
     * Closes the underlying writer, if one was opened, and releases it.
     */
    function close()
    {
        if (is_object($this->stream)) {
            $this->stream->close();
        }
        $this->stream = null;
    }

}

@ -0,0 +1,182 @@
<?php
/**
* Read text from a file. Reader is line oriented and not char oriented.
* The default converter converts from the file encoding - auto-detected - to
* system encoding.
*
* Usage:
*
* $file = FileReader::create('path');
* foreach($file as $line)
* {
* ...
* }
*
* @copyright (c) 2012 University of Geneva
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html
* @author Laurent Opprecht <laurent@opprecht.info>
*/
class FileReader implements Iterator
{

    const EOL = "\n";

    /**
     * Factory method.
     *
     * @param string $path       path of the file to read
     * @param object $converter  optional converter applied to every line; when
     *                           omitted one is derived from the detected file encoding
     * @return FileReader
     */
    static function create($path, $converter = null)
    {
        return new self($path, $converter);
    }

    /**
     * Detects the file encoding from the beginning of the file.
     *
     * @param string $path
     * @return Encoding
     */
    static function detect_encoding($path)
    {
        $abstract = array();
        // We assume that 200 lines are enough for encoding detection.
        // The raw data is needed here, so the file is read directly and not
        // through a reader. Reading x chars instead of whole lines would not
        // be safe with utf (chars may be split in the middle).
        $handle = fopen($path, 'r');
        if ($handle !== false) {
            $i = 0;
            while (($line = fgets($handle)) !== false && $i < 200) {
                $i++;
                $abstract[] = $line;
            }
            fclose($handle);
        }
        // An unreadable file yields an empty abstract instead of calling
        // fgets()/fclose() on an invalid handle.
        return Encoding::detect_encoding(implode('', $abstract));
    }

    protected $path = '';
    protected $handle = null;
    protected $current = false;
    protected $index = -1;
    protected $converter = null;

    /**
     * @param string $path       path of the file to read
     * @param object $converter  optional converter; defaults to the decoder of
     *                           the detected file encoding
     */
    function __construct($path, $converter = null)
    {
        if (empty($converter)) {
            $encoding = self::detect_encoding($path);
            $converter = $encoding->decoder();
        }
        $this->path = $path;
        $this->converter = $converter;
    }

    /**
     *
     * @return Converter
     */
    function get_converter()
    {
        return $this->converter;
    }

    /**
     * Returns the file handle, opening the file on first use.
     *
     * @return resource|false false when the file could not be opened
     */
    function handle()
    {
        if (is_null($this->handle)) {
            $this->handle = fopen($this->path, 'r');
        }
        return $this->handle;
    }

    /**
     * Read at most $count lines, starting from the beginning of the file.
     *
     * @param int $count
     * @return array always an array, empty when there is nothing to read
     */
    function read_lines($count)
    {
        $result = array();
        $i = 0;
        foreach ($this as $line) {
            if ($i >= $count) {
                return $result;
            }
            $i++;
            $result[] = $line;
        }
        return $result;
    }

    /**
     * Reads the next line.
     *
     * @return string|false
     */
    function read_line()
    {
        return $this->next();
    }

    function close()
    {
        if (is_resource($this->handle)) {
            fclose($this->handle);
        }
        $this->handle = null;
    }

    protected function convert($text)
    {
        return $this->converter->convert($text);
    }

    /**
     * Puts the converter back in its initial state. Converters that keep no
     * state and expose no reset() method are left untouched.
     */
    protected function reset_converter()
    {
        if (is_object($this->converter) && method_exists($this->converter, 'reset')) {
            $this->converter->reset();
        }
    }

    public function current()
    {
        return $this->current;
    }

    public function key()
    {
        return $this->index;
    }

    /**
     * Advances to the next line. The line ending is stripped and the line is
     * passed through the converter.
     *
     * @return string|false false at the end of the file or when it cannot be opened
     */
    public function next()
    {
        $handle = $this->handle();
        if ($handle === false) {
            $this->current = false;
            return false;
        }
        $line = fgets($handle);
        if ($line !== false) {
            $line = rtrim($line, "\r\n");
            $line = $this->convert($line);
            $this->index++;
        }
        $this->current = $line;
        return $this->current;
    }

    public function rewind()
    {
        $this->reset_converter();
        if ($handle = $this->handle()) {
            rewind($handle);
        }
        $this->current = false;
        $this->index = -1;
        $this->next();
    }

    public function valid()
    {
        return $this->current !== false;
    }

    function __clone()
    {
        $this->handle = null;
        $this->current = false;
        $this->index = -1;
        $this->reset_converter();
    }

}

@ -0,0 +1,81 @@
<?php
/**
* Write data to file. Default to UTF8 encoding.
*
* @copyright (c) 2012 University of Geneva
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html
* @author Laurent Opprecht <laurent@opprecht.info>
*/
class FileWriter
{

    /**
     * Factory method.
     *
     * @param string $path
     * @param Converter $converter
     * @return FileWriter
     */
    static function create($path, $converter = null)
    {
        return new self($path, $converter);
    }

    const EOL = "\n";

    protected $path = '';
    protected $handle = null;
    protected $converter = null;

    /**
     * @param string    $path       file to write to
     * @param Converter $converter  applied to every piece of text before it is
     *                              written; defaults to the UTF-8 encoder
     */
    function __construct($path, $converter = null)
    {
        if (!$converter) {
            $converter = Encoding::utf8()->encoder();
        }
        $this->path = $path;
        $this->converter = $converter;
    }

    /**
     * The converter applied to written text.
     *
     * @return Converter
     */
    function get_converter()
    {
        return $this->converter;
    }

    /**
     * Returns the file handle. The file is opened lazily, in 'a+' mode, the
     * first time it is needed.
     */
    protected function handle()
    {
        if ($this->handle === null) {
            $this->handle = fopen($this->path, 'a+');
        }
        return $this->handle;
    }

    /**
     * Writes the converted text as is.
     */
    function write($text)
    {
        $target = $this->handle();
        $converted = $this->convert($text);
        fwrite($target, $converted);
    }

    /**
     * Writes the converted text followed by an end of line.
     */
    function writeln($text)
    {
        $target = $this->handle();
        $converted = $this->convert($text);
        fwrite($target, $converted . self::EOL);
    }

    /**
     * Closes the file if it was opened; the handle is forgotten either way.
     */
    function close()
    {
        $opened = $this->handle;
        $this->handle = null;
        if (is_resource($opened)) {
            fclose($opened);
        }
    }

    protected function convert($text)
    {
        return $this->converter->convert($text);
    }

}

@ -0,0 +1,33 @@
<?php
/**
* Convert text. Used mostly to convert from one encoding to another.
*
* @copyright (c) 2012 University of Geneva
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html
* @author Laurent Opprecht <laurent@opprecht.info>
*/
class Converter
{

    /**
     * Identity converter. Returns the string with no transformations.
     * The same instance is returned on every call.
     *
     * @return Converter
     */
    public static function identity()
    {
        static $result = null;
        if (empty($result)) {
            $result = new self();
        }
        return $result;
    }

    /**
     * Puts the converter back in its initial state.
     *
     * The base converter keeps no state, so there is nothing to do. The
     * method exists so that code holding any converter (the identity one
     * included) can call reset() without checking its concrete class.
     */
    function reset()
    {
    }

    /**
     * Converts a string. The base implementation returns it unchanged.
     *
     * @param string $string
     * @return string
     */
    function convert($string)
    {
        return $string;
    }

}

@ -0,0 +1,158 @@
<?php
/**
* Set the system encoding to the platform encoding.
*
* @todo:
* Note: those lines are here for ease of use only. They should be move away:
*
* 1 first autodetection should be done inside the Encoding class
* 2 this library should not call a chamilo specific function (this should
* be the other way around, chamilo calling the encoding functions)
*/
// Executed when this file is loaded: the value returned by
// api_get_system_encoding() is handed to Encoding::system(), which stores it
// as the default encoding when it is a string or an object.
$plateform_encoding = api_get_system_encoding();
Encoding::system($plateform_encoding);
/**
* Encoding class. Handles text encoding. Usage:
*
* $encoding = Encoding::get('name');
* $decoder = $encoding->decoder();
* $decoder->convert('text');
*
* The system encoding is the platform/system/default encoding. This defaults to
* UTF8 but can be changed:
*
* Encoding::system('name');
*
* Note that Encoding returns to its name when converted to a string. As such it
* can be used in places where a string is expected:
*
* $utf8 = Encoding::Utf8();
* echo $utf8;
*
* @copyright (c) 2012 University of Geneva
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html
* @author Laurent Opprecht <laurent@opprecht.info>
*/
class Encoding
{

    private static $system = null;

    /**
     * Returns the encoding object for $name. Objects are returned untouched
     * and any spelling of the UTF-8 name maps to the shared Utf8 instance.
     *
     * @param string|Encoding $name
     * @return Encoding
     */
    public static function get($name)
    {
        if (is_object($name)) {
            return $name;
        }
        if (Encoding::utf8()->is($name)) {
            return self::utf8();
        }
        return new self($name);
    }

    /**
     * Returns the Utf8 encoding.
     *
     * @return Utf8
     */
    public static function utf8()
    {
        return Utf8::instance();
    }

    /**
     * Returns the system/default encoding. When called with an object or a
     * string the system encoding is replaced first. UTF-8 is used as long as
     * nothing has been set.
     *
     * @param string|Encoding|null $value
     * @return Encoding
     */
    public static function system($value = null)
    {
        if (is_object($value)) {
            self::$system = $value;
        } elseif (is_string($value)) {
            self::$system = self::get($value);
        }
        if (self::$system) {
            return self::$system;
        }
        return self::utf8();
    }

    /**
     * Detect encoding from an abstract (a sample of the text).
     *
     * @param string $abstract
     * @return Encoding
     */
    public static function detect_encoding($abstract)
    {
        return self::get(api_detect_encoding($abstract));
    }

    protected $name = '';

    protected function __construct($name = '')
    {
        $this->name = $name;
    }

    /**
     * The name of the encoding
     *
     * @return string
     */
    function name()
    {
        return $this->name;
    }

    /**
     * The Byte Order Mark. Empty for a generic encoding.
     *
     * @see http://en.wikipedia.org/wiki/Byte_order_mark
     * @return string
     */
    function bom()
    {
        return '';
    }

    /**
     * Returns a converter from this encoding to another encoding.
     *
     * @param string|Encoding $to Encoding to convert to, defaults to system encoding
     * @return Converter
     */
    public function decoder($to = null)
    {
        if (!$to) {
            $to = Encoding::system();
        }
        return EncodingConverter::create($this, $to);
    }

    /**
     * Returns a converter from another encoding to this encoding.
     *
     * @param string|Encoding $from Encoding to convert from, defaults to system encoding.
     * @return Converter
     */
    public function encoder($from = null)
    {
        if (!$from) {
            $from = Encoding::system();
        }
        return EncodingConverter::create($from, $this);
    }

    /**
     * An encoding converts to its name when used as a string.
     */
    function __toString()
    {
        return $this->name();
    }

}

@ -0,0 +1,66 @@
<?php
/**
* Convert text from one encoding to another. Usage:
*
* $converter = EncodingConverter::create($from, $to);
* $converter->convert($text);
*
* Note that the create function returns an identity converter if the from and to
* encodings are the same. This is why the constructor is protected.
*
* @copyright (c) 2012 University of Geneva
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html
* @author Laurent Opprecht <laurent@opprecht.info>
*/
class EncodingConverter extends Converter
{

    /**
     * Returns a converter from one encoding to another. When both names are
     * the same (case-insensitive) the shared identity converter is returned.
     *
     * @param string|Encoding $from_encoding
     * @param string|Encoding $to_encoding
     *
     * @return Converter
     */
    public static function create($from_encoding, $to_encoding)
    {
        $from_encoding = (string) $from_encoding;
        $to_encoding = (string) $to_encoding;
        if (strtolower($from_encoding) == strtolower($to_encoding)) {
            return Converter::identity();
        }
        // The new converter must be returned: without it callers receive null
        // and fail on their first convert() call.
        return new self($from_encoding, $to_encoding);
    }

    protected $from_encoding;
    protected $to_encoding;

    /**
     * @param string $from_encoding
     * @param string $to_encoding
     */
    protected function __construct($from_encoding, $to_encoding)
    {
        $this->from_encoding = $from_encoding;
        $this->to_encoding = $to_encoding;
    }

    function from_encoding()
    {
        return $this->from_encoding;
    }

    function to_encoding()
    {
        return $this->to_encoding;
    }

    /**
     * Converts $string from the source encoding to the target encoding.
     *
     * @param string $string
     * @return string the converted text
     */
    function convert($string)
    {
        $from = $this->from_encoding;
        $to = $this->to_encoding;
        if ($from == $to) {
            return $string;
        }
        // Hand the converted text back to the caller instead of dropping it.
        return api_convert_encoding($string, $to, $from);
    }

}

@ -0,0 +1,287 @@
<?php
/**
* Utf8 encoding class. Provides utility function to deal with UTF8 encoding.
*
* @license see /license.txt
* @author Laurent Opprecht <laurent@opprecht.info> for the Univesity of Geneva
* @author More authors, mentioned in the correpsonding fragments of this source.
*/
class Utf8 extends Encoding
{

    const PATTERN_NOT_VISIBLE_CHARS = '/[^[:print:]-]/'; //Visible characters and the space character

    /**
     * @see http://en.wikipedia.org/wiki/Byte_order_mark
     */
    const BOM = "\xEF\xBB\xBF";
    const NAME = 'UTF-8';

    /**
     * Returns the shared Utf8 instance.
     *
     * @return Utf8
     */
    public static function instance()
    {
        static $result = null;
        if (empty($result)) {
            $result = new self();
        }
        return $result;
    }

    /**
     * Returns true if encoding is UTF8. The comparison on the name is case-insensitive.
     *
     * @param string|Encoding $encoding
     * @return bool
     */
    function is($encoding)
    {
        $encoding = (string) $encoding;
        return strtolower($encoding) == strtolower(self::NAME);
    }

    protected function __construct()
    {
        parent::__construct(self::NAME);
    }

    function name()
    {
        return self::NAME;
    }

    function bom()
    {
        return self::BOM;
    }

    /**
     * Returns the hexadecimal representation of an utf8 string, one group of
     * bytes per character, each group followed by a space. Useful to
     * understand what is going on - not printable chars, rare patterns such
     * as e' for é, etc.
     *
     * @param string $text
     * @return string
     */
    function to_hex($text)
    {
        $result = '';
        // The encoding is passed explicitly to the mb_* calls so that the
        // process-wide mb_internal_encoding() setting is left untouched.
        for ($i = 0, $n = mb_strlen($text, self::NAME); $i < $n; $i++) {
            $char = mb_substr($text, $i, 1, self::NAME);
            $num = strlen($char);
            for ($j = 0; $j < $num; $j++) {
                $result .= sprintf('%02x', ord($char[$j]));
            }
            $result .= ' ';
        }
        return $result;
    }

    /**
     * Trim the BOM from the beginning of an utf-8 string
     *
     * @param string $text
     * @return string
     */
    function trim($text)
    {
        $bom = self::BOM;
        $size = strlen($bom);
        if (strlen($text) < $size) {
            return $text;
        }
        if (substr($text, 0, $size) == $bom) {
            return substr($text, $size);
        }
        return $text;
    }

    /**
     * Checks a string for UTF-8 validity.
     *
     * The check is structural: every lead byte must be followed by the number
     * of continuation bytes (10xxxxxx) that its high bits announce. Lead bytes
     * announcing up to six bytes per character are accepted.
     *
     * @param string $string The string to be tested.
     * @return bool Returns TRUE when the tested string is valid UTF-8, FALSE otherwise.
     * @link http://en.wikipedia.org/wiki/UTF-8
     * @author see internationalization.lib.php
     */
    static function is_valid(&$string)
    {
        //return @mb_detect_encoding($string, 'UTF-8', true) == 'UTF-8' ? true : false;
        // Ivan Tcholakov, 05-OCT-2008: I do not trust mb_detect_encoding(). I have
        // found a string with a single cyrillic letter (single byte), that is
        // wrongly detected as UTF-8. Possibly, there would be problems with other
        // languages too. An alternative implementation will be used.
        $str = (string) $string;
        $len = api_byte_count($str);
        $i = 0;
        while ($i < $len) {
            // Here the current character begins. Its size is determined by
            // the senior bits in the first byte.
            $lead = ord($str[$i++]);
            if (($lead & 0x80) == 0x00) {           // 0xxxxxxx - single byte
                $continuation = 0;
            } elseif (($lead & 0xE0) == 0xC0) {     // 110xxxxx - two bytes
                $continuation = 1;
            } elseif (($lead & 0xF0) == 0xE0) {     // 1110xxxx - three bytes
                $continuation = 2;
            } elseif (($lead & 0xF8) == 0xF0) {     // 11110xxx - four bytes
                $continuation = 3;
            } elseif (($lead & 0xFC) == 0xF8) {     // 111110xx - five bytes
                $continuation = 4;
            } elseif (($lead & 0xFE) == 0xFC) {     // 1111110x - six bytes
                $continuation = 5;
            } else {
                return false; // In any other case the character is invalid.
            }
            for ($k = 0; $k < $continuation; $k++) {
                if ($i == $len) {
                    return false; // Unexpected end of the string.
                }
                if ((ord($str[$i++]) & 0xC0) != 0x80) {
                    return false; // Not a 10xxxxxx byte, invalid string.
                }
            }
            // Here the current character is valid, the next one is examined.
        }
        return true; // Empty strings are valid too.
    }

    /**
     * Returns a decoder from UTF-8 to another encoding.
     *
     * @param string|Encoding $to defaults to the system encoding
     * @return Utf8Decoder
     */
    public function decoder($to = null)
    {
        $to = $to ? $to : Encoding::system();
        return new Utf8Decoder($to);
    }

    /**
     * Returns an encoder from another encoding to UTF-8.
     *
     * @param string|Encoding $from defaults to the system encoding
     * @return Utf8Encoder
     */
    public function encoder($from = null)
    {
        $from = $from ? $from : Encoding::system();
        return new Utf8Encoder($from);
    }

}

@ -0,0 +1,54 @@
<?php
/**
* Convert from Utf8 to another encoding:
*
* - remove BOM
* - change encoding
*
* @copyright (c) 2012 University of Geneva
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html
* @author Laurent Opprecht <laurent@opprecht.info>
*/
class Utf8Decoder extends Converter
{

    protected $started = false;
    protected $to_encoding;
    protected $encoding_converter;

    /**
     * @param string|Encoding $to_encoding target encoding, defaults to the system encoding
     */
    function __construct($to_encoding = null)
    {
        if (!$to_encoding) {
            $to_encoding = Encoding::system();
        }
        $this->to_encoding = $to_encoding;
        $this->encoding_converter = EncodingConverter::create(Utf8::NAME, $this->to_encoding);
        $this->reset();
    }

    function from_encoding()
    {
        return Utf8::NAME;
    }

    function to_encoding()
    {
        return $this->to_encoding;
    }

    /**
     * Marks the decoder as not started, so that the next converted string is
     * again treated as the beginning of the text.
     */
    function reset()
    {
        $this->started = false;
    }

    /**
     * Converts a piece of text. The BOM is stripped from the first string
     * converted after construction or reset().
     *
     * @param string $string
     * @return string
     */
    function convert($string)
    {
        $first = !$this->started;
        if ($first) {
            $this->started = true;
            $string = Utf8::instance()->trim($string);
        }
        return $this->encoding_converter->convert($string);
    }

}

@ -0,0 +1,72 @@
<?php
/**
* Encode from another encoding to UTF8:
*
* - add BOM
* - change encoding
* - convert html entities if turned on
*
* Note:
*
* Convert_html_entities cannot be turned on by default. This would be bad
* for performance but, more than anything else, it may be perfectly valid to write
* html entities without transformation - i.e. when writing html content.
*
* It may be better to move convert_html_entities to its own converter and to chain
* converters together to achieve the same result.
*
* @copyright (c) 2012 University of Geneva
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html
* @author Laurent Opprecht <laurent@opprecht.info>
*/
class Utf8Encoder extends Converter
{

    protected $started = false;
    protected $from_encoding;
    protected $encoding_converter;
    protected $convert_html_entities = false;

    /**
     * @param string|Encoding $from_encoding          source encoding, defaults to the system encoding
     * @param bool            $convert_html_entities  when true html entities are replaced by their characters
     */
    function __construct($from_encoding = null , $convert_html_entities = false)
    {
        $this->from_encoding = $from_encoding ? $from_encoding : Encoding::system();
        $this->encoding_converter = EncodingConverter::create($this->from_encoding, Utf8::NAME);
        $this->convert_html_entities = $convert_html_entities;
        $this->reset();
    }

    function from_encoding()
    {
        return $this->from_encoding;
    }

    function to_encoding()
    {
        return Utf8::NAME;
    }

    function get_convert_html_entities()
    {
        return $this->convert_html_entities;
    }

    /**
     * Marks the encoder as not started, so that the next converted string
     * receives the BOM again.
     */
    function reset()
    {
        $this->started = false;
    }

    /**
     * Converts a piece of text to UTF-8. The BOM is prepended to the first
     * string converted after construction or reset().
     *
     * @param string $string text in the source encoding
     * @return string
     */
    function convert($string)
    {
        // Change the encoding first: html_entity_decode() below emits UTF-8
        // characters, so it has to run on text that is already UTF-8.
        // Decoding before the conversion would get those characters
        // re-encoded a second time when the source is not UTF-8.
        $string = $this->encoding_converter->convert($string);
        if ($this->convert_html_entities) {
            $string = html_entity_decode($string, ENT_COMPAT, Utf8::NAME);
        }
        if (!$this->started) {
            $this->started = true;
            $string = Utf8::BOM . $string;
        }
        return $string;
    }

}
Loading…
Cancel
Save