#4758 glossaire csv export with international chars do not work
parent
7b2e9fbbd7
commit
f2c93cdd38
@ -0,0 +1,170 @@ |
||||
<?php |
||||
|
||||
/** |
||||
* Read CSV data from a stream - string/FileReader. |
||||
* |
||||
* Returns data as associative arrays (headers are the keys of the array). |
||||
* Skip blank lines ?? is it such a good idea? |
||||
* |
||||
* Usage: |
||||
* |
||||
* $reader = CsvReader::create('path'); |
||||
* foreach($reader as $items){ |
||||
* foreach($items as $key=>$value){ |
||||
* echo "$key : $value"; |
||||
* } |
||||
* } |
||||
* |
||||
* |
||||
* |
||||
* @copyright (c) 2012 University of Geneva |
||||
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html |
||||
* @author Laurent Opprecht <laurent@opprecht.info> |
||||
*/ |
||||
class CsvReader implements Iterator
{

    /**
     * Factory method, equivalent to calling the constructor.
     *
     * @param string|FileReader $stream     path of the file to read or a line reader object
     * @param string            $delimiter  field delimiter, only the first byte is used
     * @param string            $enclosure  field enclosure, only the first byte is used
     * @return CsvReader
     */
    static function create($stream, $delimiter = ';', $enclosure = '"')
    {
        return new self($stream, $delimiter, $enclosure);
    }

    protected $stream = null;
    protected $headers = array();
    protected $delimiter = '';
    protected $enclosure = '';
    protected $current = false;
    protected $index = -1;

    /**
     * @param string|FileReader $stream     path of the file to read or a line reader object
     * @param string            $delimiter  field delimiter, falls back to ';' when empty
     * @param string            $enclosure  field enclosure, falls back to '"' when empty
     */
    function __construct($stream, $delimiter = ';', $enclosure = '"')
    {
        $this->stream = $stream;
        $this->delimiter = $delimiter ? substr($delimiter, 0, 1) : ';';
        $this->enclosure = $enclosure ? substr($enclosure, 0, 1) : '"';
    }

    /**
     * @return string the one character field delimiter
     */
    function get_delimiter()
    {
        return $this->delimiter;
    }

    /**
     * @return string the one character field enclosure
     */
    function get_enclosure()
    {
        return $this->enclosure;
    }

    /**
     * Headers read from the first non blank line. Empty until the reader
     * has been rewound or advanced at least once.
     *
     * @return array
     */
    function headers()
    {
        return $this->headers;
    }

    /**
     * Returns the underlying line reader. A path given as a string is turned
     * into a FileReader on first access.
     *
     * @return FileReader
     */
    function stream()
    {
        if (is_string($this->stream)) {
            $this->stream = new FileReader($this->stream);
        }
        return $this->stream;
    }

    /**
     * Tells whether a line carries no data at all.
     *
     * Strict comparisons are used on purpose: a line made of the single
     * character "0" is data and must not be treated as blank.
     *
     * @param mixed $line
     * @return bool
     */
    protected function is_blank($line)
    {
        return $line === false || $line === null || $line === '';
    }

    /**
     * Splits a line into fields. Once headers are known the fields are keyed
     * by their header; fields without a usable header are appended with a
     * numeric key.
     *
     * @param string|bool $line
     * @return array
     */
    protected function decode($line)
    {
        if ($this->is_blank($line)) {
            return array();
        }
        $data = api_str_getcsv($line, $this->get_delimiter(), $this->get_enclosure());
        if (!$this->headers) {
            // No headers yet: the caller is reading the header line itself.
            return $data;
        }

        $result = array();
        foreach ($data as $index => $value) {
            $key = isset($this->headers[$index]) ? $this->headers[$index] : false;
            if ($key) {
                $result[$key] = $value;
            } else {
                $result[] = $value;
            }
        }
        return $result;
    }

    /**
     * Returns the next non empty line
     *
     * @return boolean|string false when the stream is exhausted
     */
    protected function next_line()
    {
        while (true) {
            $line = $this->stream()->next();
            if ($line === false) {
                return false;
            }
            if (!$this->is_blank($line)) {
                return $line;
            }
        }
        return false;
    }

    /**
     * @return array|bool the current row, false when there is none
     */
    public function current()
    {
        return $this->current;
    }

    /**
     * @return int zero based index of the current row, headers excluded
     */
    public function key()
    {
        return $this->index;
    }

    /**
     * Moves to the next row, reading the header line first if it has not
     * been read yet.
     *
     * @return array|bool the new current row, false at the end of the stream
     */
    public function next()
    {
        if (empty($this->headers)) {
            $this->headers = $this->decode($this->next_line());
        }

        $line = $this->next_line();
        if ($line === false) {
            $this->current = false;
        } else {
            $this->current = $this->decode($line);
            $this->index++;
        }
        return $this->current;
    }

    /**
     * Restarts the iteration: rewinds the stream, reads the headers again and
     * positions the reader on the first row.
     */
    public function rewind()
    {
        $stream = $this->stream();
        $stream->rewind();
        $line = $stream->current();
        if ($this->is_blank($line)) {
            $line = $this->next_line();
        }
        // Forget previous headers before decoding, otherwise the header line
        // would be mapped through the old headers and every later row would
        // end up with numeric keys on a second iteration.
        $this->headers = array();
        $this->headers = $this->decode($line);
        $this->index = -1;
        $this->next();
    }

    /**
     * @return bool true while current() holds a row
     */
    public function valid()
    {
        return $this->current !== false;
    }

    /**
     * Gives the copy its own stream so that it does not disturb the position
     * of the reader it was cloned from. The copy starts in its initial state;
     * iterate it with foreach or call rewind() first.
     */
    function __clone()
    {
        if (is_object($this->stream)) {
            $this->stream = clone $this->stream;
        }
        $this->current = false;
        $this->index = -1;
        $this->headers = array();
    }

}
||||
@ -0,0 +1,93 @@ |
||||
<?php |
||||
|
||||
/** |
||||
* Write array data to a stream in CSV format. Usage: |
||||
* |
||||
* $writer = CsvWriter::create('path'); |
||||
* |
||||
* $writer->put($headers); |
||||
* $writer->put($line_1); |
||||
* $writer->put($line_2); |
||||
* |
||||
* @copyright (c) 2012 University of Geneva |
||||
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html |
||||
* @author Laurent Opprecht <laurent@opprecht.info> |
||||
*/ |
||||
class CsvWriter
{

    /**
     * Factory method, equivalent to calling the constructor.
     *
     * @param string|object $stream     path of the target file or an object exposing writeln() and close()
     * @param string        $delimiter  field delimiter, only the first byte is used
     * @param string        $enclosure  field enclosure, only the first byte is used
     * @return CsvWriter
     */
    static function create($stream, $delimiter = ';', $enclosure = '"')
    {
        return new self($stream, $delimiter, $enclosure);
    }

    protected $stream = null;
    protected $delimiter = '';
    protected $enclosure = '';

    /**
     * @param string|object $stream     path of the target file or a writer object
     * @param string        $delimiter  field delimiter, falls back to ';' when empty
     * @param string        $enclosure  field enclosure, falls back to '"' when empty
     */
    function __construct($stream, $delimiter = ';', $enclosure = '"')
    {
        $this->stream = $stream;

        $this->delimiter = ';';
        if ($delimiter) {
            $this->delimiter = substr($delimiter, 0, 1);
        }

        $this->enclosure = '"';
        if ($enclosure) {
            $this->enclosure = substr($enclosure, 0, 1);
        }
    }

    /**
     * @return string the one character field delimiter
     */
    function get_delimiter()
    {
        return $this->delimiter;
    }

    /**
     * @return string the one character field enclosure
     */
    function get_enclosure()
    {
        return $this->enclosure;
    }

    /**
     * Returns the underlying writer. A path given as a string is turned into
     * a FileWriter on first access.
     *
     * @return FileWriter
     */
    protected function stream()
    {
        if (!is_string($this->stream)) {
            return $this->stream;
        }
        $this->stream = new FileWriter($this->stream);
        return $this->stream;
    }

    /**
     * Alias of put().
     *
     * @param array $items
     */
    function write($items)
    {
        $this->put($items);
    }

    /**
     * Alias of put().
     *
     * @param array $items
     */
    function writeln($items)
    {
        $this->put($items);
    }

    /**
     * Writes one CSV line. Every field is wrapped in the enclosure and any
     * enclosure character found inside a field is doubled.
     *
     * @param array $items the fields of the line
     */
    function put($items)
    {
        $quote = $this->enclosure;
        $doubled_quote = $quote . $quote;

        $line = '';
        $is_first = true;
        foreach ($items as $item) {
            if (!$is_first) {
                $line .= $this->delimiter;
            }
            $line .= $quote . str_replace($quote, $doubled_quote, $item) . $quote;
            $is_first = false;
        }

        $this->stream()->writeln($line);
    }

    /**
     * Closes the underlying writer, if one was created, and drops it.
     */
    function close()
    {
        if (is_object($this->stream)) {
            $this->stream->close();
        }
        $this->stream = null;
    }

}
||||
@ -0,0 +1,182 @@ |
||||
<?php |
||||
|
||||
/** |
||||
* Read text from a file. Reader is line oriented and not char oriented. |
||||
* The default converter converts from the file encoding - auto-detected - to |
||||
* system encoding. |
||||
* |
||||
* Usage: |
||||
* |
||||
* $file = FileReader::create('path'); |
||||
* foreach($file as $line) |
||||
* { |
||||
* ... |
||||
* } |
||||
* |
||||
* @copyright (c) 2012 University of Geneva |
||||
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html |
||||
* @author Laurent Opprecht <laurent@opprecht.info> |
||||
*/ |
||||
class FileReader implements Iterator
{

    const EOL = "\n";

    /**
     * Factory method, equivalent to calling the constructor.
     *
     * @param string $path       path of the file to read
     * @param object $converter  object exposing convert($text); when empty a decoder
     *                           is built from the detected file encoding
     * @return FileReader
     */
    static function create($path, $converter = null)
    {
        return new self($path, $converter);
    }

    /**
     * Returns the file encoding
     *
     * The detection is run on the first 200 lines of the file at most. Whole
     * lines are read from the raw file - rather than a fixed number of bytes -
     * so that a multi-byte character is never cut in the middle.
     *
     * @param string $path
     * @return Encoding
     */
    static function detect_encoding($path)
    {
        $abstract = '';
        $handle = fopen($path, 'r');
        // fopen() returns false on failure: do not hand that over to fgets()
        // and fclose(), run the detection on an empty abstract instead.
        if ($handle !== false) {
            for ($i = 0; $i < 200; $i++) {
                $line = fgets($handle);
                if ($line === false) {
                    break;
                }
                $abstract .= $line;
            }
            fclose($handle);
        }
        return Encoding::detect_encoding($abstract);
    }

    protected $path = '';
    protected $handle = null;
    protected $current = false;
    protected $index = -1;
    protected $converter = null;

    /**
     * @param string $path       path of the file to read
     * @param object $converter  object exposing convert($text); when empty a decoder
     *                           is built from the detected file encoding
     */
    function __construct($path, $converter = null)
    {
        if (empty($converter)) {
            $encoding = self::detect_encoding($path);
            $converter = $encoding->decoder();
        }
        $this->path = $path;
        $this->converter = $converter;
    }

    /**
     *
     * @return Converter
     */
    function get_converter()
    {
        return $this->converter;
    }

    /**
     * Returns the file handle, opening the file for reading on first access.
     *
     * @return resource|bool false when the file could not be opened
     */
    function handle()
    {
        if (is_null($this->handle)) {
            $this->handle = fopen($this->path, 'r');
        }
        return $this->handle;
    }

    /**
     * Read at most $count lines.
     *
     * Note that the file is read from its beginning: iterating rewinds the
     * reader.
     *
     * @param int $count
     * @return array the lines read, an empty array when there is none
     */
    function read_lines($count)
    {
        $result = array();
        $i = 0;
        foreach ($this as $line) {
            if ($i >= $count) {
                return $result;
            }
            $i++;
            $result[] = $line;
        }
        return $result;
    }

    /**
     * Reads the next line. Alias of next().
     *
     * @return string|bool false at the end of the file
     */
    function read_line()
    {
        return $this->next();
    }

    /**
     * Closes the file if it is open. The file is reopened on next access.
     */
    function close()
    {
        if (is_resource($this->handle)) {
            fclose($this->handle);
        }
        $this->handle = null;
    }

    /**
     * Runs a line through the converter.
     *
     * @param string $text
     * @return string
     */
    protected function convert($text)
    {
        return $this->converter->convert($text);
    }

    /**
     * Resets the converter state when the converter has one. Converters are
     * only required to expose convert(), so reset() is called only if present.
     */
    protected function reset_converter()
    {
        if (is_object($this->converter) && method_exists($this->converter, 'reset')) {
            $this->converter->reset();
        }
    }

    /**
     * @return string|bool the current line, false when there is none
     */
    public function current()
    {
        return $this->current;
    }

    /**
     * @return int zero based index of the current line
     */
    public function key()
    {
        return $this->index;
    }

    /**
     * Reads the next line, strips its trailing line break and converts it.
     *
     * @return string|bool the line read, false at the end of the file or
     *                     when the file could not be opened
     */
    public function next()
    {
        $handle = $this->handle();
        if ($handle === false) {
            $this->current = false;
            return false;
        }
        $line = fgets($handle);
        if ($line !== false) {
            $line = rtrim($line, "\r\n");
            $line = $this->convert($line);
            $this->index++;
        }
        $this->current = $line;
        return $this->current;
    }

    /**
     * Goes back to the beginning of the file and reads the first line.
     */
    public function rewind()
    {
        $this->reset_converter();
        if ($handle = $this->handle()) {
            rewind($handle);
        }
        $this->current = false;
        $this->index = -1;
        $this->next();
    }

    /**
     * @return bool true while current() holds a line
     */
    public function valid()
    {
        return $this->current !== false;
    }

    /**
     * The copy gets its own - not yet opened - file handle and its own
     * converter so that reading from it does not disturb the original reader.
     */
    function __clone()
    {
        $this->handle = null;
        $this->current = false;
        $this->index = -1;
        if (is_object($this->converter)) {
            $this->converter = clone $this->converter;
        }
        $this->reset_converter();
    }

}
||||
@ -0,0 +1,81 @@ |
||||
<?php |
||||
|
||||
/** |
||||
* Write data to file. Default to UTF8 encoding. |
||||
* |
||||
* @copyright (c) 2012 University of Geneva |
||||
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html |
||||
* @author Laurent Opprecht <laurent@opprecht.info> |
||||
*/ |
||||
class FileWriter
{

    /**
     * Factory method, equivalent to calling the constructor.
     *
     * @param string $path
     * @param Converter $converter
     * @return FileWriter
     */
    static function create($path, $converter = null)
    {
        return new self($path, $converter);
    }

    const EOL = "\n";

    protected $path = '';
    protected $handle = null;
    protected $converter = null;

    /**
     * @param string    $path       path of the file to write to
     * @param Converter $converter  applied to every text written; when empty
     *                              the UTF8 encoder is used
     */
    function __construct($path, $converter = null)
    {
        if (!$converter) {
            $converter = Encoding::utf8()->encoder();
        }
        $this->path = $path;
        $this->converter = $converter;
    }

    /**
     *
     * @return Converter
     */
    function get_converter()
    {
        return $this->converter;
    }

    /**
     * Returns the file handle. The file is opened in append mode ('a+') on
     * first access.
     *
     * @return resource|bool
     */
    protected function handle()
    {
        if ($this->handle !== null) {
            return $this->handle;
        }
        $this->handle = fopen($this->path, 'a+');
        return $this->handle;
    }

    /**
     * Converts and writes $text as is.
     *
     * @param string $text
     */
    function write($text)
    {
        $converted = $this->convert($text);
        fwrite($this->handle(), $converted);
    }

    /**
     * Converts and writes $text followed by an end of line.
     *
     * @param string $text
     */
    function writeln($text)
    {
        $converted = $this->convert($text);
        fwrite($this->handle(), $converted . self::EOL);
    }

    /**
     * Closes the file if it is open. The file is reopened on next write.
     */
    function close()
    {
        $handle = $this->handle;
        $this->handle = null;
        if (is_resource($handle)) {
            fclose($handle);
        }
    }

    /**
     * Runs a text through the converter.
     *
     * @param string $text
     * @return string
     */
    protected function convert($text)
    {
        return $this->converter->convert($text);
    }

}
||||
@ -0,0 +1,33 @@ |
||||
<?php |
||||
|
||||
/** |
||||
* Convert text. Used mostly to convert from one encoding to another. |
||||
* |
||||
* @copyright (c) 2012 University of Geneva |
||||
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html |
||||
* @author Laurent Opprecht <laurent@opprecht.info> |
||||
*/ |
||||
class Converter
{

    /**
     * Identity converter. Returns the string with no transformations.
     *
     * The same instance is returned on every call.
     *
     * @return Converter
     */
    public static function identity()
    {
        static $result = null;
        if ($result === null) {
            $result = new self();
        }
        return $result;
    }

    /**
     * Brings the converter back to its initial state.
     *
     * The base converter keeps no state, so there is nothing to do. The
     * method exists so that reset() can be called on any converter, whether
     * or not it is stateful.
     */
    function reset()
    {
    }

    /**
     * Converts a string. The base implementation returns it unchanged.
     *
     * @param string $string
     * @return string
     */
    function convert($string)
    {
        return $string;
    }
}
||||
@ -0,0 +1,158 @@ |
||||
<?php |
||||
|
||||
/** |
||||
* Set the system encoding to the platform encoding. |
||||
* |
||||
* @todo: |
||||
* Note: those lines are here for ease of use only. They should be moved away: |
||||
* |
||||
* 1 first autodetection should be done inside the Encoding class |
||||
* 2 this library should not call a chamilo specific function (this should |
||||
* be the other way around, chamilo calling the encoding functions) |
||||
*/ |
||||
|
||||
// Make the platform encoding the default ("system") encoding of the library.
$plateform_encoding = api_get_system_encoding();
Encoding::system($plateform_encoding);
||||
|
||||
/** |
||||
* Encoding class. Handles text encoding. Usage: |
||||
* |
||||
* $encoding = Encoding::get('name'); |
||||
* $decoder = $encoding->decoder(); |
||||
* $decoder->convert('text'); |
||||
* |
||||
* The system encoding is the platform/system/default encoding. This defaults to |
||||
* UTF8 but can be changed: |
||||
* |
||||
* Encoding::system('name'); |
||||
* |
||||
* Note that Encoding returns to its name when converted to a string. As such it |
||||
* can be used in places where a string is expected: |
||||
* |
||||
* $utf8 = Encoding::Utf8(); |
||||
* echo $utf8; |
||||
* |
||||
* @copyright (c) 2012 University of Geneva |
||||
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html |
||||
* @author Laurent Opprecht <laurent@opprecht.info> |
||||
*/ |
||||
class Encoding
{

    private static $system = null;

    /**
     * Returns encoding for $name.
     *
     * An object is returned as is; a name matching UTF8 yields the Utf8
     * encoding; any other name yields a new Encoding carrying that name.
     *
     * @param string|object $name
     * @return Encoding
     */
    public static function get($name)
    {
        if (is_object($name)) {
            return $name;
        }
        if (Encoding::utf8()->is($name)) {
            return self::utf8();
        }
        return new self($name);
    }

    /**
     * Returns the Utf8 encoding.
     *
     * @return Utf8
     */
    public static function utf8()
    {
        return Utf8::instance();
    }

    /**
     * Returns/set the system/default encoding.
     *
     * Called with an object or a name, stores it as the system encoding.
     * Returns UTF8 as long as no system encoding has been set.
     *
     * @param string|object|null $value
     * @return Encoding
     */
    public static function system($value = null)
    {
        if (is_object($value)) {
            self::$system = $value;
        } elseif (is_string($value)) {
            self::$system = self::get($value);
        }

        if (self::$system) {
            return self::$system;
        }
        return self::utf8();
    }

    /**
     * Detect encoding from an abstract.
     *
     * @param string $abstract sample of the text to analyse
     * @return Encoding
     */
    public static function detect_encoding($abstract)
    {
        return self::get(api_detect_encoding($abstract));
    }

    protected $name = '';

    /**
     * Not public: instances are obtained through get().
     *
     * @param string $name
     */
    protected function __construct($name = '')
    {
        $this->name = $name;
    }

    /**
     * The name of the encoding
     *
     * @return string
     */
    function name()
    {
        return $this->name;
    }

    /**
     * The Byte Order Mark. Empty for the base encoding.
     *
     * @see http://en.wikipedia.org/wiki/Byte_order_mark
     * @return string
     */
    function bom()
    {
        return '';
    }

    /**
     * Returns a decoder that converts from this encoding to another encoding.
     *
     * @param string|Encoding $to Encoding to convert to, defaults to system encoding
     * @return Converter
     */
    public function decoder($to = null)
    {
        if (!$to) {
            $to = Encoding::system();
        }
        return EncodingConverter::create($this, $to);
    }

    /**
     * Returns an encoder that converts from another encoding to this encoding.
     *
     * @param string|Encoding $from Encoding to convert from, defaults to system encoding.
     * @return Converter
     */
    public function encoder($from = null)
    {
        if (!$from) {
            $from = Encoding::system();
        }
        return EncodingConverter::create($from, $this);
    }

    /**
     * An encoding converts to its name when used as a string.
     *
     * @return string
     */
    function __toString()
    {
        return $this->name();
    }

}
||||
@ -0,0 +1,66 @@ |
||||
<?php |
||||
|
||||
/** |
||||
* Convert text from one encoding to another. Usage: |
||||
* |
||||
* $converter = EncodingConverter::create($from, $to); |
||||
* $converter->convert($text); |
||||
* |
||||
* Note that the create function will return an identity converter if from and to |
||||
* encodings are the same. Reason why the constructor is private. |
||||
* |
||||
* @copyright (c) 2012 University of Geneva |
||||
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html |
||||
* @author Laurent Opprecht <laurent@opprecht.info> |
||||
*/ |
||||
class EncodingConverter extends Converter
{

    /**
     * Returns a converter from one encoding to another. When both encodings
     * have the same name - case ignored - the identity converter is returned.
     *
     * @param string|Encoding $from_encoding
     * @param string|Encoding $to_encoding
     *
     * @return Converter
     */
    public static function create($from_encoding, $to_encoding)
    {
        $from_encoding = (string) $from_encoding;
        $to_encoding = (string) $to_encoding;
        if (strtolower($from_encoding) == strtolower($to_encoding)) {
            return Converter::identity();
        }
        // The new converter has to be returned, otherwise callers get null.
        return new self($from_encoding, $to_encoding);
    }

    protected $from_encoding;
    protected $to_encoding;

    /**
     * Not public: instances are obtained through create().
     *
     * @param string $from_encoding
     * @param string $to_encoding
     */
    protected function __construct($from_encoding, $to_encoding)
    {
        $this->from_encoding = $from_encoding;
        $this->to_encoding = $to_encoding;
    }

    /**
     * @return string name of the encoding converted from
     */
    function from_encoding()
    {
        return $this->from_encoding;
    }

    /**
     * @return string name of the encoding converted to
     */
    function to_encoding()
    {
        return $this->to_encoding;
    }

    /**
     * Converts $string from the source encoding to the target encoding.
     *
     * @param string $string
     * @return string the converted string
     */
    function convert($string)
    {
        $from = $this->from_encoding;
        $to = $this->to_encoding;
        if ($from == $to) {
            return $string;
        }
        return api_convert_encoding($string, $to, $from);
    }

}
||||
@ -0,0 +1,287 @@ |
||||
<?php |
||||
|
||||
/** |
||||
* Utf8 encoding class. Provides utility function to deal with UTF8 encoding. |
||||
* |
||||
* @license see /license.txt |
||||
* @author Laurent Opprecht <laurent@opprecht.info> for the University of Geneva |
||||
* @author More authors, mentioned in the corresponding fragments of this source. |
||||
*/ |
||||
class Utf8 extends Encoding
{

    const PATTERN_NOT_VISIBLE_CHARS = '/[^[:print:]-]/'; //Visible characters and the space character

    /**
     * @see http://en.wikipedia.org/wiki/Byte_order_mark
     */
    const BOM = "\xEF\xBB\xBF";
    const NAME = 'UTF-8';

    /**
     * Returns the unique instance of the UTF8 encoding.
     *
     * @return Utf8
     */
    public static function instance()
    {
        static $result = null;
        if (empty($result)) {
            $result = new self();
        }
        return $result;
    }

    /**
     * Returns true if encoding is UTF8. The comparison is made on the name,
     * case ignored.
     *
     * @param string|Encoding $encoding
     * @return bool
     */
    function is($encoding)
    {
        $encoding = (string) $encoding;
        return strtolower($encoding) == strtolower(self::NAME);
    }

    protected function __construct()
    {
        parent::__construct(self::NAME);
    }

    /**
     * @return string the name of the encoding
     */
    function name()
    {
        return self::NAME;
    }

    /**
     * @return string the UTF8 Byte Order Mark
     */
    function bom()
    {
        return self::BOM;
    }

    /**
     * Returns the hexadecimal representation of an utf8 string. Useful to understand
     * what is going on - not printable chars, rare patterns such as e' for é, etc.
     *
     * The bytes of each character are written as two lowercase hexadecimal
     * digits and every character is followed by a space.
     *
     * @param string $text
     * @return string
     */
    function to_hex($text)
    {
        $result = '';
        // The encoding is passed explicitly to the mb_* calls instead of
        // changing the process wide mb_internal_encoding(): a diagnostic
        // helper must not alter how the rest of the application handles text.
        for ($i = 0, $n = mb_strlen($text, self::NAME); $i < $n; $i++) {
            $char = mb_substr($text, $i, 1, self::NAME);
            $num = strlen($char);
            for ($j = 0; $j < $num; $j++) {
                $result .= sprintf('%02x', ord($char[$j]));
            }
            $result .= ' ';
        }
        return $result;
    }

    /**
     * Trim the BOM from an utf-8 string
     *
     * @param string $text
     * @return string the text without its leading BOM, unchanged if it has none
     */
    function trim($text)
    {
        $bom = self::BOM;
        $bom_length = strlen($bom);
        if (strlen($text) < $bom_length) {
            return $text;
        }

        if (substr($text, 0, $bom_length) == $bom) {
            return substr($text, $bom_length);
        }
        return $text;
    }

    /**
     * Checks a string for UTF-8 validity.
     *
     * The check is structural: every character must start with a valid lead
     * byte followed by the number of continuation bytes (10xxxxxx) announced
     * by that lead byte. Sequences of up to six bytes are accepted.
     *
     * mb_detect_encoding() is deliberately not used. Ivan Tcholakov,
     * 05-OCT-2008: a string with a single cyrillic letter (single byte) was
     * wrongly detected as UTF-8 by it.
     *
     * @param string $string The string to be tested.
     * @return bool Returns TRUE when the tested string is valid UTF-8, FALSE otherwise.
     * @link http://en.wikipedia.org/wiki/UTF-8
     * @author see internationalization.lib.php
     */
    static function is_valid(&$string)
    {
        $str = (string) $string;
        $len = api_byte_count($str);
        $i = 0;
        while ($i < $len) {
            // Here the current character begins. Its size is determined by
            // the senior bits in the first byte.
            $byte1 = ord($str[$i++]);

            if (($byte1 & 0x80) == 0x00) {          // 0xxxxxxx
                $trailing = 0;
            } elseif (($byte1 & 0xE0) == 0xC0) {    // 110xxxxx + 1 byte
                $trailing = 1;
            } elseif (($byte1 & 0xF0) == 0xE0) {    // 1110xxxx + 2 bytes
                $trailing = 2;
            } elseif (($byte1 & 0xF8) == 0xF0) {    // 11110xxx + 3 bytes
                $trailing = 3;
            } elseif (($byte1 & 0xFC) == 0xF8) {    // 111110xx + 4 bytes
                $trailing = 4;
            } elseif (($byte1 & 0xFE) == 0xFC) {    // 1111110x + 5 bytes
                $trailing = 5;
            } else {
                return false; // In any other case the character is invalid.
            }

            for ($k = 0; $k < $trailing; $k++) {
                if ($i == $len) {
                    return false; // Here the string ends unexpectedly.
                }
                if ((ord($str[$i++]) & 0xC0) != 0x80) {
                    return false; // Invalid continuation byte, invalid string.
                }
            }
            // Here the current character is valid, the next character is to
            // be examined.
        }
        return true; // Empty strings are valid too.
    }

    /**
     * Returns a decoder that converts from UTF8 to another encoding.
     *
     * @param string|Encoding $to Encoding to convert to, defaults to system encoding
     * @return Utf8Decoder
     */
    public function decoder($to = null)
    {
        $to = $to ? $to : Encoding::system();
        return new Utf8Decoder($to);
    }

    /**
     * Returns an encoder that converts from another encoding to UTF8.
     *
     * @param string|Encoding $from Encoding to convert from, defaults to system encoding
     * @return Utf8Encoder
     */
    public function encoder($from = null)
    {
        $from = $from ? $from : Encoding::system();
        return new Utf8Encoder($from);
    }

}
||||
@ -0,0 +1,54 @@ |
||||
<?php |
||||
|
||||
/** |
||||
* Convert from Utf8 to another encoding: |
||||
* |
||||
* - remove BOM |
||||
* - change encoding |
||||
* |
||||
* @copyright (c) 2012 University of Geneva |
||||
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html |
||||
* @author Laurent Opprecht <laurent@opprecht.info> |
||||
*/ |
||||
class Utf8Decoder extends Converter
{

    protected $started = false;
    protected $to_encoding;
    protected $encoding_converter;

    /**
     * @param string|Encoding $to_encoding Encoding to convert to, defaults to system encoding
     */
    function __construct($to_encoding = null)
    {
        if (!$to_encoding) {
            $to_encoding = Encoding::system();
        }
        $this->to_encoding = $to_encoding;
        $this->encoding_converter = EncodingConverter::create(Utf8::NAME, $this->to_encoding);
        $this->reset();
    }

    /**
     * @return string always UTF8
     */
    function from_encoding()
    {
        return Utf8::NAME;
    }

    /**
     * @return string|Encoding the encoding converted to
     */
    function to_encoding()
    {
        return $this->to_encoding;
    }

    /**
     * Forgets that text has already been converted: the next call to
     * convert() is treated as the beginning of the text again.
     */
    function reset()
    {
        $this->started = false;
    }

    /**
     * Converts $string from UTF8. The BOM, if any, is removed from the first
     * string converted after construction or reset().
     *
     * @param string $string
     * @return string
     */
    function convert($string)
    {
        $is_first_chunk = !$this->started;
        if ($is_first_chunk) {
            $this->started = true;
            $string = Utf8::instance()->trim($string);
        }
        return $this->encoding_converter->convert($string);
    }

}
||||
@ -0,0 +1,72 @@ |
||||
<?php |
||||
|
||||
/** |
||||
* Encode from another encoding to UTF8: |
||||
* |
||||
* - add BOM |
||||
* - change encoding |
||||
* - convert html entities if turned on |
||||
* |
||||
* Note: |
||||
* |
||||
* Convert_html_entities cannot be turned on by default. This would be bad |
||||
* for performances but more than anything else it may be perfectly valid to write |
||||
* html entities without transformation - i.e. when writing html content. |
||||
* |
||||
* It may be better to move convert_html_entities to its own converter and to chain |
||||
* converters together to achieve the same result. |
||||
* |
||||
* @copyright (c) 2012 University of Geneva |
||||
* @license GNU General Public License - http://www.gnu.org/copyleft/gpl.html |
||||
* @author Laurent Opprecht <laurent@opprecht.info> |
||||
*/ |
||||
class Utf8Encoder extends Converter
{

    protected $started = false;
    protected $from_encoding;
    protected $encoding_converter;
    protected $convert_html_entities = false;

    /**
     * @param string|Encoding $from_encoding         Encoding to convert from, defaults to system encoding
     * @param bool            $convert_html_entities whether html entities are replaced by their character
     */
    function __construct($from_encoding = null, $convert_html_entities = false)
    {
        $this->from_encoding = $from_encoding ? $from_encoding : Encoding::system();
        $this->encoding_converter = EncodingConverter::create($this->from_encoding, Utf8::NAME);
        $this->convert_html_entities = $convert_html_entities;
        $this->reset();
    }

    /**
     * @return string|Encoding the encoding converted from
     */
    function from_encoding()
    {
        return $this->from_encoding;
    }

    /**
     * @return string always UTF8
     */
    function to_encoding()
    {
        return Utf8::NAME;
    }

    /**
     * @return bool whether html entities are replaced by their character
     */
    function get_convert_html_entities()
    {
        return $this->convert_html_entities;
    }

    /**
     * Forgets that text has already been converted: the BOM is written again
     * in front of the next string converted.
     */
    function reset()
    {
        $this->started = false;
    }

    /**
     * Converts $string to UTF8. The BOM is added in front of the first
     * string converted after construction or reset().
     *
     * @param string $string
     * @return string
     */
    function convert($string)
    {
        // Change the encoding first. Entities are decoded to UTF8 characters,
        // so decoding them while the text is still in the source encoding
        // would have those characters re-encoded - and corrupted - by the
        // conversion that follows.
        $string = $this->encoding_converter->convert($string);
        if ($this->convert_html_entities) {
            $string = html_entity_decode($string, ENT_COMPAT, Utf8::NAME);
        }
        if (!$this->started) {
            $this->started = true;
            $string = Utf8::BOM . $string;
        }
        return $string;
    }

}
||||
Loading…
Reference in new issue