I have committed many support files for CREATE CONVERSION. Default
conversion procs and conversions are added in initdb. Currently
supported conversions are:
UTF-8(UNICODE) <--> SQL_ASCII, ISO-8859-1 to 16, EUC_JP, EUC_KR,
EUC_CN, EUC_TW, SJIS, BIG5, GBK, GB18030, UHC,
JOHAB, TCVN
EUC_JP <--> SJIS
EUC_TW <--> BIG5
MULE_INTERNAL <--> EUC_JP, SJIS, EUC_TW, BIG5
Note that initial contents of pg_conversion system catalog are created
in the initdb process. So doing initdb required is ideal, it's
possible to add them to your databases by hand, however. To accomplish
this:
psql -f your_postgresql_install_path/share/conversion_create.sql your_database
So I did not bump up the version in cataversion.h.
TODO:
Add more conversion procs
Add [CASCADE|RESTRICT] to DROP CONVERSION
Add tuples to pg_depend
Add regression tests
Write docs
Add SQL99 CONVERT command?
--
Tatsuo Ishii
23 years ago
|
|
|
/*-------------------------------------------------------------------------
|
|
|
|
*
|
I have committed many support files for CREATE CONVERSION. Default
conversion procs and conversions are added in initdb. Currently
supported conversions are:
UTF-8(UNICODE) <--> SQL_ASCII, ISO-8859-1 to 16, EUC_JP, EUC_KR,
EUC_CN, EUC_TW, SJIS, BIG5, GBK, GB18030, UHC,
JOHAB, TCVN
EUC_JP <--> SJIS
EUC_TW <--> BIG5
MULE_INTERNAL <--> EUC_JP, SJIS, EUC_TW, BIG5
Note that initial contents of pg_conversion system catalog are created
in the initdb process. So doing initdb required is ideal, it's
possible to add them to your databases by hand, however. To accomplish
this:
psql -f your_postgresql_install_path/share/conversion_create.sql your_database
So I did not bump up the version in cataversion.h.
TODO:
Add more conversion procs
Add [CASCADE|RESTRICT] to DROP CONVERSION
Add tuples to pg_depend
Add regression tests
Write docs
Add SQL99 CONVERT command?
--
Tatsuo Ishii
23 years ago
|
|
|
* Utility functions for conversion procs.
|
|
|
|
*
|
|
|
|
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
|
I have committed many support files for CREATE CONVERSION. Default
conversion procs and conversions are added in initdb. Currently
supported conversions are:
UTF-8(UNICODE) <--> SQL_ASCII, ISO-8859-1 to 16, EUC_JP, EUC_KR,
EUC_CN, EUC_TW, SJIS, BIG5, GBK, GB18030, UHC,
JOHAB, TCVN
EUC_JP <--> SJIS
EUC_TW <--> BIG5
MULE_INTERNAL <--> EUC_JP, SJIS, EUC_TW, BIG5
Note that initial contents of pg_conversion system catalog are created
in the initdb process. So doing initdb required is ideal, it's
possible to add them to your databases by hand, however. To accomplish
this:
psql -f your_postgresql_install_path/share/conversion_create.sql your_database
So I did not bump up the version in cataversion.h.
TODO:
Add more conversion procs
Add [CASCADE|RESTRICT] to DROP CONVERSION
Add tuples to pg_depend
Add regression tests
Write docs
Add SQL99 CONVERT command?
--
Tatsuo Ishii
23 years ago
|
|
|
* Portions Copyright (c) 1994, Regents of the University of California
|
|
|
|
*
|
I have committed many support files for CREATE CONVERSION. Default
conversion procs and conversions are added in initdb. Currently
supported conversions are:
UTF-8(UNICODE) <--> SQL_ASCII, ISO-8859-1 to 16, EUC_JP, EUC_KR,
EUC_CN, EUC_TW, SJIS, BIG5, GBK, GB18030, UHC,
JOHAB, TCVN
EUC_JP <--> SJIS
EUC_TW <--> BIG5
MULE_INTERNAL <--> EUC_JP, SJIS, EUC_TW, BIG5
Note that initial contents of pg_conversion system catalog are created
in the initdb process. So doing initdb required is ideal, it's
possible to add them to your databases by hand, however. To accomplish
this:
psql -f your_postgresql_install_path/share/conversion_create.sql your_database
So I did not bump up the version in cataversion.h.
TODO:
Add more conversion procs
Add [CASCADE|RESTRICT] to DROP CONVERSION
Add tuples to pg_depend
Add regression tests
Write docs
Add SQL99 CONVERT command?
--
Tatsuo Ishii
23 years ago
|
|
|
* IDENTIFICATION
|
|
|
|
* $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.68 2010/01/02 16:57:56 momjian Exp $
|
|
|
|
*
|
I have committed many support files for CREATE CONVERSION. Default
conversion procs and conversions are added in initdb. Currently
supported conversions are:
UTF-8(UNICODE) <--> SQL_ASCII, ISO-8859-1 to 16, EUC_JP, EUC_KR,
EUC_CN, EUC_TW, SJIS, BIG5, GBK, GB18030, UHC,
JOHAB, TCVN
EUC_JP <--> SJIS
EUC_TW <--> BIG5
MULE_INTERNAL <--> EUC_JP, SJIS, EUC_TW, BIG5
Note that initial contents of pg_conversion system catalog are created
in the initdb process. So doing initdb required is ideal, it's
possible to add them to your databases by hand, however. To accomplish
this:
psql -f your_postgresql_install_path/share/conversion_create.sql your_database
So I did not bump up the version in cataversion.h.
TODO:
Add more conversion procs
Add [CASCADE|RESTRICT] to DROP CONVERSION
Add tuples to pg_depend
Add regression tests
Write docs
Add SQL99 CONVERT command?
--
Tatsuo Ishii
23 years ago
|
|
|
*-------------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
#include "postgres.h"
|
|
|
|
#include "mb/pg_wchar.h"
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* LATINn ---> MIC when the charset's local codes map directly to MIC
|
|
|
|
*
|
|
|
|
* l points to the source string of length len
|
|
|
|
* p is the output area (must be large enough!)
|
|
|
|
* lc is the mule character set id for the local encoding
|
|
|
|
* encoding is the PG identifier for the local encoding
|
|
|
|
*/
|
|
|
|
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	/* Walk the input one byte at a time; every input byte is one character. */
	for (; len > 0; l++, len--)
	{
		int			ch = *l;

		/* A zero byte inside the counted input is reported as invalid. */
		if (ch == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		/* High-bit bytes are emitted as the pair (lc, byte); ASCII as-is. */
		if (IS_HIGHBIT_SET(ch))
			*p++ = lc;
		*p++ = ch;
	}

	/* Terminate the output string. */
	*p = '\0';
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* MIC ---> LATINn when the charset's local codes map directly to MIC
|
|
|
|
*
|
|
|
|
* mic points to the source string of length len
|
|
|
|
* p is the output area (must be large enough!)
|
|
|
|
* lc is the mule character set id for the local encoding
|
|
|
|
* encoding is the PG identifier for the local encoding
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
mic2latin(const unsigned char *mic, unsigned char *p, int len,
|
|
|
|
int lc, int encoding)
|
|
|
|
{
|
|
|
|
int c1;
|
|
|
|
|
|
|
|
while (len > 0)
|
|
|
|
{
|
|
|
|
c1 = *mic;
|
|
|
|
if (c1 == 0)
|
|
|
|
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
|
|
|
|
if (!IS_HIGHBIT_SET(c1))
|
|
|
|
{
|
|
|
|
/* easy for ASCII */
|
|
|
|
*p++ = c1;
|
|
|
|
mic++;
|
|
|
|
len--;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
int l = pg_mic_mblen(mic);
|
|
|
|
|
|
|
|
if (len < l)
|
|
|
|
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
|
|
|
|
len);
|
|
|
|
if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
|
|
|
|
report_untranslatable_char(PG_MULE_INTERNAL, encoding,
|
|
|
|
(const char *) mic, len);
|
|
|
|
*p++ = mic[1];
|
|
|
|
mic += 2;
|
|
|
|
len -= 2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*p = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* ASCII ---> MIC
|
|
|
|
*
|
|
|
|
* While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
|
|
|
|
* characters, here we must take a hard line because we don't know
|
|
|
|
* the appropriate MIC equivalent.
|
|
|
|
*/
|
I have committed many support files for CREATE CONVERSION. Default
conversion procs and conversions are added in initdb. Currently
supported conversions are:
UTF-8(UNICODE) <--> SQL_ASCII, ISO-8859-1 to 16, EUC_JP, EUC_KR,
EUC_CN, EUC_TW, SJIS, BIG5, GBK, GB18030, UHC,
JOHAB, TCVN
EUC_JP <--> SJIS
EUC_TW <--> BIG5
MULE_INTERNAL <--> EUC_JP, SJIS, EUC_TW, BIG5
Note that initial contents of pg_conversion system catalog are created
in the initdb process. So doing initdb required is ideal, it's
possible to add them to your databases by hand, however. To accomplish
this:
psql -f your_postgresql_install_path/share/conversion_create.sql your_database
So I did not bump up the version in cataversion.h.
TODO:
Add more conversion procs
Add [CASCADE|RESTRICT] to DROP CONVERSION
Add tuples to pg_depend
Add regression tests
Write docs
Add SQL99 CONVERT command?
--
Tatsuo Ishii
23 years ago
|
|
|
void
|
|
|
|
pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
|
|
|
|
{
|
|
|
|
int c1;
|
|
|
|
|
|
|
|
while (len > 0)
|
|
|
|
{
|
|
|
|
c1 = *l;
|
|
|
|
if (c1 == 0 || IS_HIGHBIT_SET(c1))
|
|
|
|
report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
|
|
|
|
*p++ = c1;
|
|
|
|
l++;
|
|
|
|
len--;
|
|
|
|
}
|
|
|
|
*p = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* MIC ---> ASCII
|
|
|
|
*/
|
I have committed many support files for CREATE CONVERSION. Default
conversion procs and conversions are added in initdb. Currently
supported conversions are:
UTF-8(UNICODE) <--> SQL_ASCII, ISO-8859-1 to 16, EUC_JP, EUC_KR,
EUC_CN, EUC_TW, SJIS, BIG5, GBK, GB18030, UHC,
JOHAB, TCVN
EUC_JP <--> SJIS
EUC_TW <--> BIG5
MULE_INTERNAL <--> EUC_JP, SJIS, EUC_TW, BIG5
Note that initial contents of pg_conversion system catalog are created
in the initdb process. So doing initdb required is ideal, it's
possible to add them to your databases by hand, however. To accomplish
this:
psql -f your_postgresql_install_path/share/conversion_create.sql your_database
So I did not bump up the version in cataversion.h.
TODO:
Add more conversion procs
Add [CASCADE|RESTRICT] to DROP CONVERSION
Add tuples to pg_depend
Add regression tests
Write docs
Add SQL99 CONVERT command?
--
Tatsuo Ishii
23 years ago
|
|
|
void
|
|
|
|
pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
|
|
|
|
{
|
|
|
|
int c1;
|
|
|
|
|
|
|
|
while (len > 0)
|
|
|
|
{
|
|
|
|
c1 = *mic;
|
|
|
|
if (c1 == 0 || IS_HIGHBIT_SET(c1))
|
|
|
|
report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
|
|
|
|
(const char *) mic, len);
|
|
|
|
*p++ = c1;
|
|
|
|
mic++;
|
|
|
|
len--;
|
|
|
|
}
|
|
|
|
*p = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* latin2mic_with_table: a generic single byte charset encoding
|
|
|
|
* conversion from a local charset to the mule internal code.
|
|
|
|
*
|
|
|
|
* l points to the source string of length len
|
|
|
|
* p is the output area (must be large enough!)
|
|
|
|
* lc is the mule character set id for the local encoding
|
|
|
|
* encoding is the PG identifier for the local encoding
|
|
|
|
* tab holds conversion entries for the local charset
|
|
|
|
* starting from 128 (0x80). each entry in the table
|
|
|
|
* holds the corresponding code point for the mule internal code.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
latin2mic_with_table(const unsigned char *l,
|
|
|
|
unsigned char *p,
|
|
|
|
int len,
|
|
|
|
int lc,
|
|
|
|
int encoding,
|
|
|
|
const unsigned char *tab)
|
|
|
|
{
|
|
|
|
unsigned char c1,
|
|
|
|
c2;
|
|
|
|
|
|
|
|
while (len > 0)
|
|
|
|
{
|
|
|
|
c1 = *l;
|
|
|
|
if (c1 == 0)
|
|
|
|
report_invalid_encoding(encoding, (const char *) l, len);
|
|
|
|
if (!IS_HIGHBIT_SET(c1))
|
|
|
|
*p++ = c1;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
c2 = tab[c1 - HIGHBIT];
|
|
|
|
if (c2)
|
|
|
|
{
|
|
|
|
*p++ = lc;
|
|
|
|
*p++ = c2;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
report_untranslatable_char(encoding, PG_MULE_INTERNAL,
|
|
|
|
(const char *) l, len);
|
|
|
|
}
|
|
|
|
l++;
|
|
|
|
len--;
|
|
|
|
}
|
|
|
|
*p = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* mic2latin_with_table: a generic single byte charset encoding
|
|
|
|
* conversion from the mule internal code to a local charset.
|
|
|
|
*
|
|
|
|
* mic points to the source string of length len
|
|
|
|
* p is the output area (must be large enough!)
|
|
|
|
* lc is the mule character set id for the local encoding
|
|
|
|
* encoding is the PG identifier for the local encoding
|
|
|
|
* tab holds conversion entries for the mule internal code's
|
|
|
|
* second byte, starting from 128 (0x80). each entry in the table
|
|
|
|
* holds the corresponding code point for the local charset.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
mic2latin_with_table(const unsigned char *mic,
|
|
|
|
unsigned char *p,
|
|
|
|
int len,
|
|
|
|
int lc,
|
|
|
|
int encoding,
|
|
|
|
const unsigned char *tab)
|
|
|
|
{
|
|
|
|
unsigned char c1,
|
|
|
|
c2;
|
|
|
|
|
|
|
|
while (len > 0)
|
|
|
|
{
|
|
|
|
c1 = *mic;
|
|
|
|
if (c1 == 0)
|
|
|
|
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
|
|
|
|
if (!IS_HIGHBIT_SET(c1))
|
|
|
|
{
|
|
|
|
/* easy for ASCII */
|
|
|
|
*p++ = c1;
|
|
|
|
mic++;
|
|
|
|
len--;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
int l = pg_mic_mblen(mic);
|
|
|
|
|
|
|
|
if (len < l)
|
|
|
|
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
|
|
|
|
len);
|
|
|
|
if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
|
|
|
|
(c2 = tab[mic[1] - HIGHBIT]) == 0)
|
|
|
|
{
|
|
|
|
report_untranslatable_char(PG_MULE_INTERNAL, encoding,
|
|
|
|
(const char *) mic, len);
|
|
|
|
break; /* keep compiler quiet */
|
|
|
|
}
|
|
|
|
*p++ = c2;
|
|
|
|
mic += 2;
|
|
|
|
len -= 2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
*p = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* comparison routine for bsearch()
|
|
|
|
* this routine is intended for UTF8 -> local code
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
compare1(const void *p1, const void *p2)
|
|
|
|
{
|
|
|
|
uint32 v1,
|
|
|
|
v2;
|
|
|
|
|
|
|
|
v1 = *(uint32 *) p1;
|
|
|
|
v2 = ((pg_utf_to_local *) p2)->utf;
|
|
|
|
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* comparison routine for bsearch()
|
|
|
|
* this routine is intended for local code -> UTF8
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
compare2(const void *p1, const void *p2)
|
|
|
|
{
|
|
|
|
uint32 v1,
|
|
|
|
v2;
|
|
|
|
|
|
|
|
v1 = *(uint32 *) p1;
|
|
|
|
v2 = ((pg_local_to_utf *) p2)->code;
|
|
|
|
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* comparison routine for bsearch()
|
|
|
|
* this routine is intended for combined UTF8 -> local code
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
compare3(const void *p1, const void *p2)
|
|
|
|
{
|
|
|
|
uint32 s1,
|
|
|
|
s2,
|
|
|
|
d1,
|
|
|
|
d2;
|
|
|
|
|
|
|
|
s1 = *(uint32 *) p1;
|
|
|
|
s2 = *((uint32 *) p1 + 1);
|
|
|
|
d1 = ((pg_utf_to_local_combined *) p2)->utf1;
|
|
|
|
d2 = ((pg_utf_to_local_combined *) p2)->utf2;
|
|
|
|
return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* comparison routine for bsearch()
|
|
|
|
* this routine is intended for local code -> combined UTF8
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
compare4(const void *p1, const void *p2)
|
|
|
|
{
|
|
|
|
uint32 v1,
|
|
|
|
v2;
|
|
|
|
|
|
|
|
v1 = *(uint32 *) p1;
|
|
|
|
v2 = ((pg_local_to_utf_combined *) p2)->code;
|
|
|
|
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* convert a 32-bit wide character to a multibyte stream pointed to by iso
|
|
|
|
*/
|
|
|
|
static unsigned char *
|
|
|
|
set_iso_code(unsigned char *iso, uint32 code)
|
|
|
|
{
|
|
|
|
if (code & 0xff000000)
|
|
|
|
*iso++ = code >> 24;
|
|
|
|
if (code & 0x00ff0000)
|
|
|
|
*iso++ = (code & 0x00ff0000) >> 16;
|
|
|
|
if (code & 0x0000ff00)
|
|
|
|
*iso++ = (code & 0x0000ff00) >> 8;
|
|
|
|
if (code & 0x000000ff)
|
|
|
|
*iso++ = code & 0x000000ff;
|
|
|
|
return iso;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* UTF8 ---> local code
|
|
|
|
*
|
|
|
|
* utf: input UTF8 string (need not be null-terminated).
|
|
|
|
* iso: pointer to the output area (must be large enough!)
|
|
|
|
* map: the conversion map.
|
|
|
|
* cmap: the conversion map for combined characters.
|
|
|
|
* (optional)
|
|
|
|
* size1: the size of the conversion map.
|
|
|
|
* size2: the size of the conversion map for combined characters
|
|
|
|
* (optional)
|
|
|
|
* encoding: the PG identifier for the local encoding.
|
|
|
|
* len: length of input string.
|
|
|
|
*/
|
I have committed many support files for CREATE CONVERSION. Default
conversion procs and conversions are added in initdb. Currently
supported conversions are:
UTF-8(UNICODE) <--> SQL_ASCII, ISO-8859-1 to 16, EUC_JP, EUC_KR,
EUC_CN, EUC_TW, SJIS, BIG5, GBK, GB18030, UHC,
JOHAB, TCVN
EUC_JP <--> SJIS
EUC_TW <--> BIG5
MULE_INTERNAL <--> EUC_JP, SJIS, EUC_TW, BIG5
Note that initial contents of pg_conversion system catalog are created
in the initdb process. So doing initdb required is ideal, it's
possible to add them to your databases by hand, however. To accomplish
this:
psql -f your_postgresql_install_path/share/conversion_create.sql your_database
So I did not bump up the version in cataversion.h.
TODO:
Add more conversion procs
Add [CASCADE|RESTRICT] to DROP CONVERSION
Add tuples to pg_depend
Add regression tests
Write docs
Add SQL99 CONVERT command?
--
Tatsuo Ishii
23 years ago
|
|
|
void
|
|
|
|
UtfToLocal(const unsigned char *utf, unsigned char *iso,
|
|
|
|
const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
|
|
|
|
int size1, int size2, int encoding, int len)
|
|
|
|
{
|
|
|
|
uint32 iutf;
|
|
|
|
uint32 cutf[2];
|
|
|
|
uint32 code;
|
|
|
|
pg_utf_to_local *p;
|
|
|
|
pg_utf_to_local_combined *cp;
|
|
|
|
int l;
|
|
|
|
|
|
|
|
for (; len > 0; len -= l)
|
|
|
|
{
|
|
|
|
/* "break" cases all represent errors */
|
|
|
|
if (*utf == '\0')
|
|
|
|
break;
|
|
|
|
|
|
|
|
l = pg_utf_mblen(utf);
|
|
|
|
|
|
|
|
if (len < l)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (!pg_utf8_islegal(utf, l))
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (l == 1)
|
|
|
|
{
|
|
|
|
/* ASCII case is easy */
|
|
|
|
*iso++ = *utf++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
else if (l == 2)
|
|
|
|
{
|
|
|
|
iutf = *utf++ << 8;
|
|
|
|
iutf |= *utf++;
|
|
|
|
}
|
|
|
|
else if (l == 3)
|
|
|
|
{
|
|
|
|
iutf = *utf++ << 16;
|
|
|
|
iutf |= *utf++ << 8;
|
|
|
|
iutf |= *utf++;
|
|
|
|
}
|
|
|
|
else if (l == 4)
|
|
|
|
{
|
|
|
|
iutf = *utf++ << 24;
|
|
|
|
iutf |= *utf++ << 16;
|
|
|
|
iutf |= *utf++ << 8;
|
|
|
|
iutf |= *utf++;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* first, try with combined map if possible
|
|
|
|
*/
|
|
|
|
if (cmap && len > l)
|
|
|
|
{
|
|
|
|
const unsigned char *utf_save = utf;
|
|
|
|
int len_save = len;
|
|
|
|
int l_save = l;
|
|
|
|
|
|
|
|
len -= l;
|
|
|
|
|
|
|
|
l = pg_utf_mblen(utf);
|
|
|
|
if (len < l)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (!pg_utf8_islegal(utf, l))
|
|
|
|
break;
|
|
|
|
|
|
|
|
cutf[0] = iutf;
|
|
|
|
|
|
|
|
if (l == 1)
|
|
|
|
{
|
|
|
|
if (len_save > 1)
|
|
|
|
{
|
|
|
|
p = bsearch(&cutf[0], map, size1,
|
|
|
|
sizeof(pg_utf_to_local), compare1);
|
|
|
|
if (p == NULL)
|
|
|
|
report_untranslatable_char(PG_UTF8, encoding,
|
|
|
|
(const char *) (utf_save - l_save), len_save);
|
|
|
|
iso = set_iso_code(iso, p->code);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ASCII case is easy */
|
|
|
|
*iso++ = *utf++;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
else if (l == 2)
|
|
|
|
{
|
|
|
|
iutf = *utf++ << 8;
|
|
|
|
iutf |= *utf++;
|
|
|
|
}
|
|
|
|
else if (l == 3)
|
|
|
|
{
|
|
|
|
iutf = *utf++ << 16;
|
|
|
|
iutf |= *utf++ << 8;
|
|
|
|
iutf |= *utf++;
|
|
|
|
}
|
|
|
|
else if (l == 4)
|
|
|
|
{
|
|
|
|
iutf = *utf++ << 24;
|
|
|
|
iutf |= *utf++ << 16;
|
|
|
|
iutf |= *utf++ << 8;
|
|
|
|
iutf |= *utf++;
|
|
|
|
}
|
|
|
|
|
|
|
|
cutf[1] = iutf;
|
|
|
|
cp = bsearch(cutf, cmap, size2,
|
|
|
|
sizeof(pg_utf_to_local_combined), compare3);
|
|
|
|
if (cp)
|
|
|
|
code = cp->code;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
/* not found in combined map. try with ordinary map */
|
|
|
|
p = bsearch(&cutf[0], map, size1,
|
|
|
|
sizeof(pg_utf_to_local), compare1);
|
|
|
|
if (p == NULL)
|
|
|
|
report_untranslatable_char(PG_UTF8, encoding,
|
|
|
|
(const char *) (utf_save - l_save), len_save);
|
|
|
|
iso = set_iso_code(iso, p->code);
|
|
|
|
|
|
|
|
p = bsearch(&cutf[1], map, size1,
|
|
|
|
sizeof(pg_utf_to_local), compare1);
|
|
|
|
if (p == NULL)
|
|
|
|
report_untranslatable_char(PG_UTF8, encoding,
|
|
|
|
(const char *) (utf - l), len);
|
|
|
|
code = p->code;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else /* no cmap or no remaining data */
|
|
|
|
{
|
|
|
|
p = bsearch(&iutf, map, size1,
|
|
|
|
sizeof(pg_utf_to_local), compare1);
|
|
|
|
if (p == NULL)
|
|
|
|
report_untranslatable_char(PG_UTF8, encoding,
|
|
|
|
(const char *) (utf - l), len);
|
|
|
|
code = p->code;
|
|
|
|
}
|
|
|
|
iso = set_iso_code(iso, code);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (len > 0)
|
|
|
|
report_invalid_encoding(PG_UTF8, (const char *) utf, len);
|
|
|
|
|
|
|
|
*iso = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* local code ---> UTF8
|
|
|
|
*
|
|
|
|
* iso: input local string (need not be null-terminated).
|
|
|
|
* utf: pointer to the output area (must be large enough!)
|
|
|
|
* map: the conversion map.
|
|
|
|
* cmap: the conversion map for combined characters.
|
|
|
|
* (optional)
|
|
|
|
* size1: the size of the conversion map.
|
|
|
|
* size2: the size of the conversion map for combined characters
|
|
|
|
* (optional)
|
|
|
|
* encoding: the PG identifier for the local encoding.
|
|
|
|
* len: length of input string.
|
|
|
|
*/
|
I have committed many support files for CREATE CONVERSION. Default
conversion procs and conversions are added in initdb. Currently
supported conversions are:
UTF-8(UNICODE) <--> SQL_ASCII, ISO-8859-1 to 16, EUC_JP, EUC_KR,
EUC_CN, EUC_TW, SJIS, BIG5, GBK, GB18030, UHC,
JOHAB, TCVN
EUC_JP <--> SJIS
EUC_TW <--> BIG5
MULE_INTERNAL <--> EUC_JP, SJIS, EUC_TW, BIG5
Note that initial contents of pg_conversion system catalog are created
in the initdb process. So doing initdb required is ideal, it's
possible to add them to your databases by hand, however. To accomplish
this:
psql -f your_postgresql_install_path/share/conversion_create.sql your_database
So I did not bump up the version in cataversion.h.
TODO:
Add more conversion procs
Add [CASCADE|RESTRICT] to DROP CONVERSION
Add tuples to pg_depend
Add regression tests
Write docs
Add SQL99 CONVERT command?
--
Tatsuo Ishii
23 years ago
|
|
|
void
|
|
|
|
LocalToUtf(const unsigned char *iso, unsigned char *utf,
|
|
|
|
const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap,
|
|
|
|
int size1, int size2, int encoding, int len)
|
|
|
|
{
|
|
|
|
unsigned int iiso;
|
|
|
|
int l;
|
|
|
|
pg_local_to_utf *p;
|
|
|
|
pg_local_to_utf_combined *cp;
|
|
|
|
|
Commit Karel's patch.
-------------------------------------------------------------------
Subject: Re: [PATCHES] encoding names
From: Karel Zak <zakkr@zf.jcu.cz>
To: Peter Eisentraut <peter_e@gmx.net>
Cc: pgsql-patches <pgsql-patches@postgresql.org>
Date: Fri, 31 Aug 2001 17:24:38 +0200
On Thu, Aug 30, 2001 at 01:30:40AM +0200, Peter Eisentraut wrote:
> > - convert encoding 'name' to 'id'
>
> I thought we decided not to add functions returning "new" names until we
> know exactly what the new names should be, and pending schema
Ok, the patch not to add functions.
> better
>
> ...(): encoding name too long
Fixed.
I found new bug in command/variable.c in parse_client_encoding(), nobody
probably never see this error:
if (pg_set_client_encoding(encoding))
{
elog(ERROR, "Conversion between %s and %s is not supported",
value, GetDatabaseEncodingName());
}
because pg_set_client_encoding() returns -1 for error and 0 as true.
It's fixed too.
IMHO it can be apply.
Karel
PS:
* following files are renamed:
src/utils/mb/Unicode/KOI8_to_utf8.map -->
src/utils/mb/Unicode/koi8r_to_utf8.map
src/utils/mb/Unicode/WIN_to_utf8.map -->
src/utils/mb/Unicode/win1251_to_utf8.map
src/utils/mb/Unicode/utf8_to_KOI8.map -->
src/utils/mb/Unicode/utf8_to_koi8r.map
src/utils/mb/Unicode/utf8_to_WIN.map -->
src/utils/mb/Unicode/utf8_to_win1251.map
* new file:
src/utils/mb/encname.c
* removed file:
src/utils/mb/common.c
--
Karel Zak <zakkr@zf.jcu.cz>
http://home.zf.jcu.cz/~zakkr/
C, PostgreSQL, PHP, WWW, http://docs.linux.cz, http://mape.jcu.cz
24 years ago
|
|
|
if (!PG_VALID_ENCODING(encoding))
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
|
|
errmsg("invalid encoding number: %d", encoding)));
|
|
|
|
|
|
|
|
for (; len > 0; len -= l)
|
|
|
|
{
|
|
|
|
/* "break" cases all represent errors */
|
|
|
|
if (*iso == '\0')
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (!IS_HIGHBIT_SET(*iso))
|
|
|
|
{
|
|
|
|
/* ASCII case is easy */
|
|
|
|
*utf++ = *iso++;
|
|
|
|
l = 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
l = pg_encoding_verifymb(encoding, (const char *) iso, len);
|
|
|
|
if (l < 0)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (l == 1)
|
|
|
|
iiso = *iso++;
|
|
|
|
else if (l == 2)
|
|
|
|
{
|
|
|
|
iiso = *iso++ << 8;
|
|
|
|
iiso |= *iso++;
|
|
|
|
}
|
|
|
|
else if (l == 3)
|
|
|
|
{
|
|
|
|
iiso = *iso++ << 16;
|
|
|
|
iiso |= *iso++ << 8;
|
|
|
|
iiso |= *iso++;
|
|
|
|
}
|
|
|
|
else if (l == 4)
|
|
|
|
{
|
|
|
|
iiso = *iso++ << 24;
|
|
|
|
iiso |= *iso++ << 16;
|
|
|
|
iiso |= *iso++ << 8;
|
|
|
|
iiso |= *iso++;
|
|
|
|
}
|
|
|
|
|
|
|
|
p = bsearch(&iiso, map, size1,
|
|
|
|
sizeof(pg_local_to_utf), compare2);
|
|
|
|
|
|
|
|
if (p == NULL)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* not found in the ordinary map. if there's a combined character
|
|
|
|
* map, try with it
|
|
|
|
*/
|
|
|
|
if (cmap)
|
|
|
|
{
|
|
|
|
cp = bsearch(&iiso, cmap, size2,
|
|
|
|
sizeof(pg_local_to_utf_combined), compare4);
|
|
|
|
|
|
|
|
if (cp)
|
|
|
|
{
|
|
|
|
if (cp->utf1 & 0xff000000)
|
|
|
|
*utf++ = cp->utf1 >> 24;
|
|
|
|
if (cp->utf1 & 0x00ff0000)
|
|
|
|
*utf++ = (cp->utf1 & 0x00ff0000) >> 16;
|
|
|
|
if (cp->utf1 & 0x0000ff00)
|
|
|
|
*utf++ = (cp->utf1 & 0x0000ff00) >> 8;
|
|
|
|
if (cp->utf1 & 0x000000ff)
|
|
|
|
*utf++ = cp->utf1 & 0x000000ff;
|
|
|
|
|
|
|
|
if (cp->utf2 & 0xff000000)
|
|
|
|
*utf++ = cp->utf2 >> 24;
|
|
|
|
if (cp->utf2 & 0x00ff0000)
|
|
|
|
*utf++ = (cp->utf2 & 0x00ff0000) >> 16;
|
|
|
|
if (cp->utf2 & 0x0000ff00)
|
|
|
|
*utf++ = (cp->utf2 & 0x0000ff00) >> 8;
|
|
|
|
if (cp->utf2 & 0x000000ff)
|
|
|
|
*utf++ = cp->utf2 & 0x000000ff;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
report_untranslatable_char(encoding, PG_UTF8,
|
|
|
|
(const char *) (iso - l), len);
|
|
|
|
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (p->utf & 0xff000000)
|
|
|
|
*utf++ = p->utf >> 24;
|
|
|
|
if (p->utf & 0x00ff0000)
|
|
|
|
*utf++ = (p->utf & 0x00ff0000) >> 16;
|
|
|
|
if (p->utf & 0x0000ff00)
|
|
|
|
*utf++ = (p->utf & 0x0000ff00) >> 8;
|
|
|
|
if (p->utf & 0x000000ff)
|
|
|
|
*utf++ = p->utf & 0x000000ff;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (len > 0)
|
|
|
|
report_invalid_encoding(encoding, (const char *) iso, len);
|
|
|
|
|
|
|
|
*utf = '\0';
|
|
|
|
}
|