You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
postgres/src/backend/utils/mb/conv.c

453 lines
8.1 KiB

/*-------------------------------------------------------------------------
*
* Utility functions for conversion procs.
*
* Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/utils/mb/conv.c,v 1.45 2003/04/12 07:53:57 ishii Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "mb/pg_wchar.h"
/*
 * pg_print_bogus_char: emit a printable placeholder for a character
 * that cannot be represented in the current encoding system.
 *
 * The character starting at *mic, whose byte length is taken from
 * pg_mic_mblen(), is written to *p as '(' followed by two lowercase
 * hexadecimal digits per source byte and a closing ')'.
 *
 * mic: in/out pointer to the source; advanced past the bytes consumed.
 * p:   in/out pointer to the destination; advanced past the bytes written.
 *
 * No terminator is written and no bounds are checked: the destination
 * must have room for 2 + 2 * (character length) bytes.
 */
void
pg_print_bogus_char(unsigned char **mic, unsigned char **p)
{
	static const char hexdigits[] = "0123456789abcdef";
	int			l = pg_mic_mblen(*mic);

	*(*p)++ = '(';
	while (l--)
	{
		unsigned char b = *(*mic)++;

		/* same output as "%02x": high nibble first, lowercase digits */
		*(*p)++ = hexdigits[b >> 4];
		*(*p)++ = hexdigits[b & 0x0f];
	}
	*(*p)++ = ')';
}
#ifdef NOT_USED
/*
 * GB18030 ---> MIC
 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 *
 * Copies GB18030 text from gb18030 to p one character at a time and
 * NUL-terminates the output.  Processing stops after len source bytes
 * or at the first NUL byte, whichever comes first.
 *
 * Recognised forms:
 *	 - one byte below 0x80 (ASCII);
 *	 - two bytes: lead 0x81..0xfe, second 0x40..0x7e or 0x80..0xfe;
 *	 - four bytes: lead 0x81..0xfe, second 0x30..0x39, then two more
 *	   bytes that are copied unchanged.
 *
 * A lead byte that is not followed by a usable second byte (including
 * a sequence cut short by len or by the terminator) is dropped on its
 * own; the following byte is then examined as the start of a new
 * character, so the terminator is never stepped over.
 */
static void
gb180302mic(unsigned char *gb18030, unsigned char *p, int len)
{
	int			c1;
	int			c2;

	while (len > 0 && (c1 = *gb18030++))
	{
		if (c1 < 0x80)
		{						/* should be ASCII */
			len--;
			*p++ = c1;
		}
		else if (c1 >= 0x81 && c1 <= 0xfe)
		{
			/* peek at the second byte only if the input still has one */
			c2 = (len >= 2) ? *gb18030 : 0;

			if (c2 >= 0x30 && c2 <= 0x39 &&
				len >= 4 && gb18030[1] != 0 && gb18030[2] != 0)
			{
				/* four-byte character: exactly four bytes in, four out */
				len -= 4;
				*p++ = c1;
				*p++ = c2;
				*p++ = gb18030[1];
				*p++ = gb18030[2];
				gb18030 += 3;
			}
			else if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
			{
				/* two-byte character */
				len -= 2;
				*p++ = c1;
				*p++ = c2;
				gb18030++;
			}
			else
			{
				/* throw the strange code: only the lead byte is dropped */
				len--;
			}
		}
		else
		{
			/* 0x80 and 0xff are not valid lead bytes; skip them */
			len--;
		}
	}
	*p = '\0';
}
/*
 * MIC ---> GB18030
 * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
 *
 * Copies text from mic to p and NUL-terminates the output.  The loop
 * runs while len is positive and the current source byte is not NUL;
 * len is reduced by pg_mic_mblen() of each character's first byte.
 *
 * Bytes up to 0x7f are copied as ASCII.  For a lead byte in
 * 0x81..0xfe the second byte selects a two-byte copy (0x40..0x7e or
 * 0x80..0xfe) or a four-byte copy (0x30..0x39).  Anything else is
 * rendered through pg_print_bogus_char().
 */
static void
mic2gb18030(unsigned char *mic, unsigned char *p, int len)
{
	int			c1;
	int			c2;

	while (len > 0 && (c1 = *mic))
	{
		len -= pg_mic_mblen(mic++);

		if (c1 <= 0x7f)			/* ASCII */
			*p++ = c1;
		else if (c1 >= 0x81 && c1 <= 0xfe)
		{
			c2 = *mic++;

			if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
			{
				*p++ = c1;
				*p++ = c2;
			}
			else if (c2 >= 0x30 && c2 <= 0x39)
			{
				*p++ = c1;
				*p++ = c2;
				*p++ = *mic++;
				*p++ = *mic++;
			}
			else
			{
				/*
				 * NOTE(review): mic is stepped back by one (onto the
				 * second byte, not the lead byte) before each call, and
				 * the second call starts from wherever the first one
				 * stopped minus one.  This looks unintended -- confirm
				 * which bytes were meant to be printed before relying on
				 * it.  Behaviour is kept as it was.
				 */
				mic--;
				pg_print_bogus_char(&mic, &p);
				mic--;
				pg_print_bogus_char(&mic, &p);
			}
		}
		else
		{
			/* rewind onto the offending byte and print it as bogus */
			mic--;
			pg_print_bogus_char(&mic, &p);
		}
	}
	*p = '\0';
}
#endif
/*
 * LATINn ---> MIC
 *
 * l:	source string in the single-byte charset.
 * p:	destination for the mule internal code.
 * len: maximum number of source bytes to convert.
 * lc:	leading character written in front of every byte above 0x7f.
 *
 * Conversion stops after len bytes or at the first NUL byte in l.
 * Bytes up to 0x7f are copied unchanged.  The output is
 * NUL-terminated; because each source byte can produce two output
 * bytes, p needs room for up to 2 * len + 1 bytes.
 */
void
latin2mic(unsigned char *l, unsigned char *p, int len, int lc)
{
	int			c1;

	while (len-- > 0 && (c1 = *l++))
	{
		if (c1 > 0x7f)
			*p++ = lc;			/* Latin: leading byte goes first */
		*p++ = c1;
	}
	*p = '\0';
}
/*
 * MIC ---> LATINn
 *
 * mic: source string in the mule internal code.
 * p:	destination for the single-byte charset.
 * len: number of source bytes to convert.
 * lc:	leading character that marks a convertible character.
 *
 * The loop runs while len is positive and the current source byte is
 * not NUL; len is reduced by pg_mic_mblen() of each character.  A
 * character introduced by lc is reduced to the byte that follows it.
 * Any other byte above 0x7f is rendered through pg_print_bogus_char();
 * the remaining bytes are copied as ASCII.  The output is
 * NUL-terminated.
 */
void
mic2latin(unsigned char *mic, unsigned char *p, int len, int lc)
{
	int			c1;

	while (len > 0 && (c1 = *mic))
	{
		len -= pg_mic_mblen(mic);

		if (c1 == lc)
		{
			mic++;				/* drop the leading character */
			*p++ = *mic++;
		}
		else if (c1 > 0x7f)
		{
			/* mic still points at the offending byte */
			pg_print_bogus_char(&mic, &p);
		}
		else
		{						/* should be ASCII */
			*p++ = c1;
			mic++;
		}
	}
	*p = '\0';
}
/*
 * ASCII ---> MIC
 *
 * Copies up to len bytes from l to p, stopping early at a NUL byte,
 * and clears the high bit of every byte copied.  The output is always
 * NUL-terminated.
 */
void
pg_ascii2mic(unsigned char *l, unsigned char *p, int len)
{
	int			i;

	for (i = 0; i < len && l[i] != 0; i++)
		p[i] = l[i] & 0x7f;
	p[i] = '\0';
}
/*
 * MIC ---> ASCII
 *
 * mic: source string in the mule internal code.
 * p:	destination buffer.
 * len: number of source bytes to convert.
 *
 * Bytes up to 0x7f are copied unchanged.  A character whose first byte
 * is above 0x7f is rendered through pg_print_bogus_char(), which
 * consumes the whole character; len is therefore reduced by
 * pg_mic_mblen() of that character rather than by one, so the loop
 * does not keep reading after len source bytes have been used.
 * Conversion also stops at the first NUL byte.  The output is
 * NUL-terminated.
 */
void
pg_mic2ascii(unsigned char *mic, unsigned char *p, int len)
{
	int			c1;

	while (len > 0 && (c1 = *mic))
	{
		if (c1 > 0x7f)
		{
			/* charge len for every byte the bogus character occupies */
			len -= pg_mic_mblen(mic);
			pg_print_bogus_char(&mic, &p);
		}
		else
		{						/* should be ASCII */
			*p++ = c1;
			mic++;
			len--;
		}
	}
	*p = '\0';
}
/*
 * latin2mic_with_table: table-driven conversion of a single-byte
 * local charset into the mule internal code.
 *
 * l:	local charset string (source).
 * p:	where the mule internal code is stored (destination).
 * len: length of l.
 * lc:	leading character emitted before each converted byte.
 * tab: conversion table, indexed by (source byte - 128); each entry
 *		holds the corresponding mule internal code point, and a zero
 *		entry means the byte has no conversion.
 *
 * Bytes below 128 are copied unchanged.  A byte without a table entry
 * is replaced by a single space.  Conversion ends after len bytes or
 * at the first NUL byte, and the output is NUL-terminated.
 */
void
latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc,
					 unsigned char *tab)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		unsigned char ch = *l++;
		unsigned char mapped;

		if (ch == 0)
			break;

		if (ch < 128)
		{
			*p++ = ch;
			continue;
		}

		mapped = tab[ch - 128];
		if (mapped == 0)
		{
			*p++ = ' ';			/* cannot convert */
			continue;
		}

		*p++ = lc;
		*p++ = mapped;
	}
	*p = '\0';
}
/*
 * mic2latin_with_table: a generic single byte charset encoding
 * conversion from the mule internal code to a local charset
 * with a encoding conversion table.
 * the table is ordered according to the second byte of the mule
 * internal code starting from 128 (0x80).
 * each entry in the table
 * holds the corresponding code point for the local code.
 *
 * Bytes below 128 are copied unchanged.  A character introduced by lc
 * is replaced by its table entry, or by a space when the entry is
 * zero.  Any other byte of 128 or above is replaced by a space.
 *
 * A leading character that is not followed by a byte of 128 or above
 * within the remaining input (the input ended, the next byte is NUL,
 * or the next byte is below 128) is itself replaced by a space and the
 * next byte is left to be handled on its own.  This keeps the table
 * index non-negative and keeps reads inside the source string.
 *
 * Conversion ends after len bytes or at the first NUL byte, and the
 * output is NUL-terminated.
 */
void
mic2latin_with_table(
					 unsigned char *mic,	/* mule internal code
											 * (source) */
					 unsigned char *p,	/* local code (destination) */
					 int len,	/* length of mic */
					 int lc,	/* leading character */
					 unsigned char *tab /* code conversion table */
)
{
	unsigned char c1,
				c2;

	while (len-- > 0 && (c1 = *mic++))
	{
		if (c1 < 128)
			*p++ = c1;
		else if (c1 == lc)
		{
			/*
			 * The trail byte must exist and have its high bit set;
			 * otherwise tab[c1 - 128] would be indexed out of range.
			 */
			if (len <= 0 || *mic < 128)
			{
				*p++ = ' ';		/* incomplete character */
				continue;
			}
			c1 = *mic++;
			len--;
			c2 = tab[c1 - 128];
			if (c2)
				*p++ = c2;
			else
			{
				*p++ = ' ';		/* cannot convert */
			}
		}
		else
		{
			*p++ = ' ';			/* bogus character */
		}
	}
	*p = '\0';
}
/*
* comparison routine for bsearch()
* this routine is intended for UTF-8 -> local code
*/
static int
compare1(const void *p1, const void *p2)
{
unsigned int v1,
v2;
v1 = *(unsigned int *) p1;
v2 = ((pg_utf_to_local *) p2)->utf;
return (v1 > v2)?1:((v1 == v2)?0:-1);
}
/*
* comparison routine for bsearch()
* this routine is intended for local code -> UTF-8
*/
static int
compare2(const void *p1, const void *p2)
{
unsigned int v1,
v2;
v1 = *(unsigned int *) p1;
v2 = ((pg_local_to_utf *) p2)->code;
return (v1 > v2)?1:((v1 == v2)?0:-1);
}
/*
 * UTF-8 ---> local code
 *
 * utf:  input UTF-8 string; processing ends when "len" is used up or a
 *		 null terminator is reached.
 * iso:  where the converted, NUL-terminated output is written.
 * map:  the conversion map, searched with bsearch() on the packed
 *		 UTF-8 value.
 * size: the number of entries in the conversion map.
 * len:  the number of input bytes to convert.
 *
 * Single-byte characters are copied unchanged.  Longer characters are
 * packed big-endian into one integer (two bytes when pg_utf_mblen()
 * reports 2, three bytes otherwise) and looked up in the map.  A
 * character with no map entry is reported with a WARNING and skipped.
 * For a match, the non-zero bytes of the entry's code are written from
 * the most significant to the least significant.
 */
void
UtfToLocal(unsigned char *utf, unsigned char *iso,
		   pg_utf_to_local *map, int size, int len)
{
	while (len > 0 && *utf)
	{
		unsigned int iutf;
		pg_utf_to_local *p;
		int			shift;
		int			l = pg_utf_mblen(utf);

		/* the character is charged against len whatever happens below */
		len -= l;

		if (l == 1)
		{
			*iso++ = *utf++;
			continue;
		}

		if (l == 2)
		{
			iutf = (utf[0] << 8) | utf[1];
			utf += 2;
		}
		else
		{
			iutf = (utf[0] << 16) | (utf[1] << 8) | utf[2];
			utf += 3;
		}

		p = bsearch(&iutf, map, size,
					sizeof(pg_utf_to_local), compare1);
		if (p == NULL)
		{
			elog(WARNING, "UtfToLocal: could not convert UTF-8 (0x%04x). Ignored", iutf);
			continue;
		}

		/* emit the bytes of the local code, high byte first, skipping zeros */
		for (shift = 24; shift >= 0; shift -= 8)
		{
			unsigned char b = (unsigned char) (p->code >> shift);

			if (b != 0)
				*iso++ = b;
		}
	}
	*iso = '\0';
}
/*
* local code ---> UTF-8
*/
void
LocalToUtf(unsigned char *iso, unsigned char *utf,
23 years ago
pg_local_to_utf *map, int size, int encoding, int len)
{
unsigned int iiso;
int l;
pg_local_to_utf *p;
Commit Karel's patch. ------------------------------------------------------------------- Subject: Re: [PATCHES] encoding names From: Karel Zak <zakkr@zf.jcu.cz> To: Peter Eisentraut <peter_e@gmx.net> Cc: pgsql-patches <pgsql-patches@postgresql.org> Date: Fri, 31 Aug 2001 17:24:38 +0200 On Thu, Aug 30, 2001 at 01:30:40AM +0200, Peter Eisentraut wrote: > > - convert encoding 'name' to 'id' > > I thought we decided not to add functions returning "new" names until we > know exactly what the new names should be, and pending schema Ok, the patch not to add functions. > better > > ...(): encoding name too long Fixed. I found new bug in command/variable.c in parse_client_encoding(), nobody probably never see this error: if (pg_set_client_encoding(encoding)) { elog(ERROR, "Conversion between %s and %s is not supported", value, GetDatabaseEncodingName()); } because pg_set_client_encoding() returns -1 for error and 0 as true. It's fixed too. IMHO it can be apply. Karel PS: * following files are renamed: src/utils/mb/Unicode/KOI8_to_utf8.map --> src/utils/mb/Unicode/koi8r_to_utf8.map src/utils/mb/Unicode/WIN_to_utf8.map --> src/utils/mb/Unicode/win1251_to_utf8.map src/utils/mb/Unicode/utf8_to_KOI8.map --> src/utils/mb/Unicode/utf8_to_koi8r.map src/utils/mb/Unicode/utf8_to_WIN.map --> src/utils/mb/Unicode/utf8_to_win1251.map * new file: src/utils/mb/encname.c * removed file: src/utils/mb/common.c -- Karel Zak <zakkr@zf.jcu.cz> http://home.zf.jcu.cz/~zakkr/ C, PostgreSQL, PHP, WWW, http://docs.linux.cz, http://mape.jcu.cz
24 years ago
if (!PG_VALID_ENCODING(encoding))
elog(ERROR, "Invalid encoding number %d", encoding);
for (; len > 0 && *iso; len -= l)
{
if (*iso < 0x80)
{
*utf++ = *iso++;
l = 1;
continue;
}
l = pg_encoding_mblen(encoding, iso);
if (l == 1)
iiso = *iso++;
else if (l == 2)
{
iiso = *iso++ << 8;
iiso |= *iso++;
}
else if (l == 3)
{
iiso = *iso++ << 16;
iiso |= *iso++ << 8;
iiso |= *iso++;
}
else if (l == 4)
{
iiso = *iso++ << 24;
iiso |= *iso++ << 16;
iiso |= *iso++ << 8;
iiso |= *iso++;
}
p = bsearch(&iiso, map, size,
sizeof(pg_local_to_utf), compare2);
if (p == NULL)
{
elog(WARNING, "LocalToUtf: could not convert (0x%04x) %s to UTF-8. Ignored",
iiso, (&pg_enc2name_tbl[encoding])->name);
continue;
}
if (p->utf & 0xff000000)
*utf++ = p->utf >> 24;
if (p->utf & 0x00ff0000)
*utf++ = (p->utf & 0x00ff0000) >> 16;
if (p->utf & 0x0000ff00)
*utf++ = (p->utf & 0x0000ff00) >> 8;
if (p->utf & 0x000000ff)
*utf++ = p->utf & 0x000000ff;
}
*utf = '\0';
}