/*-------------------------------------------------------------------------
 *
 *	  Utility functions for conversion procs.
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/mb/conv.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"
#include "mb/pg_wchar.h"
28 years ago
Avoid doing encoding conversions by double-conversion via MULE_INTERNAL. Previously, we did many conversions for Cyrillic and Central European single-byte encodings by converting to a related MULE_INTERNAL coding scheme before converting to the destination. This seems unnecessarily inefficient. Moreover, if the conversion encounters an untranslatable character, the error message will confusingly complain about failure to convert to or from MULE_INTERNAL, rather than the user-visible encodings. Worse still, this approach results in some completely unnecessary conversion failures; there are cases where the chosen MULE subset lacks characters that exist in both of the user-visible encodings, causing a conversion failure that need not occur. This patch fixes the first two of those deficiencies by introducing a new local2local() conversion support subroutine for direct conversion between any two single-byte character sets, and adding new conversion tables where needed. However, I generated the new conversion tables by testing PG 9.5's behavior, so that the actual conversion behavior is bug-compatible with previous releases; the only user-visible behavior change is that the error messages for conversion failures are saner. Changes in the conversion behavior will probably ensue after discussion. Interestingly, although this approach requires more tables, the .so files actually end up smaller (at least on my x86_64 machine); the tables are smaller than the management code needed for double conversion. Per a complaint from Albe Laurenz.
10 years ago
/*
* local2local: a generic single byte charset encoding
* conversion between two ASCII-superset encodings.
*
* l points to the source string of length len
* p is the output area (must be large enough!)
* src_encoding is the PG identifier for the source encoding
* dest_encoding is the PG identifier for the target encoding
* tab holds conversion entries for the source charset
* starting from 128 (0x80). each entry in the table holds the corresponding
* code point for the target charset, or 0 if there is no equivalent code.
*/
void
local2local(const unsigned char *l,
unsigned char *p,
int len,
int src_encoding,
int dest_encoding,
const unsigned char *tab)
{
unsigned char c1,
c2;
while (len > 0)
{
c1 = *l;
if (c1 == 0)
report_invalid_encoding(src_encoding, (const char *) l, len);
if (!IS_HIGHBIT_SET(c1))
*p++ = c1;
else
{
c2 = tab[c1 - HIGHBIT];
if (c2)
*p++ = c2;
else
report_untranslatable_char(src_encoding, dest_encoding,
(const char *) l, len);
}
l++;
len--;
}
*p = '\0';
}
/*
 * LATINn ---> MIC when the charset's local codes map directly to MIC
 *
 * l points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 *
 * Each high-bit input byte is emitted as the two-byte sequence (lc, byte);
 * all other bytes are copied unchanged.  A zero byte in the input is
 * passed to report_invalid_encoding().  A terminating '\0' is stored after
 * the last output byte.
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
		  int lc, int encoding)
{
	int			c1;

	while (len > 0)
	{
		c1 = *l;
		if (c1 == 0)
			report_invalid_encoding(encoding, (const char *) l, len);
		/* high-bit bytes get the charset id as a leading byte */
		if (IS_HIGHBIT_SET(c1))
			*p++ = lc;
		*p++ = c1;
		l++;
		len--;
	}
	*p = '\0';
}
/*
* MIC ---> LATINn when the charset's local codes map directly to MIC
*
* mic points to the source string of length len
* p is the output area (must be large enough!)
* lc is the mule character set id for the local encoding
* encoding is the PG identifier for the local encoding
28 years ago
*/
void
mic2latin(const unsigned char *mic, unsigned char *p, int len,
int lc, int encoding)
28 years ago
{
int c1;
while (len > 0)
{
c1 = *mic;
if (c1 == 0)
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
if (!IS_HIGHBIT_SET(c1))
{
/* easy for ASCII */
*p++ = c1;
mic++;
len--;
}
else
{
int l = pg_mule_mblen(mic);
if (len < l)
report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
len);
if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
report_untranslatable_char(PG_MULE_INTERNAL, encoding,
(const char *) mic, len);
*p++ = mic[1];
mic += 2;
len -= 2;
}
}
*p = '\0';
28 years ago
}
/*
 * latin2mic_with_table: table-driven conversion of a single-byte local
 * charset into the mule internal code.
 *
 * l points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 * tab holds conversion entries for the local charset
 * starting from 128 (0x80). each entry in the table holds the corresponding
 * code point for the mule encoding, or 0 if there is no equivalent code.
 *
 * Bytes without the high bit set are copied unchanged; a translatable
 * high-bit byte is emitted as (lc, table entry).  A terminating '\0' is
 * stored after the last output byte.
 */
void
latin2mic_with_table(const unsigned char *l,
					 unsigned char *p,
					 int len,
					 int lc,
					 int encoding,
					 const unsigned char *tab)
{
	for (; len > 0; l++, len--)
	{
		unsigned char src = *l;
		unsigned char mapped;

		if (src == 0)
			report_invalid_encoding(encoding, (const char *) l, len);

		if (!IS_HIGHBIT_SET(src))
		{
			/* no translation needed for the low half */
			*p++ = src;
			continue;
		}

		mapped = tab[src - HIGHBIT];
		if (mapped == 0)
		{
			report_untranslatable_char(encoding, PG_MULE_INTERNAL,
									   (const char *) l, len);
			continue;
		}

		*p++ = lc;
		*p++ = mapped;
	}
	*p = '\0';
}
/*
 * mic2latin_with_table: table-driven conversion of the mule internal code
 * into a single-byte local charset.
 *
 * mic points to the source string of length len
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 * tab holds conversion entries for the mule internal code's second byte,
 * starting from 128 (0x80). each entry in the table holds the corresponding
 * code point for the local charset, or 0 if there is no equivalent code.
 *
 * Bytes without the high bit set are copied unchanged.  Any other
 * character must be two bytes long, start with lc, and have a high-bit
 * second byte with a nonzero table entry; otherwise it is passed to
 * report_untranslatable_char().  A terminating '\0' is stored after the
 * last output byte.
 */
void
mic2latin_with_table(const unsigned char *mic,
					 unsigned char *p,
					 int len,
					 int lc,
					 int encoding,
					 const unsigned char *tab)
{
	while (len > 0)
	{
		unsigned char lead = *mic;
		unsigned char mapped = 0;
		int			mblen;

		if (lead == 0)
			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);

		if (!IS_HIGHBIT_SET(lead))
		{
			/* easy for ASCII */
			*p++ = lead;
			mic++;
			len--;
			continue;
		}

		mblen = pg_mule_mblen(mic);
		if (len < mblen)
			report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
									len);

		/* only a well-formed (lc, high-bit byte) pair gets a table lookup */
		if (mblen == 2 && lead == lc && IS_HIGHBIT_SET(mic[1]))
			mapped = tab[mic[1] - HIGHBIT];

		if (mapped == 0)
		{
			report_untranslatable_char(PG_MULE_INTERNAL, encoding,
									   (const char *) mic, len);
			break;				/* keep compiler quiet */
		}

		*p++ = mapped;
		mic += 2;
		len -= 2;
	}
	*p = '\0';
}
/*
* comparison routine for bsearch()
* this routine is intended for combined UTF8 -> local code
*/
static int
compare3(const void *p1, const void *p2)
{
uint32 s1,
s2,
d1,
d2;
s1 = *(const uint32 *) p1;
s2 = *((const uint32 *) p1 + 1);
d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
}
/*
 * bsearch() comparator for the local code -> combined UTF8 map.
 *
 * p1 points to a uint32 key; p2 points to a pg_local_to_utf_combined
 * entry, which is ordered by its "code" field.
 */
static int
compare4(const void *p1, const void *p2)
{
	uint32		key = *(const uint32 *) p1;
	uint32		entry_code = ((const pg_local_to_utf_combined *) p2)->code;

	if (key > entry_code)
		return 1;
	if (key == entry_code)
		return 0;
	return -1;
}
/*
 * Store a 32-bit character representation into a multibyte stream.
 *
 * The four bytes of "code" are visited from most to least significant;
 * every nonzero byte is appended to dest and zero bytes are skipped.
 * Returns the position just past the last byte written.
 */
static inline unsigned char *
store_coded_char(unsigned char *dest, uint32 code)
{
	int			shift;

	for (shift = 24; shift >= 0; shift -= 8)
	{
		unsigned char byte = (unsigned char) (code >> shift);

		if (byte != 0)
			*dest++ = byte;
	}
	return dest;
}
/*
 * Convert a character using a conversion radix tree.
 *
 * 'l' is the length of the input character in bytes, and b1-b4 are
 * the input character's bytes, right-aligned: a 1-byte character is in
 * b4, a 2-byte character in b3/b4, and so on.
 *
 * Each byte is first checked against the tree's per-level bounds for that
 * character length; 0 is returned if any byte is out of range.  Otherwise
 * the tree is walked one level per byte, using chars32 if it is set and
 * chars16 if not, and the value found at the last level is returned.
 * 0 is also returned for any length other than 1..4.
 */
static inline uint32
pg_mb_radix_conv(const pg_mb_radix_tree *rt,
				 int l,
				 unsigned char b1,
				 unsigned char b2,
				 unsigned char b3,
				 unsigned char b4)
{
	switch (l)
	{
		case 4:
			/* range-check all four bytes */
			if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
				b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
				b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
				b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
				return 0;

			if (rt->chars32)
			{
				uint32		node = rt->b4root;

				node = rt->chars32[b1 + node - rt->b4_1_lower];
				node = rt->chars32[b2 + node - rt->b4_2_lower];
				node = rt->chars32[b3 + node - rt->b4_3_lower];
				return rt->chars32[b4 + node - rt->b4_4_lower];
			}
			else
			{
				uint16		node = rt->b4root;

				node = rt->chars16[b1 + node - rt->b4_1_lower];
				node = rt->chars16[b2 + node - rt->b4_2_lower];
				node = rt->chars16[b3 + node - rt->b4_3_lower];
				return rt->chars16[b4 + node - rt->b4_4_lower];
			}

		case 3:
			/* range-check the three significant bytes */
			if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
				b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
				b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
				return 0;

			if (rt->chars32)
			{
				uint32		node = rt->b3root;

				node = rt->chars32[b2 + node - rt->b3_1_lower];
				node = rt->chars32[b3 + node - rt->b3_2_lower];
				return rt->chars32[b4 + node - rt->b3_3_lower];
			}
			else
			{
				uint16		node = rt->b3root;

				node = rt->chars16[b2 + node - rt->b3_1_lower];
				node = rt->chars16[b3 + node - rt->b3_2_lower];
				return rt->chars16[b4 + node - rt->b3_3_lower];
			}

		case 2:
			/* range-check both bytes */
			if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
				b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
				return 0;

			if (rt->chars32)
			{
				uint32		node = rt->b2root;

				node = rt->chars32[b3 + node - rt->b2_1_lower];
				return rt->chars32[b4 + node - rt->b2_2_lower];
			}
			else
			{
				uint16		node = rt->b2root;

				node = rt->chars16[b3 + node - rt->b2_1_lower];
				return rt->chars16[b4 + node - rt->b2_2_lower];
			}

		case 1:
			/* range-check the single byte */
			if (b4 < rt->b1_lower || b4 > rt->b1_upper)
				return 0;

			if (rt->chars32)
				return rt->chars32[b4 + rt->b1root - rt->b1_lower];
			else
				return rt->chars16[b4 + rt->b1root - rt->b1_lower];

		default:
			break;
	}

	return 0;					/* shouldn't happen */
}
/*
 * UTF8 ---> local code
 *
 * utf: input string in UTF8 encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * iso: pointer to the output area (must be large enough!)
 *		  (output string will be null-terminated)
 * map: conversion map for single characters
 * cmap: conversion map for combined characters
 *		  (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *		  (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *		  (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the cmap (if provided) is consulted first; if no match,
 * the map is consulted next; if still no match, the conv_func (if provided)
 * is applied.  An error is raised if no match is found.
 *
 * Single-byte (ASCII) characters are copied through without consulting
 * any map.  An embedded zero byte, a character truncated by the end of the
 * input, or a sequence rejected by pg_utf8_islegal() ends the loop early
 * and is reported through report_invalid_encoding().
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
UtfToLocal(const unsigned char *utf, int len,
		   unsigned char *iso,
		   const pg_mb_radix_tree *map,
		   const pg_utf_to_local_combined *cmap, int cmapsize,
		   utf_local_conversion_func conv_func,
		   int encoding)
{
	uint32		iutf;
	int			l;
	const pg_utf_to_local_combined *cp;

	if (!PG_VALID_ENCODING(encoding))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid encoding number: %d", encoding)));

	for (; len > 0; len -= l)
	{
		unsigned char b1 = 0;
		unsigned char b2 = 0;
		unsigned char b3 = 0;
		unsigned char b4 = 0;

		/* "break" cases all represent errors */
		if (*utf == '\0')
			break;

		l = pg_utf_mblen(utf);
		if (len < l)
			break;

		if (!pg_utf8_islegal(utf, l))
			break;

		if (l == 1)
		{
			/* ASCII case is easy, assume it's one-to-one conversion */
			*iso++ = *utf++;
			continue;
		}

		/* collect coded char of length l */
		if (l == 2)
		{
			b3 = *utf++;
			b4 = *utf++;
		}
		else if (l == 3)
		{
			b2 = *utf++;
			b3 = *utf++;
			b4 = *utf++;
		}
		else if (l == 4)
		{
			b1 = *utf++;
			b2 = *utf++;
			b3 = *utf++;
			b4 = *utf++;
		}
		else
		{
			elog(ERROR, "unsupported character length %d", l);
			iutf = 0;			/* keep compiler quiet */
		}

		/*
		 * Shift in uint32, not int: the bytes would otherwise be promoted
		 * to signed int, and shifting a high-bit byte left by 24 is
		 * undefined behavior there.
		 */
		iutf = ((uint32) b1 << 24 | (uint32) b2 << 16 | (uint32) b3 << 8 | b4);

		/* First, try with combined map if possible */
		if (cmap && len > l)
		{
			const unsigned char *utf_save = utf;
			int			len_save = len;
			int			l_save = l;

			/* collect next character, same as above */
			len -= l;

			l = pg_utf_mblen(utf);
			if (len < l)
				break;

			if (!pg_utf8_islegal(utf, l))
				break;

			/* We assume ASCII character cannot be in combined map */
			if (l > 1)
			{
				uint32		iutf2;
				uint32		cutf[2];

				/* same uint32 shifting as for iutf above */
				if (l == 2)
				{
					iutf2 = (uint32) *utf++ << 8;
					iutf2 |= *utf++;
				}
				else if (l == 3)
				{
					iutf2 = (uint32) *utf++ << 16;
					iutf2 |= (uint32) *utf++ << 8;
					iutf2 |= *utf++;
				}
				else if (l == 4)
				{
					iutf2 = (uint32) *utf++ << 24;
					iutf2 |= (uint32) *utf++ << 16;
					iutf2 |= (uint32) *utf++ << 8;
					iutf2 |= *utf++;
				}
				else
				{
					elog(ERROR, "unsupported character length %d", l);
					iutf2 = 0;	/* keep compiler quiet */
				}

				cutf[0] = iutf;
				cutf[1] = iutf2;

				cp = bsearch(cutf, cmap, cmapsize,
							 sizeof(pg_utf_to_local_combined), compare3);

				if (cp)
				{
					/*
					 * Both characters are consumed: len already excludes
					 * the first, and the loop's "len -= l" removes the
					 * second.
					 */
					iso = store_coded_char(iso, cp->code);
					continue;
				}
			}

			/* fail, so back up to reprocess second character next time */
			utf = utf_save;
			len = len_save;
			l = l_save;
		}

		/* Now check ordinary map */
		if (map)
		{
			uint32		converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

			if (converted)
			{
				iso = store_coded_char(iso, converted);
				continue;
			}
		}

		/* if there's a conversion function, try that */
		if (conv_func)
		{
			uint32		converted = (*conv_func) (iutf);

			if (converted)
			{
				iso = store_coded_char(iso, converted);
				continue;
			}
		}

		/* failed to translate this character */
		report_untranslatable_char(PG_UTF8, encoding,
								   (const char *) (utf - l), len);
	}

	/* if we broke out of loop early, must be invalid input */
	if (len > 0)
		report_invalid_encoding(PG_UTF8, (const char *) utf, len);

	*iso = '\0';
}
/*
 * local code ---> UTF8
 *
 * iso: input string in local encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * utf: pointer to the output area (must be large enough!)
 *		  (output string will be null-terminated)
 * map: conversion map for single characters
 * cmap: conversion map for combined characters
 *		  (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *		  (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *		  (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the map is consulted first; if no match, the cmap
 * (if provided) is consulted next; if still no match, the conv_func
 * (if provided) is applied.  An error is raised if no match is found.
 * Note that the cmap lookup is made only when a map was supplied.
 *
 * Bytes without the high bit set are copied through without consulting
 * any map.  An embedded zero byte, or a character rejected by
 * pg_encoding_verifymb(), ends the loop early and is reported through
 * report_invalid_encoding().
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
LocalToUtf(const unsigned char *iso, int len,
		   unsigned char *utf,
		   const pg_mb_radix_tree *map,
		   const pg_local_to_utf_combined *cmap, int cmapsize,
		   utf_local_conversion_func conv_func,
		   int encoding)
{
	uint32		iiso;
	int			l;
	const pg_local_to_utf_combined *cp;

	if (!PG_VALID_ENCODING(encoding))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("invalid encoding number: %d", encoding)));

	for (; len > 0; len -= l)
	{
		unsigned char b1 = 0;
		unsigned char b2 = 0;
		unsigned char b3 = 0;
		unsigned char b4 = 0;

		/* "break" cases all represent errors */
		if (*iso == '\0')
			break;

		if (!IS_HIGHBIT_SET(*iso))
		{
			/* ASCII case is easy, assume it's one-to-one conversion */
			*utf++ = *iso++;
			l = 1;
			continue;
		}

		l = pg_encoding_verifymb(encoding, (const char *) iso, len);
		if (l < 0)
			break;

		/* collect coded char of length l */
		if (l == 1)
			b4 = *iso++;
		else if (l == 2)
		{
			b3 = *iso++;
			b4 = *iso++;
		}
		else if (l == 3)
		{
			b2 = *iso++;
			b3 = *iso++;
			b4 = *iso++;
		}
		else if (l == 4)
		{
			b1 = *iso++;
			b2 = *iso++;
			b3 = *iso++;
			b4 = *iso++;
		}
		else
		{
			elog(ERROR, "unsupported character length %d", l);
			iiso = 0;			/* keep compiler quiet */
		}

		/*
		 * Shift in uint32, not int: the bytes would otherwise be promoted
		 * to signed int, and shifting a high-bit byte left by 24 is
		 * undefined behavior there.
		 */
		iiso = ((uint32) b1 << 24 | (uint32) b2 << 16 | (uint32) b3 << 8 | b4);

		if (map)
		{
			uint32		converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

			if (converted)
			{
				utf = store_coded_char(utf, converted);
				continue;
			}

			/* If there's a combined character map, try that */
			if (cmap)
			{
				cp = bsearch(&iiso, cmap, cmapsize,
							 sizeof(pg_local_to_utf_combined), compare4);

				if (cp)
				{
					/* one local character expands to two UTF8 characters */
					utf = store_coded_char(utf, cp->utf1);
					utf = store_coded_char(utf, cp->utf2);
					continue;
				}
			}
		}

		/* if there's a conversion function, try that */
		if (conv_func)
		{
			uint32		converted = (*conv_func) (iiso);

			if (converted)
			{
				utf = store_coded_char(utf, converted);
				continue;
			}
		}

		/* failed to translate this character */
		report_untranslatable_char(encoding, PG_UTF8,
								   (const char *) (iso - l), len);
	}

	/* if we broke out of loop early, must be invalid input */
	if (len > 0)
		report_invalid_encoding(encoding, (const char *) iso, len);

	*utf = '\0';
}