ICU: use UTF8-optimized case conversion API

Initializes a UCaseMap object once for use across calls, and uses
UTF8-optimized APIs.

Author: Andreas Karlsson <andreas@proxel.se>
Reviewed-by: zengman <zengman@halodbtech.com>
Discussion: https://postgr.es/m/5a010b27-8ed9-4739-86fe-1562b07ba564@proxel.se
master
Jeff Davis 1 week ago
parent 0547aeae0f
commit c4ff35f104
  1. 291
      src/backend/utils/adt/pg_locale_icu.c
  2. 2
      src/common/unicode/case_test.c
  3. 2
      src/include/utils/pg_locale.h
  4. 1
      src/tools/pgindent/typedefs.list

@ -52,6 +52,7 @@ extern pg_locale_t create_pg_locale_icu(Oid collid, MemoryContext context);
#ifdef USE_ICU
extern UCollator *pg_ucol_open(const char *loc_str);
static UCaseMap *pg_ucasemap_open(const char *loc_str);
static size_t strlower_icu(char *dest, size_t destsize, const char *src,
ssize_t srclen, pg_locale_t locale);
@ -61,6 +62,14 @@ static size_t strupper_icu(char *dest, size_t destsize, const char *src,
ssize_t srclen, pg_locale_t locale);
static size_t strfold_icu(char *dest, size_t destsize, const char *src,
ssize_t srclen, pg_locale_t locale);
static size_t strlower_icu_utf8(char *dest, size_t destsize, const char *src,
ssize_t srclen, pg_locale_t locale);
static size_t strtitle_icu_utf8(char *dest, size_t destsize, const char *src,
ssize_t srclen, pg_locale_t locale);
static size_t strupper_icu_utf8(char *dest, size_t destsize, const char *src,
ssize_t srclen, pg_locale_t locale);
static size_t strfold_icu_utf8(char *dest, size_t destsize, const char *src,
ssize_t srclen, pg_locale_t locale);
static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src,
ssize_t srclen, pg_locale_t locale);
static int strncoll_icu(const char *arg1, ssize_t len1,
@ -111,9 +120,9 @@ static size_t icu_from_uchar(char *dest, size_t destsize,
const UChar *buff_uchar, int32_t len_uchar);
static void icu_set_collation_attributes(UCollator *collator, const char *loc,
UErrorCode *status);
static int32_t icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
UChar **buff_dest, UChar *buff_source,
int32_t len_source);
static int32_t icu_convert_case(ICU_Convert_Func func, char *dest,
size_t destsize, const char *src,
ssize_t srclen, pg_locale_t locale);
static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
const char *locale,
@ -122,6 +131,7 @@ static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
const char *locale,
UErrorCode *pErrorCode);
static int32_t foldcase_options(const char *locale);
/*
* XXX: many of the functions below rely on casts directly from pg_wchar to
@ -245,6 +255,28 @@ static const struct ctype_methods ctype_methods_icu = {
.wc_tolower = tolower_icu,
};
/*
* ctype method table used for ICU locales when the database encoding is
* UTF-8; create_pg_locale_icu() installs it in that case and falls back to
* ctype_methods_icu otherwise.
*
* The four string case-mapping callbacks point at the *_icu_utf8 variants,
* which call the ucasemap_utf8*() functions on the locale's UCaseMap.
*/
static const struct ctype_methods ctype_methods_icu_utf8 = {
/* whole-string case mapping, UTF-8 specific implementations */
.strlower = strlower_icu_utf8,
.strtitle = strtitle_icu_utf8,
.strupper = strupper_icu_utf8,
.strfold = strfold_icu_utf8,
/* uses plain ASCII semantics for historical reasons */
.downcase_ident = NULL,
/* per-character classification callbacks */
.wc_isdigit = wc_isdigit_icu,
.wc_isalpha = wc_isalpha_icu,
.wc_isalnum = wc_isalnum_icu,
.wc_isupper = wc_isupper_icu,
.wc_islower = wc_islower_icu,
.wc_isgraph = wc_isgraph_icu,
.wc_isprint = wc_isprint_icu,
.wc_ispunct = wc_ispunct_icu,
.wc_isspace = wc_isspace_icu,
.wc_isxdigit = wc_isxdigit_icu,
.wc_iscased = wc_iscased_icu,
/* per-character case mapping callbacks */
.wc_toupper = toupper_icu,
.wc_tolower = tolower_icu,
};
/*
* ICU still depends on libc for compatibility with certain historical
* behavior for single-byte encodings. See downcase_ident_icu().
@ -347,10 +379,16 @@ create_pg_locale_icu(Oid collid, MemoryContext context)
result->collate_is_c = false;
result->ctype_is_c = false;
if (GetDatabaseEncoding() == PG_UTF8)
{
result->icu.ucasemap = pg_ucasemap_open(iculocstr);
result->collate = &collate_methods_icu_utf8;
result->ctype = &ctype_methods_icu_utf8;
}
else
{
result->collate = &collate_methods_icu;
result->ctype = &ctype_methods_icu;
result->ctype = &ctype_methods_icu;
}
return result;
#else
@ -366,19 +404,15 @@ create_pg_locale_icu(Oid collid, MemoryContext context)
#ifdef USE_ICU
/*
* Wrapper around ucol_open() to handle API differences for older ICU
* versions.
* Check locale string and fix it if necessary. Returns a new palloc'd string.
*
* Ensure that no path leaks a UCollator.
* In ICU versions 54 and earlier, "und" is not a recognized spelling of the
* root locale. If the first component of the locale is "und", replace with
* "root" before opening.
*/
UCollator *
pg_ucol_open(const char *loc_str)
static char *
fix_icu_locale_str(const char *loc_str)
{
UCollator *collator;
UErrorCode status;
const char *orig_str = loc_str;
char *fixed_str = NULL;
/*
* Must never open default collator, because it depends on the environment
* and may change at any time. Should not happen, but check here to catch
@ -391,16 +425,11 @@ pg_ucol_open(const char *loc_str)
if (loc_str == NULL)
elog(ERROR, "opening default collator is not supported");
/*
* In ICU versions 54 and earlier, "und" is not a recognized spelling of
* the root locale. If the first component of the locale is "und", replace
* with "root" before opening.
*/
if (U_ICU_VERSION_MAJOR_NUM < 55)
{
char lang[ULOC_LANG_CAPACITY];
UErrorCode status = U_ZERO_ERROR;
status = U_ZERO_ERROR;
uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
{
@ -413,28 +442,47 @@ pg_ucol_open(const char *loc_str)
if (strcmp(lang, "und") == 0)
{
const char *remainder = loc_str + strlen("und");
char *fixed_str;
fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
strcpy(fixed_str, "root");
strcat(fixed_str, remainder);
loc_str = fixed_str;
return fixed_str;
}
}
return pstrdup(loc_str);
}
/*
* Wrapper around ucol_open() to handle API differences for older ICU
* versions.
*
* Ensure that no path leaks a UCollator.
*/
UCollator *
pg_ucol_open(const char *loc_str)
{
UCollator *collator;
UErrorCode status;
char *fixed_str;
fixed_str = fix_icu_locale_str(loc_str);
status = U_ZERO_ERROR;
collator = ucol_open(loc_str, &status);
collator = ucol_open(fixed_str, &status);
if (U_FAILURE(status))
ereport(ERROR,
/* use original string for error report */
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("could not open collator for locale \"%s\": %s",
orig_str, u_errorName(status))));
loc_str, u_errorName(status))));
if (U_ICU_VERSION_MAJOR_NUM < 54)
{
status = U_ZERO_ERROR;
icu_set_collation_attributes(collator, loc_str, &status);
icu_set_collation_attributes(collator, fixed_str, &status);
/*
* Pretend the error came from ucol_open(), for consistent error
@ -446,16 +494,43 @@ pg_ucol_open(const char *loc_str)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("could not open collator for locale \"%s\": %s",
orig_str, u_errorName(status))));
loc_str, u_errorName(status))));
}
}
if (fixed_str != NULL)
pfree(fixed_str);
pfree(fixed_str);
return collator;
}
/*
* Wrapper around ucasemap_open() to handle API differences for older ICU
* versions.
*
* Additionally makes sure we get the right options for case folding.
*
* loc_str is the ICU locale string as given by the caller. It is first
* passed through fix_icu_locale_str(), and the resulting palloc'd copy is
* what is handed to ucasemap_open() and foldcase_options().
*
* Returns the opened UCaseMap. If ucasemap_open() reports a failure, an
* ERROR is raised that names the caller's original loc_str.
*
* NOTE(review): no matching ucasemap_close() is visible in this view;
* presumably the map is meant to live as long as the locale object that
* stores it -- confirm.
*/
static UCaseMap *
pg_ucasemap_open(const char *loc_str)
{
UErrorCode status = U_ZERO_ERROR;
UCaseMap *casemap;
char *fixed_str;
/* palloc'd, possibly rewritten, copy of the locale string */
fixed_str = fix_icu_locale_str(loc_str);
/* fold-case options are fixed here, at open time, from the locale */
casemap = ucasemap_open(fixed_str, foldcase_options(fixed_str), &status);
if (U_FAILURE(status))
/* use original string for error report */
ereport(ERROR,
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("could not open casemap for locale \"%s\": %s",
loc_str, u_errorName(status)));
/* the temporary copy is only released on the success path */
pfree(fixed_str);
return casemap;
}
/*
* Create a UCollator with the given locale string and rules.
*
@ -528,80 +603,84 @@ static size_t
strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
int32_t len_uchar;
int32_t len_conv;
UChar *buff_uchar;
UChar *buff_conv;
size_t result_len;
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
len_conv = icu_convert_case(u_strToLower, locale,
&buff_conv, buff_uchar, len_uchar);
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
pfree(buff_uchar);
pfree(buff_conv);
return result_len;
return icu_convert_case(u_strToLower, dest, destsize, src, srclen, locale);
}
static size_t
strtitle_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
int32_t len_uchar;
int32_t len_conv;
UChar *buff_uchar;
UChar *buff_conv;
size_t result_len;
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
len_conv = icu_convert_case(u_strToTitle_default_BI, locale,
&buff_conv, buff_uchar, len_uchar);
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
pfree(buff_uchar);
pfree(buff_conv);
return result_len;
return icu_convert_case(u_strToTitle_default_BI, dest, destsize, src, srclen, locale);
}
static size_t
strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
int32_t len_uchar;
int32_t len_conv;
UChar *buff_uchar;
UChar *buff_conv;
size_t result_len;
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
len_conv = icu_convert_case(u_strToUpper, locale,
&buff_conv, buff_uchar, len_uchar);
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
pfree(buff_uchar);
pfree(buff_conv);
return result_len;
return icu_convert_case(u_strToUpper, dest, destsize, src, srclen, locale);
}
static size_t
strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
int32_t len_uchar;
int32_t len_conv;
UChar *buff_uchar;
UChar *buff_conv;
size_t result_len;
return icu_convert_case(u_strFoldCase_default, dest, destsize, src, srclen, locale);
}
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
len_conv = icu_convert_case(u_strFoldCase_default, locale,
&buff_conv, buff_uchar, len_uchar);
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
pfree(buff_uchar);
pfree(buff_conv);
static size_t
strlower_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
UErrorCode status = U_ZERO_ERROR;
int32_t needed;
return result_len;
needed = ucasemap_utf8ToLower(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
ereport(ERROR,
errmsg("case conversion failed: %s", u_errorName(status)));
return needed;
}
/*
* Titlecase src into dest using the locale's UCaseMap and
* ucasemap_utf8ToTitle(), operating directly on the byte strings.
*
* dest/destsize describe the output buffer; src/srclen the input. Returns
* the value reported by ICU in "needed".
*
* NOTE(review): per the ICU API, that value is the length of the full
* result even when dest is too small -- confirm that callers compare it
* against destsize and retry.
*/
static size_t
strtitle_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
UErrorCode status = U_ZERO_ERROR;
int32_t needed;
needed = ucasemap_utf8ToTitle(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
/* a too-small dest is deliberately not an error; any other failure is */
if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
ereport(ERROR,
errmsg("case conversion failed: %s", u_errorName(status)));
return needed;
}
/*
* Uppercase src into dest using the locale's UCaseMap and
* ucasemap_utf8ToUpper(), operating directly on the byte strings.
*
* dest/destsize describe the output buffer; src/srclen the input. Returns
* the value reported by ICU in "needed".
*
* NOTE(review): per the ICU API, that value is the length of the full
* result even when dest is too small -- confirm that callers compare it
* against destsize and retry.
*/
static size_t
strupper_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
UErrorCode status = U_ZERO_ERROR;
int32_t needed;
needed = ucasemap_utf8ToUpper(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
/* a too-small dest is deliberately not an error; any other failure is */
if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
ereport(ERROR,
errmsg("case conversion failed: %s", u_errorName(status)));
return needed;
}
/*
* Case-fold src into dest using the locale's UCaseMap and
* ucasemap_utf8FoldCase(), operating directly on the byte strings.
*
* No fold options are passed here; they were supplied to ucasemap_open()
* (via foldcase_options()) when the UCaseMap was created.
*
* dest/destsize describe the output buffer; src/srclen the input. Returns
* the value reported by ICU in "needed".
*
* NOTE(review): per the ICU API, that value is the length of the full
* result even when dest is too small -- confirm that callers compare it
* against destsize and retry.
*/
static size_t
strfold_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
UErrorCode status = U_ZERO_ERROR;
int32_t needed;
needed = ucasemap_utf8FoldCase(locale->icu.ucasemap, dest, destsize, src, srclen, &status);
/* a too-small dest is deliberately not an error; any other failure is */
if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
ereport(ERROR,
errmsg("case conversion failed: %s", u_errorName(status)));
return needed;
}
/*
@ -829,8 +908,8 @@ icu_from_uchar(char *dest, size_t destsize, const UChar *buff_uchar, int32_t len
}
static int32_t
icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
UChar **buff_dest, UChar *buff_source, int32_t len_source)
convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale,
UChar **buff_dest, UChar *buff_source, int32_t len_source)
{
UErrorCode status;
int32_t len_dest;
@ -855,6 +934,26 @@ icu_convert_case(ICU_Convert_Func func, pg_locale_t mylocale,
return len_dest;
}
/*
* Common driver for the UChar-based case conversion functions
* (strlower_icu, strtitle_icu, strupper_icu, strfold_icu).
*
* Converts src to a UChar buffer with icu_to_uchar(), applies the given
* conversion function through convert_case_uchar(), then writes the result
* into dest/destsize with icu_from_uchar(). Both temporary UChar buffers
* are freed before returning.
*
* Returns the length reported by icu_from_uchar().
*
* NOTE(review): result_len is a size_t but the declared return type is
* int32_t, while the callers return size_t; the value is implicitly
* narrowed on the way out -- confirm this is intended.
*/
static int32_t
icu_convert_case(ICU_Convert_Func func, char *dest, size_t destsize,
const char *src, ssize_t srclen, pg_locale_t locale)
{
int32_t len_uchar;
int32_t len_conv;
UChar *buff_uchar;
UChar *buff_conv;
size_t result_len;
/* source bytes -> UChar */
len_uchar = icu_to_uchar(&buff_uchar, src, srclen);
/* case-map in UChar space; buff_conv receives the converted buffer */
len_conv = convert_case_uchar(func, locale, &buff_conv,
buff_uchar, len_uchar);
/* UChar -> destination bytes */
result_len = icu_from_uchar(dest, destsize, buff_conv, len_conv);
pfree(buff_uchar);
pfree(buff_conv);
return result_len;
}
static int32_t
u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
@ -870,18 +969,25 @@ u_strFoldCase_default(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
const char *locale,
UErrorCode *pErrorCode)
{
return u_strFoldCase(dest, destCapacity, src, srcLength,
foldcase_options(locale), pErrorCode);
}
/*
* Return the correct u_strFoldCase() options for the given locale.
*
* Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
* folding does not accept a locale. Instead it just supports a single option
* relevant to Turkic languages 'az' and 'tr'; check for those languages.
*/
static int32_t
foldcase_options(const char *locale)
{
uint32 options = U_FOLD_CASE_DEFAULT;
char lang[3];
UErrorCode status;
UErrorCode status = U_ZERO_ERROR;
/*
* Unlike the ICU APIs for lowercasing, titlecasing, and uppercasing, case
* folding does not accept a locale. Instead it just supports a single
* option relevant to Turkic languages 'az' and 'tr'; check for those
* languages to enable the option.
*/
status = U_ZERO_ERROR;
uloc_getLanguage(locale, lang, 3, &status);
if (U_SUCCESS(status))
{
@ -893,8 +999,7 @@ u_strFoldCase_default(UChar *dest, int32_t destCapacity,
options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
}
return u_strFoldCase(dest, destCapacity, src, srcLength,
options, pErrorCode);
return options;
}
/*

@ -30,7 +30,7 @@
#define BUFSZ 256
#ifdef USE_ICU
static UCaseMap * casemap = NULL;
static UCaseMap *casemap = NULL;
#endif
typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src,

@ -21,6 +21,7 @@
#undef U_SHOW_CPLUSPLUS_HEADER_API
#define U_SHOW_CPLUSPLUS_HEADER_API 0
#include <unicode/ucol.h>
#include <unicode/ucasemap.h>
#endif
/* use for libc locale names */
@ -168,6 +169,7 @@ struct pg_locale_struct
const char *locale;
UCollator *ucol;
locale_t lt;
UCaseMap *ucasemap;
} icu;
#endif
};

@ -3190,6 +3190,7 @@ TypeName
TzAbbrevCache
U32
U8
UCaseMap
UChar
UCharIterator
UColAttributeValue

Loading…
Cancel
Save