You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
postgres/src/include/utils/pg_locale.h

222 lines
7.4 KiB

/*-----------------------------------------------------------------------
*
* PostgreSQL locale utilities
*
* src/include/utils/pg_locale.h
*
* Copyright (c) 2002-2025, PostgreSQL Global Development Group
*
*-----------------------------------------------------------------------
*/
#ifndef _PG_LOCALE_
#define _PG_LOCALE_
#include "mb/pg_wchar.h"
#ifdef USE_ICU
/* only include the C APIs, to avoid errors in cpluspluscheck */
#undef U_SHOW_CPLUSPLUS_API
#define U_SHOW_CPLUSPLUS_API 0
#include <unicode/ucol.h>
#endif
/* Buffer length to use for libc locale names */
#define LOCALE_NAME_BUFLEN 128

/* GUC settings */
extern PGDLLIMPORT char *locale_messages;
extern PGDLLIMPORT char *locale_monetary;
extern PGDLLIMPORT char *locale_numeric;
extern PGDLLIMPORT char *locale_time;
extern PGDLLIMPORT int icu_validation_level;

/*
 * lc_time localization cache
 *
 * NOTE(review): presumably populated by cache_locale_time(), declared
 * below -- confirm against the implementation.
 */
extern PGDLLIMPORT char *localized_abbrev_days[];
extern PGDLLIMPORT char *localized_full_days[];
extern PGDLLIMPORT char *localized_abbrev_months[];
extern PGDLLIMPORT char *localized_full_months[];

/* is the database's LC_CTYPE the C locale? */
extern PGDLLIMPORT bool database_ctype_is_c;

/*
 * NOTE(review): canonname is a char ** and so looks like an output
 * parameter; its exact contract is not visible from this header -- confirm
 * against the implementation.
 */
extern bool check_locale(int category, const char *locale, char **canonname);
extern char *pg_perm_setlocale(int category, const char *locale);

/*
 * Return the POSIX lconv struct (contains number/money formatting
 * information) with locale information for all categories.
 */
extern struct lconv *PGLC_localeconv(void);
extern void cache_locale_time(void);

/*
 * pg_locale_t is a pointer to struct pg_locale_struct.  The struct is only
 * forward-declared here so that the method tables that follow can take a
 * pg_locale_t argument before the struct itself is defined.
 */
struct pg_locale_struct;
typedef struct pg_locale_struct *pg_locale_t;
/*
 * Method table defining collation behavior.
 *
 * Every callback receives the pg_locale_t it is being invoked for as its
 * last argument.
 */
struct collate_methods
{
	/* required */
	int			(*strncoll) (const char *arg1, ssize_t len1,
							 const char *arg2, ssize_t len2,
							 pg_locale_t locale);

	/* required */
	size_t		(*strnxfrm) (char *dest, size_t destsize,
							 const char *src, ssize_t srclen,
							 pg_locale_t locale);

	/* optional */
	size_t		(*strnxfrm_prefix) (char *dest, size_t destsize,
									const char *src, ssize_t srclen,
									pg_locale_t locale);

	/*
	 * Set strxfrm_is_safe to false when the strnxfrm method is not trusted
	 * to return the correct results.  If set to false, the method will not
	 * be used in most cases, but the planner still expects it to be there
	 * for estimation purposes (where incorrect results are acceptable).
	 */
	bool		strxfrm_is_safe;
};
/*
 * Method table defining character-classification and case-mapping behavior.
 *
 * Every callback receives the pg_locale_t it is being invoked for as its
 * last argument.
 */
struct ctype_methods
{
	/* case mapping: LOWER()/INITCAP()/UPPER() */
	size_t		(*strlower) (char *dest, size_t destsize,
							 const char *src, ssize_t srclen,
							 pg_locale_t locale);
	size_t		(*strtitle) (char *dest, size_t destsize,
							 const char *src, ssize_t srclen,
							 pg_locale_t locale);
	size_t		(*strupper) (char *dest, size_t destsize,
							 const char *src, ssize_t srclen,
							 pg_locale_t locale);
	size_t		(*strfold) (char *dest, size_t destsize,
							const char *src, ssize_t srclen,
							pg_locale_t locale);

	/* required */
	bool		(*wc_isdigit) (pg_wchar wc, pg_locale_t locale);
	bool		(*wc_isalpha) (pg_wchar wc, pg_locale_t locale);
	bool		(*wc_isalnum) (pg_wchar wc, pg_locale_t locale);
	bool		(*wc_isupper) (pg_wchar wc, pg_locale_t locale);
	bool		(*wc_islower) (pg_wchar wc, pg_locale_t locale);
	bool		(*wc_isgraph) (pg_wchar wc, pg_locale_t locale);
	bool		(*wc_isprint) (pg_wchar wc, pg_locale_t locale);
	bool		(*wc_ispunct) (pg_wchar wc, pg_locale_t locale);
	bool		(*wc_isspace) (pg_wchar wc, pg_locale_t locale);
	pg_wchar	(*wc_toupper) (pg_wchar wc, pg_locale_t locale);
	pg_wchar	(*wc_tolower) (pg_wchar wc, pg_locale_t locale);

	/* required */
	bool		(*char_is_cased) (char ch, pg_locale_t locale);

	/*
	 * Optional.  If defined, will only be called for single-byte encodings.
	 * If not defined, or if the encoding is multibyte, will fall back to
	 * pg_strlower().
	 */
	char		(*char_tolower) (unsigned char ch, pg_locale_t locale);

	/*
	 * For regex and pattern matching efficiency, the maximum char value
	 * supported by the above methods.  If zero, limit is set by regex code.
	 */
	pg_wchar	max_chr;
};
/*
* We use a discriminated union to hold either a locale_t or an ICU collator.
* pg_locale_t is occasionally checked for truth, so make it a pointer.
*
* Also, hold two flags: whether the collation's LC_COLLATE or LC_CTYPE is C
* (or POSIX), so we can optimize a few code paths in various places. For the
* built-in C and POSIX collations, we can know that without even doing a
* cache lookup, but we want to support aliases for C/POSIX too. For the
* "default" collation, there are separate static cache variables, since
* consulting the pg_collation catalog doesn't tell us what we need.
*
* Note that some code relies on the flags not reporting false negatives
* (that is, saying it's not C when it is). For example, char2wchar()
* could fail if the locale is C, so str_tolower() shouldn't call it
* in that case.
*/
struct pg_locale_struct
{
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
7 years ago
bool deterministic;
bool collate_is_c;
bool ctype_is_c;
bool is_default;
const struct collate_methods *collate; /* NULL if collate_is_c */
const struct ctype_methods *ctype; /* NULL if ctype_is_c */
union
{
struct
{
const char *locale;
bool casemap_full;
} builtin;
locale_t lt;
#ifdef USE_ICU
struct
{
const char *locale;
UCollator *ucol;
} icu;
#endif
} info;
};
/* collation setup and lookup by collation OID */
extern void init_database_collation(void);
extern pg_locale_t pg_newlocale_from_collation(Oid collid);
extern char *get_collation_actual_version(char collprovider,
										  const char *collcollate);

/* single-character helpers */
extern bool char_is_cased(char ch, pg_locale_t locale);
extern bool char_tolower_enabled(pg_locale_t locale);
extern char char_tolower(unsigned char ch, pg_locale_t locale);

/*
 * String case-mapping entry points.  All four share one signature:
 * destination buffer and its size, source string and its length, locale.
 */
extern size_t pg_strlower(char *dst, size_t dstsize,
						  const char *src, ssize_t srclen,
						  pg_locale_t locale);
extern size_t pg_strtitle(char *dst, size_t dstsize,
						  const char *src, ssize_t srclen,
						  pg_locale_t locale);
extern size_t pg_strupper(char *dst, size_t dstsize,
						  const char *src, ssize_t srclen,
						  pg_locale_t locale);
extern size_t pg_strfold(char *dst, size_t dstsize,
						 const char *src, ssize_t srclen,
						 pg_locale_t locale);

/* string comparison: one variant without and one with explicit lengths */
extern int	pg_strcoll(const char *arg1, const char *arg2,
					   pg_locale_t locale);
extern int	pg_strncoll(const char *arg1, ssize_t len1,
						const char *arg2, ssize_t len2,
						pg_locale_t locale);

/*
 * strxfrm-style transforms.
 *
 * Mind the argument order: the pg_strxfrm*() variants take
 * (dest, src, destsize), whereas the pg_strnxfrm*() variants take
 * (dest, destsize, src, srclen).
 */
extern bool pg_strxfrm_enabled(pg_locale_t locale);
extern size_t pg_strxfrm(char *dest, const char *src, size_t destsize,
						 pg_locale_t locale);
extern size_t pg_strnxfrm(char *dest, size_t destsize,
						  const char *src, ssize_t srclen,
						  pg_locale_t locale);
extern bool pg_strxfrm_prefix_enabled(pg_locale_t locale);
extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
								pg_locale_t locale);
extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize,
								 const char *src, ssize_t srclen,
								 pg_locale_t locale);
/* builtin-provider locale helpers */
extern int	builtin_locale_encoding(const char *locale);
extern const char *builtin_validate_locale(int encoding, const char *locale);

/* ICU-provider locale helpers */
extern void icu_validate_locale(const char *loc_str);

/*
 * NOTE(review): presumably canonicalizes an ICU locale string into a BCP 47
 * language tag (e.g. "fr_CA.UTF-8" -> "fr-CA"), with elevel being the error
 * level used when that fails -- confirm against the implementation.  The
 * resulting tag can vary slightly between ICU versions.
 */
extern char *icu_language_tag(const char *loc_str, int elevel);

extern void report_newlocale_failure(const char *localename);
/*
 * These functions convert from/to libc's wchar_t, *not* pg_wchar.
 *
 * Accordingly they take a libc locale_t rather than a pg_locale_t.
 */
extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
						 locale_t loc);
extern size_t char2wchar(wchar_t *to, size_t tolen,
						 const char *from, size_t fromlen, locale_t loc);

#endif /* _PG_LOCALE_ */