|
|
|
/*-----------------------------------------------------------------------
|
|
|
|
*
|
|
|
|
* PostgreSQL locale utilities
|
|
|
|
*
|
|
|
|
* src/include/utils/pg_locale.h
|
|
|
|
*
|
|
|
|
* Copyright (c) 2002-2025, PostgreSQL Global Development Group
|
|
|
|
*
|
|
|
|
*-----------------------------------------------------------------------
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _PG_LOCALE_
|
|
|
|
#define _PG_LOCALE_
|
|
|
|
|
|
|
|
#include "mb/pg_wchar.h"
|
|
|
|
|
|
|
|
#ifdef USE_ICU
|
|
|
|
/* only include the C APIs, to avoid errors in cpluspluscheck */
|
|
|
|
#undef U_SHOW_CPLUSPLUS_API
|
|
|
|
#define U_SHOW_CPLUSPLUS_API 0
|
|
|
|
#include <unicode/ucol.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* use for libc locale names */
|
|
|
|
#define LOCALE_NAME_BUFLEN 128
|
|
|
|
|
|
|
|
/* GUC settings */
|
|
|
|
extern PGDLLIMPORT char *locale_messages;
|
|
|
|
extern PGDLLIMPORT char *locale_monetary;
|
|
|
|
extern PGDLLIMPORT char *locale_numeric;
|
|
|
|
extern PGDLLIMPORT char *locale_time;
|
|
|
|
extern PGDLLIMPORT int icu_validation_level;
|
|
|
|
|
|
|
|
/* lc_time localization cache */
|
|
|
|
extern PGDLLIMPORT char *localized_abbrev_days[];
|
|
|
|
extern PGDLLIMPORT char *localized_full_days[];
|
|
|
|
extern PGDLLIMPORT char *localized_abbrev_months[];
|
|
|
|
extern PGDLLIMPORT char *localized_full_months[];
|
|
|
|
|
|
|
|
/* is the database's LC_CTYPE the C locale? */
|
|
|
|
extern PGDLLIMPORT bool database_ctype_is_c;
|
|
|
|
|
Replace empty locale name with implied value in CREATE DATABASE and initdb.
setlocale() accepts locale name "" as meaning "the locale specified by the
process's environment variables". Historically we've accepted that for
Postgres' locale settings, too. However, it's fairly unsafe to store an
empty string in a new database's pg_database.datcollate or datctype fields,
because then the interpretation could vary across postmaster restarts,
possibly resulting in index corruption and other unpleasantness.
Instead, we should expand "" to whatever it means at the moment of calling
CREATE DATABASE, which we can do by saving the value returned by
setlocale().
For consistency, make initdb set up the initial lc_xxx parameter values the
same way. initdb was already doing the right thing for empty locale names,
but it did not replace non-empty names with setlocale results. On a
platform where setlocale chooses to canonicalize the spellings of locale
names, this would result in annoying inconsistency. (It seems that popular
implementations of setlocale don't do such canonicalization, which is a
pity, but the POSIX spec certainly allows it to be done.) The same risk
of inconsistency leads me to not venture back-patching this, although it
could certainly be seen as a longstanding bug.
Per report from Jeff Davis, though this is not his proposed patch.
14 years ago
|
|
|
extern bool check_locale(int category, const char *locale, char **canonname);
|
|
|
|
extern char *pg_perm_setlocale(int category, const char *locale);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the POSIX lconv struct (contains number/money formatting
|
|
|
|
* information) with locale information for all categories.
|
|
|
|
*/
|
|
|
|
extern struct lconv *PGLC_localeconv(void);
|
|
|
|
|
|
|
|
extern void cache_locale_time(void);
|
|
|
|
|
|
|
|
|
|
|
|
struct pg_locale_struct;
|
|
|
|
typedef struct pg_locale_struct *pg_locale_t;
|
|
|
|
|
|
|
|
/* methods that define collation behavior */
|
|
|
|
struct collate_methods
|
|
|
|
{
|
|
|
|
/* required */
|
|
|
|
int (*strncoll) (const char *arg1, ssize_t len1,
|
|
|
|
const char *arg2, ssize_t len2,
|
|
|
|
pg_locale_t locale);
|
|
|
|
|
|
|
|
/* required */
|
|
|
|
size_t (*strnxfrm) (char *dest, size_t destsize,
|
|
|
|
const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale);
|
|
|
|
|
|
|
|
/* optional */
|
|
|
|
size_t (*strnxfrm_prefix) (char *dest, size_t destsize,
|
|
|
|
const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the strnxfrm method is not trusted to return the correct results,
|
|
|
|
* set strxfrm_is_safe to false. It set to false, the method will not be
|
|
|
|
* used in most cases, but the planner still expects it to be there for
|
|
|
|
* estimation purposes (where incorrect results are acceptable).
|
|
|
|
*/
|
|
|
|
bool strxfrm_is_safe;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct ctype_methods
|
|
|
|
{
|
|
|
|
/* case mapping: LOWER()/INITCAP()/UPPER() */
|
|
|
|
size_t (*strlower) (char *dest, size_t destsize,
|
|
|
|
const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale);
|
|
|
|
size_t (*strtitle) (char *dest, size_t destsize,
|
|
|
|
const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale);
|
|
|
|
size_t (*strupper) (char *dest, size_t destsize,
|
|
|
|
const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale);
|
|
|
|
size_t (*strfold) (char *dest, size_t destsize,
|
|
|
|
const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale);
|
|
|
|
|
|
|
|
/* required */
|
|
|
|
bool (*wc_isdigit) (pg_wchar wc, pg_locale_t locale);
|
|
|
|
bool (*wc_isalpha) (pg_wchar wc, pg_locale_t locale);
|
|
|
|
bool (*wc_isalnum) (pg_wchar wc, pg_locale_t locale);
|
|
|
|
bool (*wc_isupper) (pg_wchar wc, pg_locale_t locale);
|
|
|
|
bool (*wc_islower) (pg_wchar wc, pg_locale_t locale);
|
|
|
|
bool (*wc_isgraph) (pg_wchar wc, pg_locale_t locale);
|
|
|
|
bool (*wc_isprint) (pg_wchar wc, pg_locale_t locale);
|
|
|
|
bool (*wc_ispunct) (pg_wchar wc, pg_locale_t locale);
|
|
|
|
bool (*wc_isspace) (pg_wchar wc, pg_locale_t locale);
|
|
|
|
pg_wchar (*wc_toupper) (pg_wchar wc, pg_locale_t locale);
|
|
|
|
pg_wchar (*wc_tolower) (pg_wchar wc, pg_locale_t locale);
|
|
|
|
|
|
|
|
/* required */
|
|
|
|
bool (*char_is_cased) (char ch, pg_locale_t locale);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Optional. If defined, will only be called for single-byte encodings. If
|
|
|
|
* not defined, or if the encoding is multibyte, will fall back to
|
|
|
|
* pg_strlower().
|
|
|
|
*/
|
|
|
|
char (*char_tolower) (unsigned char ch, pg_locale_t locale);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* For regex and pattern matching efficiency, the maximum char value
|
|
|
|
* supported by the above methods. If zero, limit is set by regex code.
|
|
|
|
*/
|
|
|
|
pg_wchar max_chr;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We use a discriminated union to hold either a locale_t or an ICU collator.
|
|
|
|
* pg_locale_t is occasionally checked for truth, so make it a pointer.
|
|
|
|
*
|
|
|
|
* Also, hold two flags: whether the collation's LC_COLLATE or LC_CTYPE is C
|
|
|
|
* (or POSIX), so we can optimize a few code paths in various places. For the
|
|
|
|
* built-in C and POSIX collations, we can know that without even doing a
|
|
|
|
* cache lookup, but we want to support aliases for C/POSIX too. For the
|
|
|
|
* "default" collation, there are separate static cache variables, since
|
|
|
|
* consulting the pg_collation catalog doesn't tell us what we need.
|
|
|
|
*
|
|
|
|
* Note that some code relies on the flags not reporting false negatives
|
|
|
|
* (that is, saying it's not C when it is). For example, char2wchar()
|
|
|
|
* could fail if the locale is C, so str_tolower() shouldn't call it
|
|
|
|
* in that case.
|
|
|
|
*/
|
|
|
|
struct pg_locale_struct
|
|
|
|
{
|
|
|
|
bool deterministic;
|
|
|
|
bool collate_is_c;
|
|
|
|
bool ctype_is_c;
|
|
|
|
bool is_default;
|
|
|
|
|
|
|
|
const struct collate_methods *collate; /* NULL if collate_is_c */
|
|
|
|
const struct ctype_methods *ctype; /* NULL if ctype_is_c */
|
|
|
|
|
|
|
|
union
|
|
|
|
{
|
Introduce "builtin" collation provider.
New provider for collations, like "libc" or "icu", but without any
external dependency.
Initially, the only locale supported by the builtin provider is "C",
which is identical to the libc provider's "C" locale. The libc
provider's "C" locale has always been treated as a special case that
uses an internal implementation, without using libc at all -- so the
new builtin provider uses the same implementation.
The builtin provider's locale is independent of the server environment
variables LC_COLLATE and LC_CTYPE. Using the builtin provider, the
database collation locale can be "C" while LC_COLLATE and LC_CTYPE are
set to "en_US", which is impossible with the libc provider.
By offering a new builtin provider, it clarifies that the semantics of
a collation using this provider will never depend on libc, and makes
it easier to document the behavior.
Discussion: https://postgr.es/m/ab925f69-5f9d-f85e-b87c-bd2a44798659@joeconway.com
Discussion: https://postgr.es/m/dd9261f4-7a98-4565-93ec-336c1c110d90@manitou-mail.org
Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
2 years ago
|
|
|
struct
|
|
|
|
{
|
|
|
|
const char *locale;
|
Support PG_UNICODE_FAST locale in the builtin collation provider.
The PG_UNICODE_FAST locale uses code point sort order (fast,
memcmp-based) combined with Unicode character semantics. The character
semantics are based on Unicode full case mapping.
Full case mapping can map a single codepoint to multiple codepoints,
such as "ß" uppercasing to "SS". Additionally, it handles
context-sensitive mappings like the "final sigma", and it uses
titlecase mappings such as "Dž" when titlecasing (rather than plain
uppercase mappings).
Importantly, the uppercasing of "ß" as "SS" is specifically mentioned
by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics
for case mapping and pattern matching, so if we changed it to use the
PG_UNICODE_FAST locale, it would offer better compliance with the
standard. For now, though, do not change the behavior of UCS_BASIC.
Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
8 months ago
|
|
|
bool casemap_full;
|
Introduce "builtin" collation provider.
New provider for collations, like "libc" or "icu", but without any
external dependency.
Initially, the only locale supported by the builtin provider is "C",
which is identical to the libc provider's "C" locale. The libc
provider's "C" locale has always been treated as a special case that
uses an internal implementation, without using libc at all -- so the
new builtin provider uses the same implementation.
The builtin provider's locale is independent of the server environment
variables LC_COLLATE and LC_CTYPE. Using the builtin provider, the
database collation locale can be "C" while LC_COLLATE and LC_CTYPE are
set to "en_US", which is impossible with the libc provider.
By offering a new builtin provider, it clarifies that the semantics of
a collation using this provider will never depend on libc, and makes
it easier to document the behavior.
Discussion: https://postgr.es/m/ab925f69-5f9d-f85e-b87c-bd2a44798659@joeconway.com
Discussion: https://postgr.es/m/dd9261f4-7a98-4565-93ec-336c1c110d90@manitou-mail.org
Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
2 years ago
|
|
|
} builtin;
|
|
|
|
locale_t lt;
|
|
|
|
#ifdef USE_ICU
|
|
|
|
struct
|
|
|
|
{
|
|
|
|
const char *locale;
|
|
|
|
UCollator *ucol;
|
|
|
|
} icu;
|
|
|
|
#endif
|
|
|
|
} info;
|
|
|
|
};
|
|
|
|
|
|
|
|
extern void init_database_collation(void);
|
|
|
|
extern pg_locale_t pg_newlocale_from_collation(Oid collid);
|
|
|
|
|
|
|
|
extern char *get_collation_actual_version(char collprovider, const char *collcollate);
|
|
|
|
|
|
|
|
extern bool char_is_cased(char ch, pg_locale_t locale);
|
|
|
|
extern bool char_tolower_enabled(pg_locale_t locale);
|
|
|
|
extern char char_tolower(unsigned char ch, pg_locale_t locale);
|
|
|
|
extern size_t pg_strlower(char *dst, size_t dstsize,
|
|
|
|
const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale);
|
|
|
|
extern size_t pg_strtitle(char *dst, size_t dstsize,
|
|
|
|
const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale);
|
|
|
|
extern size_t pg_strupper(char *dst, size_t dstsize,
|
|
|
|
const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale);
|
|
|
|
extern size_t pg_strfold(char *dst, size_t dstsize,
|
|
|
|
const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale);
|
|
|
|
extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
|
|
|
|
extern int pg_strncoll(const char *arg1, ssize_t len1,
|
|
|
|
const char *arg2, ssize_t len2, pg_locale_t locale);
|
|
|
|
extern bool pg_strxfrm_enabled(pg_locale_t locale);
|
|
|
|
extern size_t pg_strxfrm(char *dest, const char *src, size_t destsize,
|
|
|
|
pg_locale_t locale);
|
|
|
|
extern size_t pg_strnxfrm(char *dest, size_t destsize, const char *src,
|
|
|
|
ssize_t srclen, pg_locale_t locale);
|
|
|
|
extern bool pg_strxfrm_prefix_enabled(pg_locale_t locale);
|
|
|
|
extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
|
|
|
|
pg_locale_t locale);
|
|
|
|
extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
|
|
|
|
ssize_t srclen, pg_locale_t locale);
|
|
|
|
|
|
|
|
extern int builtin_locale_encoding(const char *locale);
|
|
|
|
extern const char *builtin_validate_locale(int encoding, const char *locale);
|
|
|
|
extern void icu_validate_locale(const char *loc_str);
|
Canonicalize ICU locale names to language tags.
Convert to BCP47 language tags before storing in the catalog, except
during binary upgrade or when the locale comes from an existing
collation or template database.
The resulting language tags can vary slightly between ICU
versions. For instance, "@colBackwards=yes" is converted to
"und-u-kb-true" in older versions of ICU, and to the simpler (but
equivalent) "und-u-kb" in newer versions.
The process of canonicalizing to a language tag also understands more
input locale string formats than ucol_open(). For instance,
"fr_CA.UTF-8" is misinterpreted by ucol_open() and the region is
ignored; effectively treating it the same as the locale "fr" and
opening the wrong collator. Canonicalization properly interprets the
language and region, resulting in the language tag "fr-CA", which can
then be understood by ucol_open().
This commit fixes a problem in prior versions due to ucol_open()
misinterpreting locale strings as described above. For instance,
creating an ICU collation with locale "fr_CA.UTF-8" would store that
string directly in the catalog, which would later be passed to (and
misinterpreted by) ucol_open(). After this commit, the locale string
will be canonicalized to language tag "fr-CA" in the catalog, which
will be properly understood by ucol_open(). Because this fix affects
the resulting collator, we cannot change the locale string stored in
the catalog for existing databases or collations; otherwise we'd risk
corrupting indexes. Therefore, only canonicalize locales for
newly-created (not upgraded) collations/databases. For similar
reasons, do not backport.
Discussion: https://postgr.es/m/8c7af6820aed94dc7bc259d2aa7f9663518e6137.camel@j-davis.com
Reviewed-by: Peter Eisentraut
2 years ago
|
|
|
extern char *icu_language_tag(const char *loc_str, int elevel);
|
|
|
|
extern void report_newlocale_failure(const char *localename);
|
|
|
|
|
|
|
|
/* These functions convert from/to libc's wchar_t, *not* pg_wchar */
|
|
|
|
extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
|
|
|
|
locale_t loc);
|
|
|
|
extern size_t char2wchar(wchar_t *to, size_t tolen,
|
|
|
|
const char *from, size_t fromlen, locale_t loc);
|
|
|
|
|
Phase 2 of pgindent updates.
Change pg_bsd_indent to follow upstream rules for placement of comments
to the right of code, and remove pgindent hack that caused comments
following #endif to not obey the general rule.
Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using
the published version of pg_bsd_indent, but a hacked-up version that
tried to minimize the amount of movement of comments to the right of
code. The situation of interest is where such a comment has to be
moved to the right of its default placement at column 33 because there's
code there. BSD indent has always moved right in units of tab stops
in such cases --- but in the previous incarnation, indent was working
in 8-space tab stops, while now it knows we use 4-space tabs. So the
net result is that in about half the cases, such comments are placed
one tab stop left of before. This is better all around: it leaves
more room on the line for comment text, and it means that in such
cases the comment uniformly starts at the next 4-space tab stop after
the code, rather than sometimes one and sometimes two tabs after.
Also, ensure that comments following #endif are indented the same
as comments following other preprocessor commands such as #else.
That inconsistency turns out to have been self-inflicted damage
from a poorly-thought-through post-indent "fixup" in pgindent.
This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.
Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
8 years ago
|
|
|
#endif /* _PG_LOCALE_ */
|