|
|
|
/*-----------------------------------------------------------------------
 *
 * PostgreSQL locale utilities for builtin provider
 *
 * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
 *
 * src/backend/utils/adt/pg_locale_builtin.c
 *
 *-----------------------------------------------------------------------
 */
|
|
|
|
|
|
|
|
#include "postgres.h"
|
|
|
|
|
|
|
|
#include "catalog/pg_database.h"
|
|
|
|
#include "catalog/pg_collation.h"
|
|
|
|
#include "common/unicode_case.h"
|
|
|
|
#include "common/unicode_category.h"
|
|
|
|
#include "mb/pg_wchar.h"
|
|
|
|
#include "miscadmin.h"
|
|
|
|
#include "utils/builtins.h"
|
|
|
|
#include "utils/memutils.h"
|
|
|
|
#include "utils/pg_locale.h"
|
|
|
|
#include "utils/syscache.h"
|
|
|
|
|
|
|
|
extern pg_locale_t create_pg_locale_builtin(Oid collid,
|
|
|
|
MemoryContext context);
|
|
|
|
extern char *get_collation_actual_version_builtin(const char *collcollate);
|
|
|
|
extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src,
|
|
|
|
ssize_t srclen, pg_locale_t locale);
|
|
|
|
extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
|
|
|
|
ssize_t srclen, pg_locale_t locale);
|
|
|
|
extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
|
|
|
|
ssize_t srclen, pg_locale_t locale);
|
|
|
|
extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
|
|
|
|
ssize_t srclen, pg_locale_t locale);
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Iteration state for initcap_wbnext(), the word boundary callback that
 * strtitle_builtin() hands to unicode_strtitle().
 */
struct WordBoundaryState
{
	const char *str;			/* string being scanned */
	size_t		len;			/* scan limit, in bytes */
	size_t		offset;			/* byte offset of next character to examine */
	bool		init;			/* has the first boundary been reported? */
	bool		prev_alnum;		/* pg_u_isalnum() result for the current run */
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Simple word boundary iterator that draws boundaries each time the result of
|
|
|
|
* pg_u_isalnum() changes.
|
|
|
|
*/
|
|
|
|
static size_t
|
|
|
|
initcap_wbnext(void *state)
|
|
|
|
{
|
|
|
|
struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
|
|
|
|
|
|
|
|
while (wbstate->offset < wbstate->len &&
|
|
|
|
wbstate->str[wbstate->offset] != '\0')
|
|
|
|
{
|
|
|
|
pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
|
|
|
|
wbstate->offset);
|
|
|
|
bool curr_alnum = pg_u_isalnum(u, true);
|
|
|
|
|
|
|
|
if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
|
|
|
|
{
|
|
|
|
size_t prev_offset = wbstate->offset;
|
|
|
|
|
|
|
|
wbstate->init = true;
|
|
|
|
wbstate->offset += unicode_utf8len(u);
|
|
|
|
wbstate->prev_alnum = curr_alnum;
|
|
|
|
return prev_offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
wbstate->offset += unicode_utf8len(u);
|
|
|
|
}
|
|
|
|
|
|
|
|
return wbstate->len;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t
|
|
|
|
strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale)
|
|
|
|
{
|
Support PG_UNICODE_FAST locale in the builtin collation provider.
The PG_UNICODE_FAST locale uses code point sort order (fast,
memcmp-based) combined with Unicode character semantics. The character
semantics are based on Unicode full case mapping.
Full case mapping can map a single codepoint to multiple codepoints,
such as "ß" uppercasing to "SS". Additionally, it handles
context-sensitive mappings like the "final sigma", and it uses
titlecase mappings such as "Dž" when titlecasing (rather than plain
uppercase mappings).
Importantly, the uppercasing of "ß" as "SS" is specifically mentioned
by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics
for case mapping and pattern matching, so if we changed it to use the
PG_UNICODE_FAST locale, it would offer better compliance with the
standard. For now, though, do not change the behavior of UCS_BASIC.
Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
5 months ago
|
|
|
return unicode_strlower(dest, destsize, src, srclen,
|
|
|
|
locale->info.builtin.casemap_full);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t
|
|
|
|
strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale)
|
|
|
|
{
|
|
|
|
struct WordBoundaryState wbstate = {
|
|
|
|
.str = src,
|
|
|
|
.len = srclen,
|
|
|
|
.offset = 0,
|
|
|
|
.init = false,
|
|
|
|
.prev_alnum = false,
|
|
|
|
};
|
|
|
|
|
Support PG_UNICODE_FAST locale in the builtin collation provider.
The PG_UNICODE_FAST locale uses code point sort order (fast,
memcmp-based) combined with Unicode character semantics. The character
semantics are based on Unicode full case mapping.
Full case mapping can map a single codepoint to multiple codepoints,
such as "ß" uppercasing to "SS". Additionally, it handles
context-sensitive mappings like the "final sigma", and it uses
titlecase mappings such as "Dž" when titlecasing (rather than plain
uppercase mappings).
Importantly, the uppercasing of "ß" as "SS" is specifically mentioned
by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics
for case mapping and pattern matching, so if we changed it to use the
PG_UNICODE_FAST locale, it would offer better compliance with the
standard. For now, though, do not change the behavior of UCS_BASIC.
Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
5 months ago
|
|
|
return unicode_strtitle(dest, destsize, src, srclen,
|
|
|
|
locale->info.builtin.casemap_full,
|
|
|
|
initcap_wbnext, &wbstate);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t
|
|
|
|
strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale)
|
|
|
|
{
|
Support PG_UNICODE_FAST locale in the builtin collation provider.
The PG_UNICODE_FAST locale uses code point sort order (fast,
memcmp-based) combined with Unicode character semantics. The character
semantics are based on Unicode full case mapping.
Full case mapping can map a single codepoint to multiple codepoints,
such as "ß" uppercasing to "SS". Additionally, it handles
context-sensitive mappings like the "final sigma", and it uses
titlecase mappings such as "Dž" when titlecasing (rather than plain
uppercase mappings).
Importantly, the uppercasing of "ß" as "SS" is specifically mentioned
by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics
for case mapping and pattern matching, so if we changed it to use the
PG_UNICODE_FAST locale, it would offer better compliance with the
standard. For now, though, do not change the behavior of UCS_BASIC.
Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
5 months ago
|
|
|
return unicode_strupper(dest, destsize, src, srclen,
|
|
|
|
locale->info.builtin.casemap_full);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t
|
|
|
|
strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
|
|
|
|
pg_locale_t locale)
|
|
|
|
{
|
|
|
|
return unicode_strfold(dest, destsize, src, srclen,
|
|
|
|
locale->info.builtin.casemap_full);
|
|
|
|
}
|
|
|
|
|
|
|
|
pg_locale_t
|
|
|
|
create_pg_locale_builtin(Oid collid, MemoryContext context)
|
|
|
|
{
|
|
|
|
const char *locstr;
|
|
|
|
pg_locale_t result;
|
|
|
|
|
|
|
|
if (collid == DEFAULT_COLLATION_OID)
|
|
|
|
{
|
|
|
|
HeapTuple tp;
|
|
|
|
Datum datum;
|
|
|
|
|
|
|
|
tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
|
|
|
|
if (!HeapTupleIsValid(tp))
|
|
|
|
elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
|
|
|
|
datum = SysCacheGetAttrNotNull(DATABASEOID, tp,
|
|
|
|
Anum_pg_database_datlocale);
|
|
|
|
locstr = TextDatumGetCString(datum);
|
|
|
|
ReleaseSysCache(tp);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
HeapTuple tp;
|
|
|
|
Datum datum;
|
|
|
|
|
|
|
|
tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
|
|
|
|
if (!HeapTupleIsValid(tp))
|
|
|
|
elog(ERROR, "cache lookup failed for collation %u", collid);
|
|
|
|
datum = SysCacheGetAttrNotNull(COLLOID, tp,
|
|
|
|
Anum_pg_collation_colllocale);
|
|
|
|
locstr = TextDatumGetCString(datum);
|
|
|
|
ReleaseSysCache(tp);
|
|
|
|
}
|
|
|
|
|
|
|
|
builtin_validate_locale(GetDatabaseEncoding(), locstr);
|
|
|
|
|
|
|
|
result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
|
|
|
|
|
|
|
|
result->info.builtin.locale = MemoryContextStrdup(context, locstr);
|
Support PG_UNICODE_FAST locale in the builtin collation provider.
The PG_UNICODE_FAST locale uses code point sort order (fast,
memcmp-based) combined with Unicode character semantics. The character
semantics are based on Unicode full case mapping.
Full case mapping can map a single codepoint to multiple codepoints,
such as "ß" uppercasing to "SS". Additionally, it handles
context-sensitive mappings like the "final sigma", and it uses
titlecase mappings such as "Dž" when titlecasing (rather than plain
uppercase mappings).
Importantly, the uppercasing of "ß" as "SS" is specifically mentioned
by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics
for case mapping and pattern matching, so if we changed it to use the
PG_UNICODE_FAST locale, it would offer better compliance with the
standard. For now, though, do not change the behavior of UCS_BASIC.
Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
5 months ago
|
|
|
result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0);
|
|
|
|
result->provider = COLLPROVIDER_BUILTIN;
|
|
|
|
result->deterministic = true;
|
|
|
|
result->collate_is_c = true;
|
|
|
|
result->ctype_is_c = (strcmp(locstr, "C") == 0);
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
char *
|
|
|
|
get_collation_actual_version_builtin(const char *collcollate)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* The only two supported locales (C and C.UTF-8) are both based on memcmp
|
|
|
|
* and are not expected to change, but track the version anyway.
|
|
|
|
*
|
|
|
|
* Note that the character semantics may change for some locales, but the
|
|
|
|
* collation version only tracks changes to sort order.
|
|
|
|
*/
|
|
|
|
if (strcmp(collcollate, "C") == 0)
|
|
|
|
return "1";
|
|
|
|
else if (strcmp(collcollate, "C.UTF-8") == 0)
|
|
|
|
return "1";
|
Support PG_UNICODE_FAST locale in the builtin collation provider.
The PG_UNICODE_FAST locale uses code point sort order (fast,
memcmp-based) combined with Unicode character semantics. The character
semantics are based on Unicode full case mapping.
Full case mapping can map a single codepoint to multiple codepoints,
such as "ß" uppercasing to "SS". Additionally, it handles
context-sensitive mappings like the "final sigma", and it uses
titlecase mappings such as "Dž" when titlecasing (rather than plain
uppercase mappings).
Importantly, the uppercasing of "ß" as "SS" is specifically mentioned
by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics
for case mapping and pattern matching, so if we changed it to use the
PG_UNICODE_FAST locale, it would offer better compliance with the
standard. For now, though, do not change the behavior of UCS_BASIC.
Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com
Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org
Reviewed-by: Peter Eisentraut, Daniel Verite
5 months ago
|
|
|
else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0)
|
|
|
|
return "1";
|
|
|
|
else
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
|
|
|
|
errmsg("invalid locale name \"%s\" for builtin provider",
|
|
|
|
collcollate)));
|
|
|
|
|
|
|
|
return NULL; /* keep compiler quiet */
|
|
|
|
}
|