postgres/src/backend/utils/adt/pg_locale_builtin.c

/*-----------------------------------------------------------------------
 *
 * PostgreSQL locale utilities for builtin provider
 *
 * Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group
 *
 * src/backend/utils/adt/pg_locale_builtin.c
 *
 *-----------------------------------------------------------------------
 */

#include "postgres.h"

#include "catalog/pg_database.h"
#include "catalog/pg_collation.h"
#include "common/unicode_case.h"
#include "common/unicode_category.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/pg_locale.h"
#include "utils/syscache.h"

/*
 * Prototypes for the non-static functions defined in this file.
 *
 * Parameter names are kept identical to those used by the definitions below
 * (dest/destsize), so that a prototype and its definition read the same.
 */
extern pg_locale_t create_pg_locale_builtin(Oid collid,
											MemoryContext context);
extern char *get_collation_actual_version_builtin(const char *collcollate);
extern size_t strlower_builtin(char *dest, size_t destsize, const char *src,
							   ssize_t srclen, pg_locale_t locale);
extern size_t strtitle_builtin(char *dest, size_t destsize, const char *src,
							   ssize_t srclen, pg_locale_t locale);
extern size_t strupper_builtin(char *dest, size_t destsize, const char *src,
							   ssize_t srclen, pg_locale_t locale);
extern size_t strfold_builtin(char *dest, size_t destsize, const char *src,
							  ssize_t srclen, pg_locale_t locale);


/*
 * Iteration state for initcap_wbnext().  strtitle_builtin() fills it in and
 * passes its address as the opaque "state" argument.
 */
struct WordBoundaryState
{
	const char *str;			/* text being scanned */
	size_t		len;			/* scan limit for offset; also the value
								 * returned once the scan is exhausted */
	size_t		offset;			/* byte offset of the next character to
								 * examine */
	bool		init;			/* false until the first boundary has been
								 * reported */
	bool		prev_alnum;		/* pg_u_isalnum() result recorded at the most
								 * recently reported boundary */
};

/*
 * Word boundary callback used for titlecasing.
 *
 * Each call resumes scanning at the saved offset and returns the byte offset
 * of the next character at which the pg_u_isalnum() classification differs
 * from the one recorded at the previous boundary.  The very first character
 * examined is always reported as a boundary.  The scan stops at the saved
 * length or at a NUL byte, whichever comes first; once that happens the
 * saved length is returned.
 */
static size_t
initcap_wbnext(void *state)
{
	struct WordBoundaryState *wb = (struct WordBoundaryState *) state;

	for (;;)
	{
		size_t		start = wb->offset;
		pg_wchar	cp;
		bool		is_alnum;
		bool		at_boundary;

		/* out of input: either the length limit or a terminating NUL */
		if (start >= wb->len || wb->str[start] == '\0')
			break;

		cp = utf8_to_unicode((unsigned char *) wb->str + start);
		is_alnum = pg_u_isalnum(cp, true);
		at_boundary = (!wb->init || is_alnum != wb->prev_alnum);

		/* step past this character whether or not it is a boundary */
		wb->offset = start + unicode_utf8len(cp);

		if (at_boundary)
		{
			wb->init = true;
			wb->prev_alnum = is_alnum;
			return start;
		}
	}

	return wb->len;
}

/*
 * Lowercase conversion for the builtin provider.
 *
 * Hands the buffers straight to unicode_strlower(), selecting full case
 * mapping according to the locale's casemap_full flag, and returns that
 * function's result.
 */
size_t
strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
				 pg_locale_t locale)
{
	const bool	full_mapping = locale->info.builtin.casemap_full;
	size_t		needed;

	needed = unicode_strlower(dest, destsize, src, srclen, full_mapping);

	return needed;
}

/*
 * Titlecase conversion for the builtin provider.
 *
 * Sets up a WordBoundaryState positioned at the start of src and lets
 * unicode_strtitle() pull word boundaries from initcap_wbnext().  Full case
 * mapping is selected by the locale's casemap_full flag.  The result of
 * unicode_strtitle() is returned as-is.
 */
size_t
strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
				 pg_locale_t locale)
{
	struct WordBoundaryState boundaries;

	boundaries.str = src;
	boundaries.len = srclen;	/* ssize_t is converted to size_t here */
	boundaries.offset = 0;
	boundaries.init = false;
	boundaries.prev_alnum = false;

	return unicode_strtitle(dest, destsize, src, srclen,
							locale->info.builtin.casemap_full,
							initcap_wbnext, &boundaries);
}

/*
 * Uppercase conversion for the builtin provider.
 *
 * Hands the buffers straight to unicode_strupper(), selecting full case
 * mapping according to the locale's casemap_full flag, and returns that
 * function's result.
 */
size_t
strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
				 pg_locale_t locale)
{
	const bool	full_mapping = locale->info.builtin.casemap_full;
	size_t		needed;

	needed = unicode_strupper(dest, destsize, src, srclen, full_mapping);

	return needed;
}

/*
 * Case folding for the builtin provider.
 *
 * Hands the buffers straight to unicode_strfold(), selecting full case
 * mapping according to the locale's casemap_full flag, and returns that
 * function's result.
 */
size_t
strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
				pg_locale_t locale)
{
	const bool	full_mapping = locale->info.builtin.casemap_full;
	size_t		needed;

	needed = unicode_strfold(dest, destsize, src, srclen, full_mapping);

	return needed;
}

/*
 * Build the pg_locale_t for a collation that uses the builtin provider.
 *
 * The locale name comes from pg_database.datlocale of the current database
 * when collid is DEFAULT_COLLATION_OID, and from pg_collation.colllocale of
 * the given collation otherwise.  A failed syscache lookup raises an error.
 * The name is checked with builtin_validate_locale() against the database
 * encoding before anything is allocated.
 *
 * The returned struct and its copy of the locale name are both allocated in
 * "context".
 */
pg_locale_t
create_pg_locale_builtin(Oid collid, MemoryContext context)
{
	HeapTuple	tuple;
	Datum		locdatum;
	const char *locname;
	pg_locale_t loc;

	if (collid != DEFAULT_COLLATION_OID)
	{
		/* explicit collation: read its pg_collation entry */
		tuple = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
		if (!HeapTupleIsValid(tuple))
			elog(ERROR, "cache lookup failed for collation %u", collid);
		locdatum = SysCacheGetAttrNotNull(COLLOID, tuple,
										  Anum_pg_collation_colllocale);
	}
	else
	{
		/* default collation: the locale is recorded on the database */
		tuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
		if (!HeapTupleIsValid(tuple))
			elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
		locdatum = SysCacheGetAttrNotNull(DATABASEOID, tuple,
										  Anum_pg_database_datlocale);
	}

	/* fetch the string before releasing the syscache tuple */
	locname = TextDatumGetCString(locdatum);
	ReleaseSysCache(tuple);

	builtin_validate_locale(GetDatabaseEncoding(), locname);

	loc = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));

	/* provider-independent properties */
	loc->provider = COLLPROVIDER_BUILTIN;
	loc->deterministic = true;
	loc->collate_is_c = true;
	loc->ctype_is_c = (strcmp(locname, "C") == 0);

	/* builtin-specific properties */
	loc->info.builtin.locale = MemoryContextStrdup(context, locname);
	loc->info.builtin.casemap_full = (strcmp(locname, "PG_UNICODE_FAST") == 0);

	return loc;
}

/*
 * Report the collation version for a builtin-provider locale.
 *
 * The locale names accepted here are "C", "C.UTF-8" and "PG_UNICODE_FAST";
 * each of them currently reports version "1".  Their sort order is based on
 * memcmp and is not expected to change, but the version is tracked anyway.
 * Character semantics may change for some locales; the collation version
 * only tracks changes to sort order.
 *
 * Any other name raises an error.
 */
char *
get_collation_actual_version_builtin(const char *collcollate)
{
	if (strcmp(collcollate, "C") == 0 ||
		strcmp(collcollate, "C.UTF-8") == 0 ||
		strcmp(collcollate, "PG_UNICODE_FAST") == 0)
		return "1";

	ereport(ERROR,
			(errcode(ERRCODE_WRONG_OBJECT_TYPE),
			 errmsg("invalid locale name \"%s\" for builtin provider",
					collcollate)));

	return NULL;				/* keep compiler quiet */
}
Perform provider-specific initialization in new functions. Reviewed-by: Andreas Karlsson Discussion: https://postgr.es/m/4548a168-62cd-457b-8d06-9ba7b985c477@proxel.se 7 months ago			`/*-----------------------------------------------------------------------`
			`*`
			`* PostgreSQL locale utilities for builtin provider`
			`*`
Update copyright for 2025 Backpatch-through: 13 6 months ago			`* Portions Copyright (c) 2002-2025, PostgreSQL Global Development Group`
Perform provider-specific initialization in new functions. Reviewed-by: Andreas Karlsson Discussion: https://postgr.es/m/4548a168-62cd-457b-8d06-9ba7b985c477@proxel.se 7 months ago			`*`
			`* src/backend/utils/adt/pg_locale_builtin.c`
			`*`
			`*-----------------------------------------------------------------------`
			`*/`

			`#include "postgres.h"`

			`#include "catalog/pg_database.h"`
			`#include "catalog/pg_collation.h"`
Refactor string case conversion into provider-specific files. Create API entry points pg_strlower(), etc., that work with any provider and give the caller control over the destination buffer. Then, move provider-specific logic into pg_locale_builtin.c, pg_locale_icu.c, and pg_locale_libc.c as appropriate. Discussion: https://postgr.es/m/7aa46d77b377428058403723440862d12a8a129a.camel@j-davis.com 7 months ago			`#include "common/unicode_case.h"`
			`#include "common/unicode_category.h"`
Perform provider-specific initialization in new functions. Reviewed-by: Andreas Karlsson Discussion: https://postgr.es/m/4548a168-62cd-457b-8d06-9ba7b985c477@proxel.se 7 months ago			`#include "mb/pg_wchar.h"`
			`#include "miscadmin.h"`
			`#include "utils/builtins.h"`
			`#include "utils/memutils.h"`
			`#include "utils/pg_locale.h"`
			`#include "utils/syscache.h"`

			`extern pg_locale_t create_pg_locale_builtin(Oid collid,`
			`MemoryContext context);`
Move code for collation version into provider-specific files. Author: Andreas Karlsson Discussion: https://postgr.es/m/4548a168-62cd-457b-8d06-9ba7b985c477%40proxel.se 6 months ago			`extern char *get_collation_actual_version_builtin(const char *collcollate);`
Refactor string case conversion into provider-specific files. Create API entry points pg_strlower(), etc., that work with any provider and give the caller control over the destination buffer. Then, move provider-specific logic into pg_locale_builtin.c, pg_locale_icu.c, and pg_locale_libc.c as appropriate. Discussion: https://postgr.es/m/7aa46d77b377428058403723440862d12a8a129a.camel@j-davis.com 7 months ago			`extern size_t strlower_builtin(char *dst, size_t dstsize, const char *src,`
			`ssize_t srclen, pg_locale_t locale);`
			`extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,`
			`ssize_t srclen, pg_locale_t locale);`
			`extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,`
			`ssize_t srclen, pg_locale_t locale);`
Add SQL function CASEFOLD(). Useful for caseless matching. Similar to LOWER(), but avoids edge-case problems with using LOWER() for caseless matching. For collations that support it, CASEFOLD() handles characters with more than two case variations or multi-character case variations. Some characters may fold to uppercase. The results of case folding are also more stable across Unicode versions than LOWER() or UPPER(). Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com Reviewed-by: Ian Lawrence Barwick 5 months ago			`extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,`
			`ssize_t srclen, pg_locale_t locale);`
Refactor string case conversion into provider-specific files. Create API entry points pg_strlower(), etc., that work with any provider and give the caller control over the destination buffer. Then, move provider-specific logic into pg_locale_builtin.c, pg_locale_icu.c, and pg_locale_libc.c as appropriate. Discussion: https://postgr.es/m/7aa46d77b377428058403723440862d12a8a129a.camel@j-davis.com 7 months ago

			`struct WordBoundaryState`
			`{`
			`const char *str;`
			`size_t len;`
			`size_t offset;`
			`bool init;`
			`bool prev_alnum;`
			`};`

			`/*`
			`* Simple word boundary iterator that draws boundaries each time the result of`
			`* pg_u_isalnum() changes.`
			`*/`
			`static size_t`
			`initcap_wbnext(void *state)`
			`{`
			`struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;`

			`while (wbstate->offset < wbstate->len &&`
			`wbstate->str[wbstate->offset] != '\0')`
			`{`
			`pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +`
			`wbstate->offset);`
			`bool curr_alnum = pg_u_isalnum(u, true);`

			`if (!wbstate->init || curr_alnum != wbstate->prev_alnum)`
			`{`
			`size_t prev_offset = wbstate->offset;`

			`wbstate->init = true;`
			`wbstate->offset += unicode_utf8len(u);`
			`wbstate->prev_alnum = curr_alnum;`
			`return prev_offset;`
			`}`

			`wbstate->offset += unicode_utf8len(u);`
			`}`

			`return wbstate->len;`
			`}`

			`size_t`
			`strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,`
			`pg_locale_t locale)`
			`{`
Support PG_UNICODE_FAST locale in the builtin collation provider. The PG_UNICODE_FAST locale uses code point sort order (fast, memcmp-based) combined with Unicode character semantics. The character semantics are based on Unicode full case mapping. Full case mapping can map a single codepoint to multiple codepoints, such as "ß" uppercasing to "SS". Additionally, it handles context-sensitive mappings like the "final sigma", and it uses titlecase mappings such as "ǅ" when titlecasing (rather than plain uppercase mappings). Importantly, the uppercasing of "ß" as "SS" is specifically mentioned by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics for case mapping and pattern matching, so if we changed it to use the PG_UNICODE_FAST locale, it would offer better compliance with the standard. For now, though, do not change the behavior of UCS_BASIC. Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org Reviewed-by: Peter Eisentraut, Daniel Verite 5 months ago			`return unicode_strlower(dest, destsize, src, srclen,`
			`locale->info.builtin.casemap_full);`
Refactor string case conversion into provider-specific files. Create API entry points pg_strlower(), etc., that work with any provider and give the caller control over the destination buffer. Then, move provider-specific logic into pg_locale_builtin.c, pg_locale_icu.c, and pg_locale_libc.c as appropriate. Discussion: https://postgr.es/m/7aa46d77b377428058403723440862d12a8a129a.camel@j-davis.com 7 months ago			`}`

			`size_t`
			`strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,`
			`pg_locale_t locale)`
			`{`
			`struct WordBoundaryState wbstate = {`
			`.str = src,`
			`.len = srclen,`
			`.offset = 0,`
			`.init = false,`
			`.prev_alnum = false,`
			`};`

Support PG_UNICODE_FAST locale in the builtin collation provider. The PG_UNICODE_FAST locale uses code point sort order (fast, memcmp-based) combined with Unicode character semantics. The character semantics are based on Unicode full case mapping. Full case mapping can map a single codepoint to multiple codepoints, such as "ß" uppercasing to "SS". Additionally, it handles context-sensitive mappings like the "final sigma", and it uses titlecase mappings such as "ǅ" when titlecasing (rather than plain uppercase mappings). Importantly, the uppercasing of "ß" as "SS" is specifically mentioned by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics for case mapping and pattern matching, so if we changed it to use the PG_UNICODE_FAST locale, it would offer better compliance with the standard. For now, though, do not change the behavior of UCS_BASIC. Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org Reviewed-by: Peter Eisentraut, Daniel Verite 5 months ago			`return unicode_strtitle(dest, destsize, src, srclen,`
			`locale->info.builtin.casemap_full,`
Refactor string case conversion into provider-specific files. Create API entry points pg_strlower(), etc., that work with any provider and give the caller control over the destination buffer. Then, move provider-specific logic into pg_locale_builtin.c, pg_locale_icu.c, and pg_locale_libc.c as appropriate. Discussion: https://postgr.es/m/7aa46d77b377428058403723440862d12a8a129a.camel@j-davis.com 7 months ago			`initcap_wbnext, &wbstate);`
			`}`

			`size_t`
			`strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,`
			`pg_locale_t locale)`
			`{`
Support PG_UNICODE_FAST locale in the builtin collation provider. The PG_UNICODE_FAST locale uses code point sort order (fast, memcmp-based) combined with Unicode character semantics. The character semantics are based on Unicode full case mapping. Full case mapping can map a single codepoint to multiple codepoints, such as "ß" uppercasing to "SS". Additionally, it handles context-sensitive mappings like the "final sigma", and it uses titlecase mappings such as "ǅ" when titlecasing (rather than plain uppercase mappings). Importantly, the uppercasing of "ß" as "SS" is specifically mentioned by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics for case mapping and pattern matching, so if we changed it to use the PG_UNICODE_FAST locale, it would offer better compliance with the standard. For now, though, do not change the behavior of UCS_BASIC. Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org Reviewed-by: Peter Eisentraut, Daniel Verite 5 months ago			`return unicode_strupper(dest, destsize, src, srclen,`
			`locale->info.builtin.casemap_full);`
Refactor string case conversion into provider-specific files. Create API entry points pg_strlower(), etc., that work with any provider and give the caller control over the destination buffer. Then, move provider-specific logic into pg_locale_builtin.c, pg_locale_icu.c, and pg_locale_libc.c as appropriate. Discussion: https://postgr.es/m/7aa46d77b377428058403723440862d12a8a129a.camel@j-davis.com 7 months ago			`}`
Perform provider-specific initialization in new functions. Reviewed-by: Andreas Karlsson Discussion: https://postgr.es/m/4548a168-62cd-457b-8d06-9ba7b985c477@proxel.se 7 months ago
Add SQL function CASEFOLD(). Useful for caseless matching. Similar to LOWER(), but avoids edge-case problems with using LOWER() for caseless matching. For collations that support it, CASEFOLD() handles characters with more than two case variations or multi-character case variations. Some characters may fold to uppercase. The results of case folding are also more stable across Unicode versions than LOWER() or UPPER(). Discussion: https://postgr.es/m/a1886ddfcd8f60cb3e905c93009b646b4cfb74c5.camel%40j-davis.com Reviewed-by: Ian Lawrence Barwick 5 months ago			`size_t`
			`strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,`
			`pg_locale_t locale)`
			`{`
			`return unicode_strfold(dest, destsize, src, srclen,`
			`locale->info.builtin.casemap_full);`
			`}`

Perform provider-specific initialization in new functions. Reviewed-by: Andreas Karlsson Discussion: https://postgr.es/m/4548a168-62cd-457b-8d06-9ba7b985c477@proxel.se 7 months ago			`pg_locale_t`
			`create_pg_locale_builtin(Oid collid, MemoryContext context)`
			`{`
			`const char *locstr;`
			`pg_locale_t result;`

			`if (collid == DEFAULT_COLLATION_OID)`
			`{`
			`HeapTuple tp;`
			`Datum datum;`

			`tp = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));`
			`if (!HeapTupleIsValid(tp))`
			`elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);`
			`datum = SysCacheGetAttrNotNull(DATABASEOID, tp,`
			`Anum_pg_database_datlocale);`
			`locstr = TextDatumGetCString(datum);`
			`ReleaseSysCache(tp);`
			`}`
			`else`
			`{`
			`HeapTuple tp;`
			`Datum datum;`

			`tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));`
			`if (!HeapTupleIsValid(tp))`
			`elog(ERROR, "cache lookup failed for collation %u", collid);`
			`datum = SysCacheGetAttrNotNull(COLLOID, tp,`
			`Anum_pg_collation_colllocale);`
			`locstr = TextDatumGetCString(datum);`
			`ReleaseSysCache(tp);`
			`}`

			`builtin_validate_locale(GetDatabaseEncoding(), locstr);`

			`result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));`

			`result->info.builtin.locale = MemoryContextStrdup(context, locstr);`
Support PG_UNICODE_FAST locale in the builtin collation provider. The PG_UNICODE_FAST locale uses code point sort order (fast, memcmp-based) combined with Unicode character semantics. The character semantics are based on Unicode full case mapping. Full case mapping can map a single codepoint to multiple codepoints, such as "ß" uppercasing to "SS". Additionally, it handles context-sensitive mappings like the "final sigma", and it uses titlecase mappings such as "ǅ" when titlecasing (rather than plain uppercase mappings). Importantly, the uppercasing of "ß" as "SS" is specifically mentioned by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics for case mapping and pattern matching, so if we changed it to use the PG_UNICODE_FAST locale, it would offer better compliance with the standard. For now, though, do not change the behavior of UCS_BASIC. Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org Reviewed-by: Peter Eisentraut, Daniel Verite 5 months ago			`result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0);`
Perform provider-specific initialization in new functions. Reviewed-by: Andreas Karlsson Discussion: https://postgr.es/m/4548a168-62cd-457b-8d06-9ba7b985c477@proxel.se 7 months ago			`result->provider = COLLPROVIDER_BUILTIN;`
			`result->deterministic = true;`
			`result->collate_is_c = true;`
			`result->ctype_is_c = (strcmp(locstr, "C") == 0);`

			`return result;`
			`}`
Move code for collation version into provider-specific files. Author: Andreas Karlsson Discussion: https://postgr.es/m/4548a168-62cd-457b-8d06-9ba7b985c477%40proxel.se 6 months ago
			`char *`
			`get_collation_actual_version_builtin(const char *collcollate)`
			`{`
			`/*`
			`* The only two supported locales (C and C.UTF-8) are both based on memcmp`
			`* and are not expected to change, but track the version anyway.`
			`*`
			`* Note that the character semantics may change for some locales, but the`
			`* collation version only tracks changes to sort order.`
			`*/`
			`if (strcmp(collcollate, "C") == 0)`
			`return "1";`
			`else if (strcmp(collcollate, "C.UTF-8") == 0)`
			`return "1";`
Support PG_UNICODE_FAST locale in the builtin collation provider. The PG_UNICODE_FAST locale uses code point sort order (fast, memcmp-based) combined with Unicode character semantics. The character semantics are based on Unicode full case mapping. Full case mapping can map a single codepoint to multiple codepoints, such as "ß" uppercasing to "SS". Additionally, it handles context-sensitive mappings like the "final sigma", and it uses titlecase mappings such as "ǅ" when titlecasing (rather than plain uppercase mappings). Importantly, the uppercasing of "ß" as "SS" is specifically mentioned by the SQL standard. In Postgres, UCS_BASIC uses plain ASCII semantics for case mapping and pattern matching, so if we changed it to use the PG_UNICODE_FAST locale, it would offer better compliance with the standard. For now, though, do not change the behavior of UCS_BASIC. Discussion: https://postgr.es/m/ddfd67928818f138f51635712529bc5e1d25e4e7.camel@j-davis.com Discussion: https://postgr.es/m/27bb0e52-801d-4f73-a0a4-02cfdd4a9ada@eisentraut.org Reviewed-by: Peter Eisentraut, Daniel Verite 5 months ago			`else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0)`
			`return "1";`
Move code for collation version into provider-specific files. Author: Andreas Karlsson Discussion: https://postgr.es/m/4548a168-62cd-457b-8d06-9ba7b985c477%40proxel.se 6 months ago			`else`
			`ereport(ERROR,`
			`(errcode(ERRCODE_WRONG_OBJECT_TYPE),`
			`errmsg("invalid locale name \"%s\" for builtin provider",`
			`collcollate)));`

			`return NULL; /* keep compiler quiet */`
			`}`