You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
postgres/src/include/utils/pg_locale.h

127 lines
3.6 KiB

/*-----------------------------------------------------------------------
*
* PostgreSQL locale utilities
*
* src/include/utils/pg_locale.h
*
* Copyright (c) 2002-2022, PostgreSQL Global Development Group
*
*-----------------------------------------------------------------------
*/
#ifndef _PG_LOCALE_
#define _PG_LOCALE_
#if defined(LOCALE_T_IN_XLOCALE) || defined(WCSTOMBS_L_IN_XLOCALE)
#include <xlocale.h>
#endif
#ifdef USE_ICU
#include <unicode/ucol.h>
#endif
#include "utils/guc.h"
#ifdef USE_ICU
/*
* ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
* (see
* <https://www.postgresql.org/message-id/flat/f1438ec6-22aa-4029-9a3b-26f79d330e72%40manitou-mail.org>)
*/
#if U_ICU_VERSION_MAJOR_NUM >= 53
#define HAVE_UCOL_STRCOLLUTF8 1
#else
#undef HAVE_UCOL_STRCOLLUTF8
#endif
#endif
/* use for libc locale names */
#define LOCALE_NAME_BUFLEN 128
/* GUC settings */
extern char *locale_messages;
extern char *locale_monetary;
extern char *locale_numeric;
extern char *locale_time;
/* lc_time localization cache */
extern char *localized_abbrev_days[];
extern char *localized_full_days[];
extern char *localized_abbrev_months[];
extern char *localized_full_months[];
extern bool check_locale_messages(char **newval, void **extra, GucSource source);
extern void assign_locale_messages(const char *newval, void *extra);
extern bool check_locale_monetary(char **newval, void **extra, GucSource source);
extern void assign_locale_monetary(const char *newval, void *extra);
extern bool check_locale_numeric(char **newval, void **extra, GucSource source);
extern void assign_locale_numeric(const char *newval, void *extra);
extern bool check_locale_time(char **newval, void **extra, GucSource source);
extern void assign_locale_time(const char *newval, void *extra);
extern bool check_locale(int category, const char *locale, char **canonname);
extern char *pg_perm_setlocale(int category, const char *locale);
extern void check_strxfrm_bug(void);
extern bool lc_collate_is_c(Oid collation);
extern bool lc_ctype_is_c(Oid collation);
/*
* Return the POSIX lconv struct (contains number/money formatting
* information) with locale information for all categories.
*/
extern struct lconv *PGLC_localeconv(void);
extern void cache_locale_time(void);
/*
* We define our own wrapper around locale_t so we can keep the same
* function signatures for all builds, while not having to create a
* fake version of the standard type locale_t in the global namespace.
* pg_locale_t is occasionally checked for truth, so make it a pointer.
*/
struct pg_locale_struct
{
char provider;
Collations with nondeterministic comparison This adds a flag "deterministic" to collations. If that is false, such a collation disables various optimizations that assume that strings are equal only if they are byte-wise equal. That then allows use cases such as case-insensitive or accent-insensitive comparisons or handling of strings with different Unicode normal forms. This functionality is only supported with the ICU provider. At least glibc doesn't appear to have any locales that work in a nondeterministic way, so it's not worth supporting this for the libc provider. The term "deterministic comparison" in this context is from Unicode Technical Standard #10 (https://unicode.org/reports/tr10/#Deterministic_Comparison). This patch makes changes in three areas: - CREATE COLLATION DDL changes and system catalog changes to support this new flag. - Many executor nodes and auxiliary code are extended to track collations. Previously, this code would just throw away collation information, because the eventually-called user-defined functions didn't use it since they only cared about equality, which didn't need collation information. - String data type functions that do equality comparisons and hashing are changed to take the (non-)deterministic flag into account. For comparison, this just means skipping various shortcuts and tie breakers that use byte-wise comparison. For hashing, we first need to convert the input string to a canonical "sort key" using the ICU analogue of strxfrm(). Reviewed-by: Daniel Verite <daniel@manitou-mail.org> Reviewed-by: Peter Geoghegan <pg@bowt.ie> Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
7 years ago
bool deterministic;
union
{
#ifdef HAVE_LOCALE_T
locale_t lt;
#endif
#ifdef USE_ICU
struct
{
const char *locale;
UCollator *ucol;
} icu;
#endif
int dummy; /* in case we have neither LOCALE_T nor ICU */
} info;
};
typedef struct pg_locale_struct *pg_locale_t;
extern struct pg_locale_struct default_locale;
extern void make_icu_collator(const char *iculocstr,
struct pg_locale_struct *resultp);
extern pg_locale_t pg_newlocale_from_collation(Oid collid);
extern char *get_collation_actual_version(char collprovider, const char *collcollate);
#ifdef USE_ICU
extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);
Fix memory leakage in ICU encoding conversion, and other code review. Callers of icu_to_uchar() neglected to pfree the result string when done with it. This results in catastrophic memory leaks in varstr_cmp(), because of our prevailing assumption that btree comparison functions don't leak memory. For safety, make all the call sites clean up leaks, though I suspect that we could get away without it in formatting.c. I audited callers of icu_from_uchar() as well, but found no places that seemed to have a comparable issue. Add function API specifications for icu_to_uchar() and icu_from_uchar(); the lack of any thought-through specification is perhaps not unrelated to the existence of this bug in the first place. Fix icu_to_uchar() to guarantee a nul-terminated result; although no existing caller appears to care, the fact that it would have been nul-terminated except in extreme corner cases seems ideally designed to bite someone on the rear someday. Fix ucnv_fromUChars() destCapacity argument --- in the worst case, that could perhaps have led to a non-nul-terminated result, too. Fix icu_from_uchar() to have a more reasonable definition of the function result --- no callers are actually paying attention, so this isn't a live bug, but it's certainly sloppily designed. Const-ify icu_from_uchar()'s input string for consistency. That is not the end of what needs to be done to these functions, but it's as much as I have the patience for right now. Discussion: https://postgr.es/m/1955.1498181798@sss.pgh.pa.us
8 years ago
extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar);
#endif
/* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */
extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
pg_locale_t locale);
extern size_t char2wchar(wchar_t *to, size_t tolen,
const char *from, size_t fromlen, pg_locale_t locale);
Phase 2 of pgindent updates. Change pg_bsd_indent to follow upstream rules for placement of comments to the right of code, and remove pgindent hack that caused comments following #endif to not obey the general rule. Commit e3860ffa4dd0dad0dd9eea4be9cc1412373a8c89 wasn't actually using the published version of pg_bsd_indent, but a hacked-up version that tried to minimize the amount of movement of comments to the right of code. The situation of interest is where such a comment has to be moved to the right of its default placement at column 33 because there's code there. BSD indent has always moved right in units of tab stops in such cases --- but in the previous incarnation, indent was working in 8-space tab stops, while now it knows we use 4-space tabs. So the net result is that in about half the cases, such comments are placed one tab stop left of before. This is better all around: it leaves more room on the line for comment text, and it means that in such cases the comment uniformly starts at the next 4-space tab stop after the code, rather than sometimes one and sometimes two tabs after. Also, ensure that comments following #endif are indented the same as comments following other preprocessor commands such as #else. That inconsistency turns out to have been self-inflicted damage from a poorly-thought-through post-indent "fixup" in pgindent. This patch is much less interesting than the first round of indent changes, but also bulkier, so I thought it best to separate the effects. Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
8 years ago
#endif /* _PG_LOCALE_ */