mirror of https://github.com/postgres/postgres
The builtin C.UTF-8 locale has similar semantics to the libc locale of the same name. That is, code point sort order (fast, memcmp-based) combined with Unicode semantics for character operations such as pattern matching, regular expressions, and LOWER()/INITCAP()/UPPER(). The character semantics are based on Unicode simple case mappings.

The builtin provider's C.UTF-8 offers several important advantages over libc:

 * faster sorting -- benefits from additional optimizations such as abbreviated keys and varstrfastcmp_c
 * faster case conversion, e.g. LOWER(), at least compared with some libc implementations
 * available on all platforms with identical semantics, and the semantics are stable, testable, and documentable within a given Postgres major version

Being based on memcmp, the builtin C.UTF-8 locale does not offer natural language sort order. But it is an improvement for most use cases that might otherwise use libc's "C.UTF-8" locale, as well as many use cases that use libc's "C" locale.

Discussion: https://postgr.es/m/ff4c2f2f9c8fc7ca27c1c24ae37ecaeaeaff6b53.camel%40j-davis.com
Reviewed-by: Daniel Vérité, Peter Eisentraut, Jeremy Schneider
pull/159/head
parent
fd0398fcb0
commit
f69319f2f1
@@ -0,0 +1,136 @@ |
||||
/* |
||||
* This test is for collations and character operations when using the |
||||
* builtin provider with the C.UTF-8 locale. |
||||
*/ |
||||
/* skip test if not UTF8 server encoding */ |
||||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset |
||||
\if :skip_test |
||||
\quit |
||||
\endif |
||||
SET client_encoding TO UTF8; |
||||
-- |
||||
-- Test PG_C_UTF8 |
||||
-- |
||||
CREATE COLLATION regress_pg_c_utf8 ( |
||||
provider = builtin, locale = 'C_UTF8'); -- fails |
||||
ERROR: invalid locale name "C_UTF8" for builtin provider |
||||
CREATE COLLATION regress_pg_c_utf8 ( |
||||
provider = builtin, locale = 'C.UTF8'); |
||||
DROP COLLATION regress_pg_c_utf8; |
||||
CREATE COLLATION regress_pg_c_utf8 ( |
||||
provider = builtin, locale = 'C.UTF-8'); |
||||
CREATE TABLE test_pg_c_utf8 ( |
||||
t TEXT COLLATE PG_C_UTF8 |
||||
); |
||||
INSERT INTO test_pg_c_utf8 VALUES |
||||
('abc DEF 123abc'), |
||||
('ábc sßs ßss DÉF'), |
||||
('DŽxxDŽ džxxDž Džxxdž'), |
||||
('ȺȺȺ'), |
||||
('ⱥⱥⱥ'), |
||||
('ⱥȺ'); |
||||
SELECT |
||||
t, lower(t), initcap(t), upper(t), |
||||
length(convert_to(t, 'UTF8')) AS t_bytes, |
||||
length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, |
||||
length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, |
||||
length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes |
||||
FROM test_pg_c_utf8; |
||||
t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes |
||||
-----------------+-----------------+-----------------+-----------------+---------+---------------+-----------------+--------------- |
||||
abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14 |
||||
ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs ßss Déf | ÁBC SßS ßSS DÉF | 19 | 19 | 19 | 19 |
||||
DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | DŽxxdž DŽxxdž DŽxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20 |
||||
ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6 |
||||
ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6 |
||||
ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4 |
||||
(6 rows) |
||||
|
||||
DROP TABLE test_pg_c_utf8; |
||||
-- negative test: Final_Sigma not used for builtin locale C.UTF-8 |
||||
SELECT lower('ΑΣ' COLLATE PG_C_UTF8); |
||||
lower |
||||
------- |
||||
ασ |
||||
(1 row) |
||||
|
||||
SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8); |
||||
lower |
||||
------- |
||||
αͺσͺ |
||||
(1 row) |
||||
|
||||
SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8); |
||||
lower |
||||
------- |
||||
α΄σ΄ |
||||
(1 row) |
||||
|
||||
-- properties |
||||
SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8; |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8; |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8; |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8; |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
-- case mapping |
||||
SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8; |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed |
||||
?column? |
||||
---------- |
||||
t |
||||
(1 row) |
||||
|
||||
@@ -0,0 +1,8 @@ |
||||
/* |
||||
* This test is for collations and character operations when using the |
||||
* builtin provider with the C.UTF-8 locale. |
||||
*/ |
||||
/* skip test if not UTF8 server encoding */ |
||||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset |
||||
\if :skip_test |
||||
\quit |
||||
@@ -0,0 +1,67 @@ |
||||
/* |
||||
* This test is for collations and character operations when using the |
||||
* builtin provider with the C.UTF-8 locale. |
||||
*/ |
||||
|
||||
/* skip test if not UTF8 server encoding */ |
||||
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset |
||||
\if :skip_test |
||||
\quit |
||||
\endif |
||||
|
||||
SET client_encoding TO UTF8; |
||||
|
||||
-- |
||||
-- Test PG_C_UTF8 |
||||
-- |
||||
|
||||
CREATE COLLATION regress_pg_c_utf8 ( |
||||
provider = builtin, locale = 'C_UTF8'); -- fails |
||||
CREATE COLLATION regress_pg_c_utf8 ( |
||||
provider = builtin, locale = 'C.UTF8'); |
||||
DROP COLLATION regress_pg_c_utf8; |
||||
CREATE COLLATION regress_pg_c_utf8 ( |
||||
provider = builtin, locale = 'C.UTF-8'); |
||||
|
||||
CREATE TABLE test_pg_c_utf8 ( |
||||
t TEXT COLLATE PG_C_UTF8 |
||||
); |
||||
INSERT INTO test_pg_c_utf8 VALUES |
||||
('abc DEF 123abc'), |
||||
('ábc sßs ßss DÉF'), |
||||
('DŽxxDŽ džxxDž Džxxdž'), |
||||
('ȺȺȺ'), |
||||
('ⱥⱥⱥ'), |
||||
('ⱥȺ'); |
||||
|
||||
SELECT |
||||
t, lower(t), initcap(t), upper(t), |
||||
length(convert_to(t, 'UTF8')) AS t_bytes, |
||||
length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes, |
||||
length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes, |
||||
length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes |
||||
FROM test_pg_c_utf8; |
||||
|
||||
DROP TABLE test_pg_c_utf8; |
||||
|
||||
-- negative test: Final_Sigma not used for builtin locale C.UTF-8 |
||||
SELECT lower('ΑΣ' COLLATE PG_C_UTF8); |
||||
SELECT lower('ΑͺΣͺ' COLLATE PG_C_UTF8); |
||||
SELECT lower('Α΄Σ΄' COLLATE PG_C_UTF8); |
||||
|
||||
-- properties |
||||
|
||||
SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_C_UTF8; |
||||
SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_C_UTF8; |
||||
SELECT '@' !~ '[[:alnum:]]' COLLATE PG_C_UTF8; |
||||
SELECT '=' ~ '[[:punct:]]' COLLATE PG_C_UTF8; -- symbols are punctuation in posix |
||||
SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_C_UTF8; |
||||
SELECT '൧' !~ '\d' COLLATE PG_C_UTF8; -- only 0-9 considered digits in posix |
||||
|
||||
-- case mapping |
||||
|
||||
SELECT 'xYz' ~* 'XyZ' COLLATE PG_C_UTF8; |
||||
SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8; |
||||
SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8; |
||||
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8; |
||||
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed |
||||
Loading…
Reference in new issue