|
|
|
@ -386,11 +386,12 @@ initdb --locale-provider=icu --icu-locale=en |
|
|
|
|
linkend="icu-language-tag">Language Tag</link>. |
|
|
|
|
|
|
|
|
|
<programlisting> |
|
|
|
|
CREATE COLLATION mycollation1 (PROVIDER = icu, LOCALE = 'ja-JP'); |
|
|
|
|
CREATE COLLATION mycollation2 (PROVIDER = icu, LOCALE = 'fr'); |
|
|
|
|
CREATE COLLATION mycollation1 (provider = icu, locale = 'ja-JP'); |
|
|
|
|
CREATE COLLATION mycollation2 (provider = icu, locale = 'fr'); |
|
|
|
|
</programlisting> |
|
|
|
|
</para> |
|
|
|
|
</sect3> |
|
|
|
|
|
|
|
|
|
<sect3 id="icu-canonicalization"> |
|
|
|
|
<title>Locale Canonicalization and Validation</title> |
|
|
|
|
<para> |
|
|
|
@ -399,14 +400,14 @@ CREATE COLLATION mycollation2 (PROVIDER = icu, LOCALE = 'fr'); |
|
|
|
|
language tag if not already in that form. For instance, |
|
|
|
|
|
|
|
|
|
<screen> |
|
|
|
|
CREATE COLLATION mycollation3 (PROVIDER = icu, LOCALE = 'en-US-u-kn-true'); |
|
|
|
|
CREATE COLLATION mycollation3 (provider = icu, locale = 'en-US-u-kn-true'); |
|
|
|
|
NOTICE: using standard form "en-US-u-kn" for locale "en-US-u-kn-true" |
|
|
|
|
CREATE COLLATION mycollation4 (PROVIDER = icu, LOCALE = 'de_DE.utf8'); |
|
|
|
|
CREATE COLLATION mycollation4 (provider = icu, locale = 'de_DE.utf8'); |
|
|
|
|
NOTICE: using standard form "de-DE" for locale "de_DE.utf8" |
|
|
|
|
</screen> |
|
|
|
|
|
|
|
|
|
If you see this notice, ensure that the <symbol>PROVIDER</symbol> and |
|
|
|
|
<symbol>LOCALE</symbol> are the expected result. For consistent results |
|
|
|
|
If you see this notice, ensure that the <symbol>provider</symbol> and |
|
|
|
|
<symbol>locale</symbol> are the expected result. For consistent results |
|
|
|
|
when using the ICU provider, specify the canonical <link |
|
|
|
|
linkend="icu-language-tag">language tag</link> instead of relying on the |
|
|
|
|
transformation. |
|
|
|
@ -427,7 +428,7 @@ NOTICE: using standard form "de-DE" for locale "de_DE.utf8" |
|
|
|
|
the following warning: |
|
|
|
|
|
|
|
|
|
<screen> |
|
|
|
|
CREATE COLLATION nonsense (PROVIDER = icu, LOCALE = 'nonsense'); |
|
|
|
|
CREATE COLLATION nonsense (provider = icu, locale = 'nonsense'); |
|
|
|
|
WARNING: ICU locale "nonsense" has unknown language "nonsense" |
|
|
|
|
HINT: To disable ICU locale validation, set parameter icu_validation_level to DISABLED. |
|
|
|
|
CREATE COLLATION |
|
|
|
@ -438,6 +439,7 @@ CREATE COLLATION |
|
|
|
|
still be created, but the behavior may not be what the user intended. |
|
|
|
|
</para> |
|
|
|
|
</sect3> |
|
|
|
|
|
|
|
|
|
<sect3 id="icu-language-tag"> |
|
|
|
|
<title>Language Tag</title> |
|
|
|
|
<para> |
|
|
|
@ -484,7 +486,7 @@ CREATE COLLATION |
|
|
|
|
of digits as a single number: |
|
|
|
|
|
|
|
|
|
<screen> |
|
|
|
|
CREATE COLLATION mycollation5 (PROVIDER = icu, DETERMINISTIC = false, LOCALE = 'en-US-u-kn-ks-level2'); |
|
|
|
|
CREATE COLLATION mycollation5 (provider = icu, deterministic = false, locale = 'en-US-u-kn-ks-level2'); |
|
|
|
|
SELECT 'aB' = 'Ab' COLLATE mycollation5 as result; |
|
|
|
|
result |
|
|
|
|
-------- |
|
|
|
@ -1109,16 +1111,16 @@ CREATE COLLATION ignore_accents (provider = icu, locale = 'und-u-ks-level1-kc-tr |
|
|
|
|
|
|
|
|
|
<programlisting> |
|
|
|
|
-- ignore differences in accents and case |
|
|
|
|
CREATE COLLATION ignore_accent_case (PROVIDER = icu, DETERMINISTIC = false, LOCALE = 'und-u-ks-level1'); |
|
|
|
|
CREATE COLLATION ignore_accent_case (provider = icu, deterministic = false, locale = 'und-u-ks-level1'); |
|
|
|
|
SELECT 'Å' = 'A' COLLATE ignore_accent_case; -- true |
|
|
|
|
SELECT 'z' = 'Z' COLLATE ignore_accent_case; -- true |
|
|
|
|
|
|
|
|
|
-- upper case letters sort before lower case. |
|
|
|
|
CREATE COLLATION upper_first (PROVIDER=icu, LOCALE = 'und-u-kf-upper'); |
|
|
|
|
CREATE COLLATION upper_first (provider = icu, locale = 'und-u-kf-upper'); |
|
|
|
|
SELECT 'B' < 'b' COLLATE upper_first; -- true |
|
|
|
|
|
|
|
|
|
-- treat digits numerically and ignore punctuation |
|
|
|
|
CREATE COLLATION num_ignore_punct (PROVIDER = icu, DETERMINISTIC = false, LOCALE = 'und-u-ka-shifted-kn'); |
|
|
|
|
CREATE COLLATION num_ignore_punct (provider = icu, deterministic = false, locale = 'und-u-ka-shifted-kn'); |
|
|
|
|
SELECT 'id-45' < 'id-123' COLLATE num_ignore_punct; -- true |
|
|
|
|
SELECT 'w;x*y-z' = 'wxyz' COLLATE num_ignore_punct; -- true |
|
|
|
|
</programlisting> |
|
|
|
@ -1136,6 +1138,13 @@ SELECT 'w;x*y-z' = 'wxyz' COLLATE num_ignore_punct; -- true |
|
|
|
|
linkend="icu-collation-settings-table">collation settings</link>. Higher |
|
|
|
|
levels correspond to finer textual features. |
|
|
|
|
</para> |
|
|
|
|
<para> |
|
|
|
|
<xref linkend="icu-collation-levels"/> shows which textual feature |
|
|
|
|
differences are considered significant when determining equality at the |
|
|
|
|
given level.  The Unicode character <literal>U+2063</literal> is an |
|
|
|
|
invisible separator, and as seen in the table, is ignored at all |
|
|
|
|
levels of comparison less than <literal>identic</literal>. |
|
|
|
|
</para> |
|
|
|
|
<para> |
|
|
|
|
<table id="icu-collation-levels"> |
|
|
|
|
<title>ICU Collation Levels</title> |
|
|
|
@ -1215,20 +1224,13 @@ SELECT 'w;x*y-z' = 'wxyz' COLLATE num_ignore_punct; -- true |
|
|
|
|
</tgroup> |
|
|
|
|
</table> |
|
|
|
|
|
|
|
|
|
The above table shows which textual feature differences are |
|
|
|
|
considered significant when determining equality at the given level. The |
|
|
|
|
Unicode character <literal>U+2063</literal> is an invisible separator, |
|
|
|
|
and as seen in the table, is ignored at all levels of comparison less |
|
|
|
|
than <literal>identic</literal>. |
|
|
|
|
</para> |
|
|
|
|
<para> |
|
|
|
|
At every level, even with full normalization off, basic normalization is |
|
|
|
|
performed. For example, <literal>'á'</literal> may be composed of the |
|
|
|
|
code points <literal>U&'\0061\0301'</literal> or the single code |
|
|
|
|
point <literal>U&'\00E1'</literal>, and those sequences will be |
|
|
|
|
considered equal even at the <literal>identic</literal> level. To treat |
|
|
|
|
any difference in code point representation as distinct, use a collation |
|
|
|
|
created with <symbol>DETERMINISTIC</symbol> set to |
|
|
|
|
created with <symbol>deterministic</symbol> set to |
|
|
|
|
<literal>true</literal>. |
|
|
|
|
</para> |
|
|
|
|
<sect4 id="icu-collation-level-examples"> |
|
|
|
@ -1236,9 +1238,9 @@ SELECT 'w;x*y-z' = 'wxyz' COLLATE num_ignore_punct; -- true |
|
|
|
|
<para> |
|
|
|
|
|
|
|
|
|
<programlisting> |
|
|
|
|
CREATE COLLATION level3 (PROVIDER=icu, DETERMINISTIC=false, LOCALE='und-u-ka-shifted-ks-level3'); |
|
|
|
|
CREATE COLLATION level4 (PROVIDER=icu, DETERMINISTIC=false, LOCALE='und-u-ka-shifted-ks-level4'); |
|
|
|
|
CREATE COLLATION identic (PROVIDER=icu, DETERMINISTIC=false, LOCALE='und-u-ka-shifted-ks-identic'); |
|
|
|
|
CREATE COLLATION level3 (provider = icu, deterministic = false, locale = 'und-u-ka-shifted-ks-level3'); |
|
|
|
|
CREATE COLLATION level4 (provider = icu, deterministic = false, locale = 'und-u-ka-shifted-ks-level4'); |
|
|
|
|
CREATE COLLATION identic (provider = icu, deterministic = false, locale = 'und-u-ka-shifted-ks-identic'); |
|
|
|
|
|
|
|
|
|
-- invisible separator ignored at all levels except identic |
|
|
|
|
SELECT 'ab' = U&'a\2063b' COLLATE level4; -- true |
|
|
|
@ -1252,8 +1254,14 @@ SELECT 'x-y' = 'x_y' COLLATE level4; -- false |
|
|
|
|
</para> |
|
|
|
|
</sect4> |
|
|
|
|
</sect3> |
|
|
|
|
|
|
|
|
|
<sect3 id="icu-collation-settings"> |
|
|
|
|
<title>Collation Settings for an ICU Locale</title> |
|
|
|
|
<para> |
|
|
|
|
<xref linkend="icu-collation-settings-table"/> shows the available |
|
|
|
|
collation settings, which can be used as part of a language tag to |
|
|
|
|
customize a collation. |
|
|
|
|
</para> |
|
|
|
|
<para> |
|
|
|
|
<table id="icu-collation-settings-table"> |
|
|
|
|
<title>ICU Collation Settings</title> |
|
|
|
@ -1272,14 +1280,11 @@ SELECT 'x-y' = 'x_y' COLLATE level4; -- false |
|
|
|
|
</thead> |
|
|
|
|
<tbody> |
|
|
|
|
<row> |
|
|
|
|
<entry><literal>ks</literal></entry> |
|
|
|
|
<entry><literal>level1</literal>, <literal>level2</literal>, <literal>level3</literal>, <literal>level4</literal>, <literal>identic</literal></entry> |
|
|
|
|
<entry><literal>level3</literal></entry> |
|
|
|
|
<entry><literal>co</literal></entry> |
|
|
|
|
<entry><literal>emoji</literal>, <literal>phonebk</literal>, <literal>standard</literal>, <replaceable>...</replaceable></entry> |
|
|
|
|
<entry><literal>standard</literal></entry> |
|
|
|
|
<entry> |
|
|
|
|
Sensitivity (or "strength") when determining equality, with |
|
|
|
|
<literal>level1</literal> the least sensitive to differences and |
|
|
|
|
<literal>identic</literal> the most sensitive to differences. See |
|
|
|
|
<xref linkend="icu-collation-levels"/> for details. |
|
|
|
|
Collation type. See <xref linkend="icu-external-references"/> for additional options and details. |
|
|
|
|
</entry> |
|
|
|
|
</row> |
|
|
|
|
<row> |
|
|
|
@ -1304,29 +1309,6 @@ SELECT 'x-y' = 'x_y' COLLATE level4; -- false |
|
|
|
|
before <literal>'aé'</literal>. |
|
|
|
|
</entry> |
|
|
|
|
</row> |
|
|
|
|
<row> |
|
|
|
|
<entry><literal>kk</literal></entry> |
|
|
|
|
<entry><literal>true</literal>, <literal>false</literal></entry> |
|
|
|
|
<entry><literal>false</literal></entry> |
|
|
|
|
<entry> |
|
|
|
|
<para> |
|
|
|
|
Enable full normalization; may affect performance. Basic |
|
|
|
|
normalization is performed even when set to |
|
|
|
|
<literal>false</literal>. Locales for languages that require full |
|
|
|
|
normalization typically enable it by default. |
|
|
|
|
</para> |
|
|
|
|
<para> |
|
|
|
|
Full normalization is important in some cases, such as when |
|
|
|
|
multiple accents are applied to a single character. For example, |
|
|
|
|
the code point sequences <literal>U&'\0065\0323\0302'</literal> |
|
|
|
|
and <literal>U&'\0065\0302\0323'</literal> represent |
|
|
|
|
an <literal>e</literal> with circumflex and dot-below accents |
|
|
|
|
applied in different orders. With full normalization |
|
|
|
|
on, these code point sequences are treated as equal; otherwise they |
|
|
|
|
are unequal. |
|
|
|
|
</para> |
|
|
|
|
</entry> |
|
|
|
|
</row> |
|
|
|
|
<row> |
|
|
|
|
<entry><literal>kc</literal></entry> |
|
|
|
|
<entry><literal>true</literal>, <literal>false</literal></entry> |
|
|
|
@ -1368,6 +1350,29 @@ SELECT 'x-y' = 'x_y' COLLATE level4; -- false |
|
|
|
|
<literal>'id-123'</literal>. |
|
|
|
|
</entry> |
|
|
|
|
</row> |
|
|
|
|
<row> |
|
|
|
|
<entry><literal>kk</literal></entry> |
|
|
|
|
<entry><literal>true</literal>, <literal>false</literal></entry> |
|
|
|
|
<entry><literal>false</literal></entry> |
|
|
|
|
<entry> |
|
|
|
|
<para> |
|
|
|
|
Enable full normalization; may affect performance. Basic |
|
|
|
|
normalization is performed even when set to |
|
|
|
|
<literal>false</literal>. Locales for languages that require full |
|
|
|
|
normalization typically enable it by default. |
|
|
|
|
</para> |
|
|
|
|
<para> |
|
|
|
|
Full normalization is important in some cases, such as when |
|
|
|
|
multiple accents are applied to a single character. For example, |
|
|
|
|
the code point sequences <literal>U&'\0065\0323\0302'</literal> |
|
|
|
|
and <literal>U&'\0065\0302\0323'</literal> represent |
|
|
|
|
an <literal>e</literal> with circumflex and dot-below accents |
|
|
|
|
applied in different orders. With full normalization |
|
|
|
|
on, these code point sequences are treated as equal; otherwise they |
|
|
|
|
are unequal. |
|
|
|
|
</para> |
|
|
|
|
</entry> |
|
|
|
|
</row> |
|
|
|
|
<row> |
|
|
|
|
<entry><literal>kr</literal></entry> |
|
|
|
|
<entry> |
|
|
|
@ -1393,6 +1398,17 @@ SELECT 'x-y' = 'x_y' COLLATE level4; -- false |
|
|
|
|
</para> |
|
|
|
|
</entry> |
|
|
|
|
</row> |
|
|
|
|
<row> |
|
|
|
|
<entry><literal>ks</literal></entry> |
|
|
|
|
<entry><literal>level1</literal>, <literal>level2</literal>, <literal>level3</literal>, <literal>level4</literal>, <literal>identic</literal></entry> |
|
|
|
|
<entry><literal>level3</literal></entry> |
|
|
|
|
<entry> |
|
|
|
|
Sensitivity (or "strength") when determining equality, with |
|
|
|
|
<literal>level1</literal> the least sensitive to differences and |
|
|
|
|
<literal>identic</literal> the most sensitive to differences. See |
|
|
|
|
<xref linkend="icu-collation-levels"/> for details. |
|
|
|
|
</entry> |
|
|
|
|
</row> |
|
|
|
|
<row> |
|
|
|
|
<entry><literal>kv</literal></entry> |
|
|
|
|
<entry> |
|
|
|
@ -1410,14 +1426,6 @@ SELECT 'x-y' = 'x_y' COLLATE level4; -- false |
|
|
|
|
to <literal>level3</literal> or lower to take effect. |
|
|
|
|
</entry> |
|
|
|
|
</row> |
|
|
|
|
<row> |
|
|
|
|
<entry><literal>co</literal></entry> |
|
|
|
|
<entry><literal>emoji</literal>, <literal>phonebk</literal>, <literal>standard</literal>, <replaceable>...</replaceable></entry> |
|
|
|
|
<entry><literal>standard</literal></entry> |
|
|
|
|
<entry> |
|
|
|
|
Collation type. See <xref linkend="icu-external-references"/> for additional options and details. |
|
|
|
|
</entry> |
|
|
|
|
</row> |
|
|
|
|
</tbody> |
|
|
|
|
</tgroup> |
|
|
|
|
</table> |
|
|
|
@ -1428,7 +1436,7 @@ SELECT 'x-y' = 'x_y' COLLATE level4; -- false |
|
|
|
|
<note> |
|
|
|
|
<para> |
|
|
|
|
For many collation settings, you must create the collation with |
|
|
|
|
<option>DETERMINISTIC</option> set to <literal>false</literal> for the |
|
|
|
|
<option>deterministic</option> set to <literal>false</literal> for the |
|
|
|
|
setting to have the desired effect (see <xref |
|
|
|
|
linkend="collation-nondeterministic"/>). Additionally, some settings |
|
|
|
|
only take effect when the key <literal>ka</literal> is set to |
|
|
|
@ -1437,6 +1445,7 @@ SELECT 'x-y' = 'x_y' COLLATE level4; -- false |
|
|
|
|
</para> |
|
|
|
|
</note> |
|
|
|
|
</sect3> |
|
|
|
|
|
|
|
|
|
<sect3 id="icu-locale-examples"> |
|
|
|
|
<title>Examples</title> |
|
|
|
|
<para> |
|
|
|
@ -1487,6 +1496,7 @@ SELECT 'x-y' = 'x_y' COLLATE level4; -- false |
|
|
|
|
</variablelist> |
|
|
|
|
</para> |
|
|
|
|
</sect3> |
|
|
|
|
|
|
|
|
|
<sect3 id="icu-external-references"> |
|
|
|
|
<title>External References for ICU</title> |
|
|
|
|
<para> |
|
|
|
|