Add Greek characters to unaccent.rules.

Author: Tasos Maschalidis
Reviewed-by: Michael Paquier, Tom Lane
Discussion: https://postgr.es/m/153495048900.1368.11566580687623014380%40wrigleys.postgresql.org
Discussion: https://postgr.es/m/VI1PR01MB38537EBD529FE5EE3FE9A5FEB5370%40VI1PR01MB3853.eurprd01.prod.exchangelabs.com
7 years ago · 5e8d670c31
parent ec74369931
commit 5e8d670c31
2 changed files with 236 additions and 4 deletions
--- a/contrib/unaccent/generate_unaccent_rules.py
+++ b/contrib/unaccent/generate_unaccent_rules.py
@@ -29,6 +29,15 @@ import argparse
 import sys
 import xml.etree.ElementTree as ET

+# The ranges of Unicode characters that we consider to be "plain letters".
+# For now we are being conservative by including only Latin and Greek.  This
+# could be extended in future based on feedback from people with relevant
+# language knowledge.
+PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
+                       (ord('A'), ord('Z')), # Latin upper case
+                       (0x03b1, 0x03c9),     # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
+                       (0x0391, 0x03a9))     # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
+
 def print_record(codepoint, letter):
    print (unichr(codepoint) + "\t" + letter).encode("UTF-8")

@@ -39,9 +48,11 @@ class Codepoint:
        self.combining_ids = combining_ids

 def is_plain_letter(codepoint):
-    """Return true if codepoint represents a plain ASCII letter."""
-    return (codepoint.id >= ord('a') and codepoint.id <= ord('z')) or \
-           (codepoint.id >= ord('A') and codepoint.id <= ord('Z'))
+    """Return true if codepoint represents a "plain letter"."""
+    for begin, end in PLAIN_LETTER_RANGES:
+      if codepoint.id >= begin and codepoint.id <= end:
+        return True
+    return False

 def is_mark(codepoint):
    """Returns true for diacritical marks (combining codepoints)."""
@@ -184,7 +195,7 @@ def main(args):
           len(codepoint.combining_ids) > 1:
            if is_letter_with_marks(codepoint, table):
                charactersSet.add((codepoint.id,
-                             chr(get_plain_letter(codepoint, table).id)))
+                             unichr(get_plain_letter(codepoint, table).id)))
            elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
                charactersSet.add((codepoint.id,
                             "".join(unichr(combining_codepoint.id)
--- a/contrib/unaccent/unaccent.rules
+++ b/contrib/unaccent/unaccent.rules
@@ -399,6 +399,26 @@
 ʦ	ts
 ʪ	ls
 ʫ	lz
+Ά	Α
+Έ	Ε
+Ή	Η
+Ί	Ι
+Ό	Ο
+Ύ	Υ
+Ώ	Ω
+ΐ	ι
+Ϊ	Ι
+Ϋ	Υ
+ά	α
+έ	ε
+ή	η
+ί	ι
+ΰ	υ
+ϊ	ι
+ϋ	υ
+ό	ο
+ύ	υ
+ώ	ω
 Ё	Е
 ё	е
 ᴀ	A
@@ -709,6 +729,207 @@
 ỽ	v
 Ỿ	Y
 ỿ	y
+ἀ	α
+ἁ	α
+ἂ	α
+ἃ	α
+ἄ	α
+ἅ	α
+ἆ	α
+ἇ	α
+Ἀ	Α
+Ἁ	Α
+Ἂ	Α
+Ἃ	Α
+Ἄ	Α
+Ἅ	Α
+Ἆ	Α
+Ἇ	Α
+ἐ	ε
+ἑ	ε
+ἒ	ε
+ἓ	ε
+ἔ	ε
+ἕ	ε
+Ἐ	Ε
+Ἑ	Ε
+Ἒ	Ε
+Ἓ	Ε
+Ἔ	Ε
+Ἕ	Ε
+ἠ	η
+ἡ	η
+ἢ	η
+ἣ	η
+ἤ	η
+ἥ	η
+ἦ	η
+ἧ	η
+Ἠ	Η
+Ἡ	Η
+Ἢ	Η
+Ἣ	Η
+Ἤ	Η
+Ἥ	Η
+Ἦ	Η
+Ἧ	Η
+ἰ	ι
+ἱ	ι
+ἲ	ι
+ἳ	ι
+ἴ	ι
+ἵ	ι
+ἶ	ι
+ἷ	ι
+Ἰ	Ι
+Ἱ	Ι
+Ἲ	Ι
+Ἳ	Ι
+Ἴ	Ι
+Ἵ	Ι
+Ἶ	Ι
+Ἷ	Ι
+ὀ	ο
+ὁ	ο
+ὂ	ο
+ὃ	ο
+ὄ	ο
+ὅ	ο
+Ὀ	Ο
+Ὁ	Ο
+Ὂ	Ο
+Ὃ	Ο
+Ὄ	Ο
+Ὅ	Ο
+ὐ	υ
+ὑ	υ
+ὒ	υ
+ὓ	υ
+ὔ	υ
+ὕ	υ
+ὖ	υ
+ὗ	υ
+Ὑ	Υ
+Ὓ	Υ
+Ὕ	Υ
+Ὗ	Υ
+ὠ	ω
+ὡ	ω
+ὢ	ω
+ὣ	ω
+ὤ	ω
+ὥ	ω
+ὦ	ω
+ὧ	ω
+Ὠ	Ω
+Ὡ	Ω
+Ὢ	Ω
+Ὣ	Ω
+Ὤ	Ω
+Ὥ	Ω
+Ὦ	Ω
+Ὧ	Ω
+ὰ	α
+ὲ	ε
+ὴ	η
+ὶ	ι
+ὸ	ο
+ὺ	υ
+ὼ	ω
+ᾀ	α
+ᾁ	α
+ᾂ	α
+ᾃ	α
+ᾄ	α
+ᾅ	α
+ᾆ	α
+ᾇ	α
+ᾈ	Α
+ᾉ	Α
+ᾊ	Α
+ᾋ	Α
+ᾌ	Α
+ᾍ	Α
+ᾎ	Α
+ᾏ	Α
+ᾐ	η
+ᾑ	η
+ᾒ	η
+ᾓ	η
+ᾔ	η
+ᾕ	η
+ᾖ	η
+ᾗ	η
+ᾘ	Η
+ᾙ	Η
+ᾚ	Η
+ᾛ	Η
+ᾜ	Η
+ᾝ	Η
+ᾞ	Η
+ᾟ	Η
+ᾠ	ω
+ᾡ	ω
+ᾢ	ω
+ᾣ	ω
+ᾤ	ω
+ᾥ	ω
+ᾦ	ω
+ᾧ	ω
+ᾨ	Ω
+ᾩ	Ω
+ᾪ	Ω
+ᾫ	Ω
+ᾬ	Ω
+ᾭ	Ω
+ᾮ	Ω
+ᾯ	Ω
+ᾰ	α
+ᾱ	α
+ᾲ	α
+ᾳ	α
+ᾴ	α
+ᾶ	α
+ᾷ	α
+Ᾰ	Α
+Ᾱ	Α
+Ὰ	Α
+ᾼ	Α
+ῂ	η
+ῃ	η
+ῄ	η
+ῆ	η
+ῇ	η
+Ὲ	Ε
+Ὴ	Η
+ῌ	Η
+ῐ	ι
+ῑ	ι
+ῒ	ι
+ῖ	ι
+ῗ	ι
+Ῐ	Ι
+Ῑ	Ι
+Ὶ	Ι
+ῠ	υ
+ῡ	υ
+ῢ	υ
+ῤ	ρ
+ῥ	ρ
+ῦ	υ
+ῧ	υ
+Ῠ	Υ
+Ῡ	Υ
+Ὺ	Υ
+Ῥ	Ρ
+ῲ	ω
+ῳ	ω
+ῴ	ω
+ῶ	ω
+ῷ	ω
+Ὸ	Ο
+Ὼ	Ω
+ῼ	Ω
 ‐	-
 ‑	-
 ‒	-