mirror of https://github.com/postgres/postgres
Add Python script for building unaccent.rules from Unicode data. Don't backpatch because unaccent changes may require tsvector/index rebuild.

Thomas Munro <thomas.munro@enterprisedb.com>
parent 4aec49899e
commit 1bbd52cb9a
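The script reads UnicodeData.txt on standard input and writes the rules on standard output; assuming it is saved as generate_unaccent_rules.py (the file name is not visible in this view), a typical invocation would look something like:

    python generate_unaccent_rules.py --expand-ligatures < UnicodeData.txt > unaccent.rules

Each emitted line is a precomposed character, a tab, and its replacement (for example "é", tab, "e").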
@@ -0,0 +1,123 @@
#!/usr/bin/python
#
# This script builds unaccent.rules on standard output when given the
# contents of UnicodeData.txt[1] on standard input. Optionally includes
# ligature expansion, if --expand-ligatures is given on the command line.
#
# The approach is to use the Unicode decomposition data to identify
# precomposed codepoints that are equivalent to a ligature of several
# letters, or a base letter with any number of diacritical marks.
# There is also a small set of special cases for codepoints that we
# traditionally support even though Unicode doesn't consider them to
# be ligatures or letters with marks.
#
# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt

import re
import sys

def print_record(codepoint, letter):
    print (unichr(codepoint) + "\t" + letter).encode("UTF-8")

class Codepoint:
    def __init__(self, id, general_category, combining_ids):
        self.id = id
        self.general_category = general_category
        self.combining_ids = combining_ids

def is_plain_letter(codepoint):
    """Return true if codepoint represents a plain ASCII letter."""
    return (codepoint.id >= ord('a') and codepoint.id <= ord('z')) or \
           (codepoint.id >= ord('A') and codepoint.id <= ord('Z'))

def is_mark(codepoint):
    """Returns true for diacritical marks (combining codepoints)."""
    return codepoint.general_category in ("Mn", "Me", "Mc")

def is_letter_with_marks(codepoint, table):
    """Returns true for plain letters combined with one or more marks."""
    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
    return len(codepoint.combining_ids) > 1 and \
           is_plain_letter(table[codepoint.combining_ids[0]]) and \
           all(is_mark(table[i]) for i in codepoint.combining_ids[1:])

def is_letter(codepoint, table):
    """Return true for letter with or without diacritical marks."""
    return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)

def get_plain_letter(codepoint, table):
    """Return the base codepoint without marks."""
    if is_letter_with_marks(codepoint, table):
        return table[codepoint.combining_ids[0]]
    elif is_plain_letter(codepoint):
        return codepoint
    else:
        raise Exception("mu")

def is_ligature(codepoint, table):
    """Return true for letters combined with letters."""
    return all(is_letter(table[i], table) for i in codepoint.combining_ids)

def get_plain_letters(codepoint, table):
    """Return a list of plain letters from a ligature."""
    assert(is_ligature(codepoint, table))
    return [get_plain_letter(table[id], table) for id in codepoint.combining_ids]

def main(expand_ligatures):
    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
    decomposition_type_pattern = re.compile(" *<[^>]*> *")

    table = {}
    all = []

    # read everything we need into memory
    for line in sys.stdin.readlines():
        fields = line.split(";")
        if len(fields) > 5:
            # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
            general_category = fields[2]
            decomposition = fields[5]
            decomposition = re.sub(decomposition_type_pattern, ' ', decomposition)
            id = int(fields[0], 16)
            combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""]
            codepoint = Codepoint(id, general_category, combining_ids)
            table[id] = codepoint
            all.append(codepoint)

    # walk through all the codepoints looking for interesting mappings
    for codepoint in all:
        if codepoint.general_category.startswith('L') and \
           len(codepoint.combining_ids) > 1:
            if is_letter_with_marks(codepoint, table):
                print_record(codepoint.id,
                             chr(get_plain_letter(codepoint, table).id))
            elif expand_ligatures and is_ligature(codepoint, table):
                print_record(codepoint.id,
                             "".join(unichr(combining_codepoint.id)
                                     for combining_codepoint
                                     in get_plain_letters(codepoint, table)))

    # some special cases
    print_record(0x00d8, "O")  # LATIN CAPITAL LETTER O WITH STROKE
    print_record(0x00f8, "o")  # LATIN SMALL LETTER O WITH STROKE
    print_record(0x0110, "D")  # LATIN CAPITAL LETTER D WITH STROKE
    print_record(0x0111, "d")  # LATIN SMALL LETTER D WITH STROKE
    print_record(0x0131, "i")  # LATIN SMALL LETTER DOTLESS I
    print_record(0x0126, "H")  # LATIN CAPITAL LETTER H WITH STROKE
    print_record(0x0127, "h")  # LATIN SMALL LETTER H WITH STROKE
    print_record(0x0141, "L")  # LATIN CAPITAL LETTER L WITH STROKE
    print_record(0x0142, "l")  # LATIN SMALL LETTER L WITH STROKE
    print_record(0x0149, "'n") # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
    print_record(0x0166, "T")  # LATIN CAPITAL LETTER T WITH STROKE
    print_record(0x0167, "t")  # LATIN SMALL LETTER T WITH STROKE
    print_record(0x0401, u"\u0415") # CYRILLIC CAPITAL LETTER IO
    print_record(0x0451, u"\u0435") # CYRILLIC SMALL LETTER IO
    if expand_ligatures:
        print_record(0x00c6, "AE") # LATIN CAPITAL LETTER AE
        print_record(0x00df, "ss") # LATIN SMALL LETTER SHARP S
        print_record(0x00e6, "ae") # LATIN SMALL LETTER AE
        print_record(0x0152, "OE") # LATIN CAPITAL LIGATURE OE
        print_record(0x0153, "oe") # LATIN SMALL LIGATURE OE

if __name__ == "__main__":
    main(len(sys.argv) == 2 and sys.argv[1] == "--expand-ligatures")
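To make the decomposition-based approach described in the script's header comment concrete, here is a small standalone sketch that is not part of the commit: it uses NFD normalization from Python's built-in unicodedata module instead of parsing UnicodeData.txt, but it performs the same reduction of a precomposed letter to its unmarked base letter (written in Python 2 to match the script above).

    # Standalone sketch, not part of the commit: strip combining marks
    # using NFD decomposition from the standard unicodedata module.
    import unicodedata

    def strip_marks(char):
        # NFD splits a precomposed character into base letter + combining marks;
        # dropping the Mn/Me/Mc codepoints leaves just the base letter.
        decomposed = unicodedata.normalize("NFD", char)
        return u"".join(c for c in decomposed
                        if unicodedata.category(c) not in ("Mn", "Me", "Mc"))

    for char in (u"\u00e9", u"\u00c5", u"\u1e49"):  # é, Å, ṉ
        print (char + u" -> " + strip_marks(char)).encode("UTF-8")

The committed script parses UnicodeData.txt directly rather than relying on unicodedata, which pins the mapping to a specific Unicode version (7.0.0, per the URL in the header) and lets it add its own special cases and the optional ligature expansion.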