|
|
@ -61,8 +61,25 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case |
|
|
|
(0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA |
|
|
|
(0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA |
|
|
|
(0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA |
|
|
|
(0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Combining marks follow a "base" character, and result in a composite
# character. Example: "U&'A\0300'" produces "À". There are three types of
# combining marks: enclosing (Me), non-spacing combining (Mn), spacing
# combining (Mc). We identify the ranges of marks we feel safe removing.
# Each entry is an inclusive (first, last) pair of codepoints.
# References:
#   https://en.wikipedia.org/wiki/Combining_character
#   https://www.unicode.org/charts/PDF/U0300.pdf
#   https://www.unicode.org/charts/PDF/U20D0.pdf
COMBINING_MARK_RANGES = ((0x0300, 0x0362),  # Mn: Accents, IPA
                         (0x20dd, 0x20e0),  # Me: Symbols
                         (0x20e2, 0x20e4),)  # Me: Screen, keycap, triangle
|
|
|
|
|
|
|
|
|
|
|
def print_record(codepoint, letter):
    """Print one rule line for the given codepoint on standard output.

    codepoint -- integer code of the character to be replaced.
    letter    -- replacement text, or None/empty when the character is
                 simply to be removed (no replacement column is written).

    With a replacement the line is "<char>\\t<letter>"; without one it is
    just "<char>".
    """
    if letter:
        output = chr(codepoint) + "\t" + letter
    else:
        # No replacement: emit the character alone rather than trying to
        # concatenate None, which would raise TypeError.
        output = chr(codepoint)

    print(output)
|
|
|
|
|
|
|
|
|
|
|
class Codepoint: |
|
|
|
class Codepoint: |
|
|
|
def __init__(self, id, general_category, combining_ids): |
|
|
|
def __init__(self, id, general_category, combining_ids): |
|
|
@ -70,6 +87,16 @@ class Codepoint: |
|
|
|
self.general_category = general_category |
|
|
|
self.general_category = general_category |
|
|
|
self.combining_ids = combining_ids |
|
|
|
self.combining_ids = combining_ids |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_mark_to_remove(codepoint):
    """Tell whether codepoint is a combining mark that should be stripped.

    A codepoint qualifies only if is_mark() accepts it and its id lies
    inside one of the inclusive intervals of COMBINING_MARK_RANGES.
    """
    if is_mark(codepoint):
        return any(low <= codepoint.id <= high
                   for low, high in COMBINING_MARK_RANGES)
    return False
|
|
|
|
|
|
|
|
|
|
|
def is_plain_letter(codepoint): |
|
|
|
def is_plain_letter(codepoint): |
|
|
|
"""Return true if codepoint represents a "plain letter".""" |
|
|
|
"""Return true if codepoint represents a "plain letter".""" |
|
|
|
for begin, end in PLAIN_LETTER_RANGES: |
|
|
|
for begin, end in PLAIN_LETTER_RANGES: |
|
|
@ -234,6 +261,8 @@ def main(args): |
|
|
|
"".join(chr(combining_codepoint.id) |
|
|
|
"".join(chr(combining_codepoint.id) |
|
|
|
for combining_codepoint \ |
|
|
|
for combining_codepoint \ |
|
|
|
in get_plain_letters(codepoint, table)))) |
|
|
|
in get_plain_letters(codepoint, table)))) |
|
|
|
|
|
|
|
elif is_mark_to_remove(codepoint): |
|
|
|
|
|
|
|
charactersSet.add((codepoint.id, None)) |
|
|
|
|
|
|
|
|
|
|
|
# add CLDR Latin-ASCII characters |
|
|
|
# add CLDR Latin-ASCII characters |
|
|
|
if not args.noLigaturesExpansion: |
|
|
|
if not args.noLigaturesExpansion: |
|
|
|