@ -38,10 +38,10 @@ sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
# For now we are being conservative by including only Latin and Greek. This
# could be extended in future based on feedback from people with relevant
# language knowledge.
# Each entry is a (begin, end) pair of code points; is_plain_letter() treats
# both ends as inclusive.
PLAIN_LETTER_RANGES = (
    (ord('a'), ord('z')),  # Latin lower case
    (ord('A'), ord('Z')),  # Latin upper case
    (0x03b1, 0x03c9),      # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
    (0x0391, 0x03a9),      # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
)
# Combining marks follow a "base" character, and result in a composite
# character.  Example: "U&'A\0300'" produces "À".  There are three types of
@ -51,9 +51,10 @@ PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
# https://en.wikipedia.org/wiki/Combining_character
# https://www.unicode.org/charts/PDF/U0300.pdf
# https://www.unicode.org/charts/PDF/U20D0.pdf
# (begin, end) code point pairs covering the combining marks of interest.
# The Mn/Me tags name the Unicode general category of each range.
COMBINING_MARK_RANGES = (
    (0x0300, 0x0362),  # Mn: Accents, IPA
    (0x20dd, 0x20e0),  # Me: Symbols
    (0x20e2, 0x20e4),  # Me: Screen, keycap, triangle
)
def print_record ( codepoint , letter ) :
if letter :
@ -63,12 +64,14 @@ def print_record(codepoint, letter):
print ( output )
class Codepoint:
    """One code point record together with its decomposition.

    Attributes are stored exactly as passed to the constructor:
      id                -- numeric code point value
      general_category  -- Unicode general category string (e.g. "Mn")
      combining_ids     -- ids of the code points this one decomposes into
    """

    def __init__(self, id, general_category, combining_ids):
        # The parameter is named "id" (shadowing the builtin) because callers
        # may pass it by keyword; keep the name for compatibility.
        self.id = id
        self.general_category = general_category
        self.combining_ids = combining_ids

    def __repr__(self):
        # Debugging aid only; nothing in the generator depends on this text.
        return "Codepoint(id=%r, general_category=%r, combining_ids=%r)" % (
            self.id, self.general_category, self.combining_ids)
def is_mark_to_remove ( codepoint ) :
""" Return true if this is a combining mark to remove. """
if not is_mark ( codepoint ) :
@ -79,17 +82,20 @@ def is_mark_to_remove(codepoint):
return True
return False
def is_plain_letter(codepoint):
    """Return true if codepoint represents a "plain letter".

    A codepoint qualifies when its ``id`` falls inside any of the
    (begin, end) pairs in PLAIN_LETTER_RANGES, with both ends inclusive.
    Always returns a real bool, since callers compare the result with
    ``is False``.
    """
    return any(begin <= codepoint.id <= end
               for begin, end in PLAIN_LETTER_RANGES)
def is_mark(codepoint):
    """Returns true for diacritical marks (combining codepoints).

    The decision is based solely on ``codepoint.general_category``: the three
    Unicode "Mark" categories are Mn (nonspacing), Mc (spacing combining) and
    Me (enclosing).
    """
    # The category strings must match exactly, with no surrounding whitespace.
    return codepoint.general_category in ("Mn", "Me", "Mc")
def is_letter_with_marks ( codepoint , table ) :
""" Returns true for letters combined with one or more marks. """
# See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
@ -105,16 +111,18 @@ def is_letter_with_marks(codepoint, table):
# Check if the base letter of this letter has marks.
codepoint_base = codepoint . combining_ids [ 0 ]
if ( is_plain_letter ( table [ codepoint_base ] ) is False and \
is_letter_with_marks ( table [ codepoint_base ] , table ) is False ) :
if is_plain_letter ( table [ codepoint_base ] ) is False and \
is_letter_with_marks ( table [ codepoint_base ] , table ) is False :
return False
return True
def is_letter(codepoint, table):
    """Return true for letter with or without diacritical marks.

    Plain letters are accepted immediately; anything else is delegated to
    is_letter_with_marks(), which receives ``table`` for its own lookups.
    """
    if is_plain_letter(codepoint):
        return True
    return is_letter_with_marks(codepoint, table)
def get_plain_letter ( codepoint , table ) :
""" Return the base codepoint without marks. If this codepoint has more
than one combining character , do a recursive lookup on the table to
@ -133,15 +141,18 @@ def get_plain_letter(codepoint, table):
# Should not come here
assert ( False )
def is_ligature(codepoint, table):
    """Return true for letters combined with letters.

    Every id in ``codepoint.combining_ids`` is looked up in ``table`` and
    must itself be a letter (per is_letter).  The scan stops at the first
    component that is not.
    """
    for component_id in codepoint.combining_ids:
        if not is_letter(table[component_id], table):
            return False
    return True
def get_plain_letters(codepoint, table):
    """Return a list of plain letters from a ligature.

    The result has one entry per id in ``codepoint.combining_ids``, in the
    same order, each obtained via get_plain_letter() on the table entry.
    """
    # Internal invariant: callers are expected to pass ligatures only.
    assert is_ligature(codepoint, table)
    plain_letters = []
    for component_id in codepoint.combining_ids:
        plain_letters.append(get_plain_letter(table[component_id], table))
    return plain_letters
def parse_cldr_latin_ascii_transliterator ( latinAsciiFilePath ) :
""" Parse the XML file and return a set of tuples (src, trg), where " src "
is the original character and " trg " the substitute . """
@ -189,21 +200,23 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
return charactersSet
def special_cases():
    """Returns the special cases which are not handled by other methods.

    The result is a new set of (codepoint_id, replacement_string) tuples,
    built fresh on every call so callers may mutate it freely.
    """
    charactersSet = {
        # Cyrillic
        (0x0401, "\u0415"),  # CYRILLIC CAPITAL LETTER IO
        (0x0451, "\u0435"),  # CYRILLIC SMALL LETTER IO

        # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
        (0x2103, "\xb0C"),   # DEGREE CELSIUS
        (0x2109, "\xb0F"),   # DEGREE FAHRENHEIT
        (0x2117, "(P)"),     # SOUND RECORDING COPYRIGHT
    }

    return charactersSet
def main ( args ) :
# https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
decomposition_type_pattern = re . compile ( " *<[^>]*> * " )
@ -238,12 +251,12 @@ def main(args):
len ( codepoint . combining_ids ) > 1 :
if is_letter_with_marks ( codepoint , table ) :
charactersSet . add ( ( codepoint . id ,
chr ( get_plain_letter ( codepoint , table ) . id ) ) )
chr ( get_plain_letter ( codepoint , table ) . id ) ) )
elif args . noLigaturesExpansion is False and is_ligature ( codepoint , table ) :
charactersSet . add ( ( codepoint . id ,
" " . join ( chr ( combining_codepoint . id )
for combining_codepoint \
in get_plain_letters ( codepoint , table ) ) ) )
" " . join ( chr ( combining_codepoint . id )
for combining_codepoint
in get_plain_letters ( codepoint , table ) ) ) )
elif is_mark_to_remove ( codepoint ) :
charactersSet . add ( ( codepoint . id , None ) )
@ -258,6 +271,7 @@ def main(args):
for characterPair in charactersList :
print_record ( characterPair [ 0 ] , characterPair [ 1 ] )
if __name__ == " __main__ " :
parser = argparse . ArgumentParser ( description = ' This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments. ' )
parser . add_argument ( " --unicode-data-file " , help = " Path to formatted text file corresponding to UnicodeData.txt. " , type = str , required = True , dest = ' unicodeDataFilePath ' )