3838# For now we are being conservative by including only Latin and Greek. This
3939# could be extended in future based on feedback from people with relevant
4040# language knowledge.
# Inclusive (first, last) codepoint ranges that count as "plain" letters.
PLAIN_LETTER_RANGES = (
    (ord('a'), ord('z')),  # Latin lower case
    (ord('A'), ord('Z')),  # Latin upper case
    (0x03b1, 0x03c9),      # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
    (0x0391, 0x03a9),      # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
)
4545
4646# Combining marks follow a "base" character, and result in a composite
4747# character. Example: "U&'A\0300'" produces "À". There are three types of
5151# https://en.wikipedia.org/wiki/Combining_character
5252# https://www.unicode.org/charts/PDF/U0300.pdf
5353# https://www.unicode.org/charts/PDF/U20D0.pdf
# (first, last) codepoint ranges of combining marks, grouped by category.
COMBINING_MARK_RANGES = (
    (0x0300, 0x0362),  # Mn: Accents, IPA
    (0x20dd, 0x20e0),  # Me: Symbols
    (0x20e2, 0x20e4),  # Me: Screen, keycap, triangle
)
57+
5758
5859def print_record (codepoint , letter ):
5960 if letter :
@@ -63,12 +64,14 @@ def print_record(codepoint, letter):
6364
6465 print (output )
6566
67+
class Codepoint:
    """A single codepoint record.

    Attributes:
        id: the codepoint value; compared against integer ranges and passed
            to ``chr()`` elsewhere in this file.
        general_category: the general category code (for example "Mn").
        combining_ids: the ids this codepoint is composed of; each one is
            used as a key into the codepoint table.
    """

    def __init__(self, id, general_category, combining_ids):
        # The parameter name ``id`` shadows the builtin, but it is part of
        # the constructor's interface and is kept for compatibility.
        self.id = id
        self.general_category = general_category
        self.combining_ids = combining_ids

    def __repr__(self):
        """Return a debugging representation listing all three attributes."""
        return "{}(id={!r}, general_category={!r}, combining_ids={!r})".format(
            type(self).__name__, self.id, self.general_category,
            self.combining_ids)
7173
74+
7275def is_mark_to_remove (codepoint ):
7376 """Return true if this is a combining mark to remove."""
7477 if not is_mark (codepoint ):
@@ -79,17 +82,20 @@ def is_mark_to_remove(codepoint):
7982 return True
8083 return False
8184
85+
def is_plain_letter(codepoint):
    """Return true if codepoint represents a "plain letter".

    A plain letter is one whose ``id`` falls inside any of the inclusive
    (begin, end) ranges listed in PLAIN_LETTER_RANGES.
    """
    return any(begin <= codepoint.id <= end
               for begin, end in PLAIN_LETTER_RANGES)
8892
93+
def is_mark(codepoint):
    """Return true for diacritical marks (combining codepoints).

    The decision is made purely on ``codepoint.general_category``: the
    categories "Mn", "Me" and "Mc" are treated as marks.
    """
    return codepoint.general_category in ("Mn", "Me", "Mc")
9297
98+
9399def is_letter_with_marks (codepoint , table ):
94100 """Returns true for letters combined with one or more marks."""
95101 # See https://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
@@ -105,16 +111,18 @@ def is_letter_with_marks(codepoint, table):
105111
106112 # Check if the base letter of this letter has marks.
107113 codepoint_base = codepoint .combining_ids [0 ]
108- if ( is_plain_letter (table [codepoint_base ]) is False and \
109- is_letter_with_marks (table [codepoint_base ], table ) is False ) :
114+ if is_plain_letter (table [codepoint_base ]) is False and \
115+ is_letter_with_marks (table [codepoint_base ], table ) is False :
110116 return False
111117
112118 return True
113119
120+
def is_letter(codepoint, table):
    """Return true for a letter with or without diacritical marks.

    ``table`` is only consulted when the codepoint is not already a plain
    letter, in which case it is passed on to is_letter_with_marks().
    """
    return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
117124
125+
118126def get_plain_letter (codepoint , table ):
119127 """Return the base codepoint without marks. If this codepoint has more
120128 than one combining character, do a recursive lookup on the table to
@@ -133,15 +141,18 @@ def get_plain_letter(codepoint, table):
133141 # Should not come here
134142 assert (False )
135143
144+
def is_ligature(codepoint, table):
    """Return true for letters combined with letters.

    Every id in ``codepoint.combining_ids`` is looked up in ``table`` and
    must itself satisfy is_letter().
    """
    return all(is_letter(table[i], table) for i in codepoint.combining_ids)
139148
149+
def get_plain_letters(codepoint, table):
    """Return a list of plain letters from a ligature.

    The codepoint must be a ligature (checked with an assertion); each of
    its combining ids is resolved through get_plain_letter().
    """
    assert is_ligature(codepoint, table)
    # Loop variable deliberately not called ``id`` so the builtin stays usable.
    return [get_plain_letter(table[combining_id], table)
            for combining_id in codepoint.combining_ids]
144154
155+
145156def parse_cldr_latin_ascii_transliterator (latinAsciiFilePath ):
146157 """Parse the XML file and return a set of tuples (src, trg), where "src"
147158 is the original character and "trg" the substitute."""
@@ -189,21 +200,23 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
189200
190201 return charactersSet
191202
203+
def special_cases():
    """Return the special cases which are not handled by other methods.

    The result is a new set of (codepoint id, replacement string) tuples on
    every call.
    """
    return {
        # Cyrillic
        (0x0401, "\u0415"),  # CYRILLIC CAPITAL LETTER IO
        (0x0451, "\u0435"),  # CYRILLIC SMALL LETTER IO

        # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
        (0x2103, "\xb0C"),   # DEGREE CELSIUS
        (0x2109, "\xb0F"),   # DEGREE FAHRENHEIT
        (0x2117, "(P)"),     # SOUND RECORDING COPYRIGHT
    }
206218
219+
207220def main (args ):
208221 # https://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
209222 decomposition_type_pattern = re .compile (" *<[^>]*> *" )
@@ -238,12 +251,12 @@ def main(args):
238251 len (codepoint .combining_ids ) > 1 :
239252 if is_letter_with_marks (codepoint , table ):
240253 charactersSet .add ((codepoint .id ,
241- chr (get_plain_letter (codepoint , table ).id )))
254+ chr (get_plain_letter (codepoint , table ).id )))
242255 elif args .noLigaturesExpansion is False and is_ligature (codepoint , table ):
243256 charactersSet .add ((codepoint .id ,
244- "" .join (chr (combining_codepoint .id )
245- for combining_codepoint \
246- in get_plain_letters (codepoint , table ))))
257+ "" .join (chr (combining_codepoint .id )
258+ for combining_codepoint
259+ in get_plain_letters (codepoint , table ))))
247260 elif is_mark_to_remove (codepoint ):
248261 charactersSet .add ((codepoint .id , None ))
249262
@@ -258,6 +271,7 @@ def main(args):
258271 for characterPair in charactersList :
259272 print_record (characterPair [0 ], characterPair [1 ])
260273
274+
261275if __name__ == "__main__" :
262276 parser = argparse .ArgumentParser (description = 'This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.' )
263277 parser .add_argument ("--unicode-data-file" , help = "Path to formatted text file corresponding to UnicodeData.txt." , type = str , required = True , dest = 'unicodeDataFilePath' )
0 commit comments