1- #!/usr/bin/python
1+ #!/usr/bin/python2
2+ # -*- coding: utf-8 -*-
23#
34# This script builds unaccent.rules on standard output when given the
4- # contents of UnicodeData.txt[1] on standard input. Optionally includes
5- # ligature expansion, if --expand-ligatures is given on the command line.
5+ # contents of UnicodeData.txt [1] and Latin-ASCII.xml [2] given as
6+ # arguments. Ligature expansion and the Unicode CLDR Latin-ASCII
7+ # transliterator are enabled by default; both can be disabled with the
8+ # "--no-ligatures-expansion" command line option.
69#
710# The approach is to use the Unicode decomposition data to identify
811# precomposed codepoints that are equivalent to a ligature of several
912# letters, or a base letter with any number of diacritical marks.
10- # There is also a small set of special cases for codepoints that we
11- # traditionally support even though Unicode doesn't consider them to
12- # be ligatures or letters with marks.
1313#
14- # [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
14+ # This approach handles most letters with diacritical marks and some
15+ # ligatures. However, several characters (notably a majority of
16+ # ligatures) don't have decomposition. To handle all these cases, one can
17+ # use a standard Unicode transliterator available in Common Locale Data
18+ # Repository (CLDR): Latin-ASCII. This transliterator associates Unicode
19+ # characters to ASCII-range equivalent. Unless "--no-ligatures-expansion"
20+ # option is enabled, the XML file of this transliterator [2] -- given as a
21+ # command line argument -- will be parsed and used.
22+ #
23+ # [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
24+ # [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
25+
1526
1627import re
28+ import argparse
1729import sys
30+ import xml .etree .ElementTree as ET
1831
1932def print_record (codepoint , letter ):
2033 print (unichr (codepoint ) + "\t " + letter ).encode ("UTF-8" )
@@ -63,15 +76,73 @@ def get_plain_letters(codepoint, table):
6376 assert (is_ligature (codepoint , table ))
6477 return [get_plain_letter (table [id ], table ) for id in codepoint .combining_ids ]
6578
66- def main (expand_ligatures ):
def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
    """Parse the CLDR Latin-ASCII XML file and collect its simple rules.

    Args:
        latinAsciiFilePath: location of Latin-ASCII.xml; it is handed
            straight to xml.etree.ElementTree.parse().

    Returns:
        A set of tuples (src, trg), where "src" is the code point (an int)
        of the original character and "trg" is the substitute string.
        Rules that do not have the "<src> -> <trg> ;" shape are ignored.
    """
    # A rule reads "<src> <arrow> <trg> ;" where the arrow is U+2192.
    #
    # Group 1: plain "src" char. None if group 2 matched.
    # Group 2: unicode-escaped "src" char (e.g. "\u0110"). None if group 1
    #          matched.
    # Group 3: "trg" written between single quotes. None if group 4 matched.
    # Group 4: plain (unquoted) "trg". None if group 3 matched.
    #
    # Written as a plain u"" literal (not ur"") so the file parses on both
    # Python 2.7 and Python 3.
    rule_pattern = re.compile(
        u"^(?:(.)|(\\\\u[0-9a-fA-F]{4})) \u2192 (?:'(.+)'|(.+)) ;")

    # construct tree from XML
    root = ET.parse(latinAsciiFilePath).getroot()

    characters = set()
    for rule in root.findall("./transforms/transform/tRule"):
        # An empty <tRule/> element has no text at all.
        if rule.text is None:
            continue

        match = rule_pattern.search(rule.text)
        if match is None:
            continue

        plain_src, escaped_src, quoted_trg, plain_trg = match.groups()

        if plain_src is not None:
            src = plain_src
        else:
            # The regex guarantees pure ASCII here, so the round trip
            # through bytes is safe and works on Python 2 and 3 alike.
            src = escaped_src.encode("ascii").decode("unicode-escape")

        trg = quoted_trg if quoted_trg is not None else plain_trg

        # "'" and '"' are backslash-escaped in the rules
        trg = trg.replace("\\'", "'").replace('\\"', '"')

        # the parser of unaccent only accepts non-whitespace characters
        # for "src" and "trg" (see unaccent.c)
        if not src.isspace() and not trg.isspace():
            characters.add((ord(src), trg))

    return characters
115+
def special_cases():
    """Return the special cases which are not handled by other methods.

    Returns:
        A set of tuples (src, trg), where "src" is the code point (an int)
        of the original character and "trg" is the substitute string.
        No substitute contains whitespace, because the parser of unaccent
        only accepts non-whitespace characters.
    """
    return {
        # Cyrillic
        (0x0401, u"\u0415"),  # CYRILLIC CAPITAL LETTER IO
        (0x0451, u"\u0435"),  # CYRILLIC SMALL LETTER IO

        # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
        (0x2103, u"\xb0C"),   # DEGREE CELSIUS
        (0x2109, u"\xb0F"),   # DEGREE FAHRENHEIT
        (0x2117, u"(P)"),     # SOUND RECORDING COPYRIGHT
    }
130+
131+ def main (args ):
67132 # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
68133 decomposition_type_pattern = re .compile (" *<[^>]*> *" )
69134
70135 table = {}
71136 all = []
72137
138+ # unordered set to ensure uniqueness
139+ charactersSet = set ()
140+
141+ # read file UnicodeData.txt
142+ unicodeDataFile = open (args .unicodeDataFilePath , 'r' )
143+
73144 # read everything we need into memory
74- for line in sys . stdin . readlines () :
145+ for line in unicodeDataFile :
75146 fields = line .split (";" )
76147 if len (fields ) > 5 :
77148 # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
@@ -89,35 +160,34 @@ def main(expand_ligatures):
89160 if codepoint .general_category .startswith ('L' ) and \
90161 len (codepoint .combining_ids ) > 1 :
91162 if is_letter_with_marks (codepoint , table ):
92- print_record (codepoint .id ,
93- chr (get_plain_letter (codepoint , table ).id ))
94- elif expand_ligatures and is_ligature (codepoint , table ):
95- print_record (codepoint .id ,
163+ charactersSet . add ( (codepoint .id ,
164+ chr (get_plain_letter (codepoint , table ).id )))
165+ elif args . noLigaturesExpansion is False and is_ligature (codepoint , table ):
166+ charactersSet . add ( (codepoint .id ,
96167 "" .join (unichr (combining_codepoint .id )
97168 for combining_codepoint \
98- in get_plain_letters (codepoint , table )))
99-
100- # some special cases
101- print_record (0x00d8 , "O" ) # LATIN CAPITAL LETTER O WITH STROKE
102- print_record (0x00f8 , "o" ) # LATIN SMALL LETTER O WITH STROKE
103- print_record (0x0110 , "D" ) # LATIN CAPITAL LETTER D WITH STROKE
104- print_record (0x0111 , "d" ) # LATIN SMALL LETTER D WITH STROKE
105- print_record (0x0131 , "i" ) # LATIN SMALL LETTER DOTLESS I
106- print_record (0x0126 , "H" ) # LATIN CAPITAL LETTER H WITH STROKE
107- print_record (0x0127 , "h" ) # LATIN SMALL LETTER H WITH STROKE
108- print_record (0x0141 , "L" ) # LATIN CAPITAL LETTER L WITH STROKE
109- print_record (0x0142 , "l" ) # LATIN SMALL LETTER L WITH STROKE
110- print_record (0x0149 , "'n" ) # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
111- print_record (0x0166 , "T" ) # LATIN CAPITAL LETTER T WITH STROKE
112- print_record (0x0167 , "t" ) # LATIN SMALL LETTER t WITH STROKE
113- print_record (0x0401 , u"\u0415 " ) # CYRILLIC CAPITAL LETTER IO
114- print_record (0x0451 , u"\u0435 " ) # CYRILLIC SMALL LETTER IO
115- if expand_ligatures :
116- print_record (0x00c6 , "AE" ) # LATIN CAPITAL LETTER AE
117- print_record (0x00df , "ss" ) # LATIN SMALL LETTER SHARP S
118- print_record (0x00e6 , "ae" ) # LATIN SMALL LETTER AE
119- print_record (0x0152 , "OE" ) # LATIN CAPITAL LIGATURE OE
120- print_record (0x0153 , "oe" ) # LATIN SMALL LIGATURE OE
169+ in get_plain_letters (codepoint , table ))))
170+
171+ # add CLDR Latin-ASCII characters
172+ if not args .noLigaturesExpansion :
173+ charactersSet |= parse_cldr_latin_ascii_transliterator (args .latinAsciiFilePath )
174+ charactersSet |= special_cases ()
175+
176+ # sort for more convenient display
177+ charactersList = sorted (charactersSet , key = lambda characterPair : characterPair [0 ])
178+
179+ for characterPair in charactersList :
180+ print_record (characterPair [0 ], characterPair [1 ])
121181
if __name__ == "__main__":
    # Command-line entry point: collect the input file locations and the
    # ligature switch, validate their combination, then hand over to main().
    parser = argparse.ArgumentParser(
        description='This script builds unaccent.rules on standard output '
                    'when given the contents of UnicodeData.txt and '
                    'Latin-ASCII.xml given as arguments.')
    parser.add_argument(
        "--unicode-data-file",
        help='Path to formatted text file corresponding to UnicodeData.txt. '
             'See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.',
        type=str, required=True, dest='unicodeDataFilePath')
    parser.add_argument(
        "--latin-ascii-file",
        help='Path to XML file from Unicode Common Locale Data Repository '
             '(CLDR) corresponding to Latin-ASCII transliterator '
             '(Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/'
             '12304/tags/release-28/common/transforms/Latin-ASCII.xml>.',
        type=str, dest='latinAsciiFilePath')
    parser.add_argument(
        "--no-ligatures-expansion",
        help='Do not expand ligatures and do not use Unicode CLDR '
             'Latin-ASCII transliterator. By default, this option is not '
             'enabled and "--latin-ascii-file" argument is required. If '
             'this option is enabled, "--latin-ascii-file" argument is '
             'optional and ignored.',
        action="store_true", dest='noLigaturesExpansion')
    args = parser.parse_args()

    # The transliterator file is only needed when ligatures are expanded.
    if not args.noLigaturesExpansion and args.latinAsciiFilePath is None:
        # Terminate the message with a newline so the shell prompt does not
        # end up glued to it.
        sys.stderr.write(
            'You must specify the path to Latin-ASCII transliterator file '
            'with "--latin-ascii-file" option or use '
            '"--no-ligatures-expansion" option. Use "-h" option for help.\n')
        sys.exit(1)

    main(args)
0 commit comments