1- #!/usr/bin/python2
1+ #!/usr/bin/python
22# -*- coding: utf-8 -*-
33#
44# This script builds unaccent.rules on standard output when given the
2323# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
2424# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
2525
26+ # BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
27+ # The code targets Python 3; the shims below backport what it needs to Python 2.
28+ from __future__ import print_function
29+ from __future__ import unicode_literals
30+ import codecs
31+ import sys
32+
if sys.version_info[0] <= 2:
    # Everything in this branch runs on Python 2 only; on Python 3 the
    # built-in chr() and bytes() and the default sys.stdout are used as-is.

    # Wrap stdout in a UTF-8 encoding writer, so unicode strings can be
    # passed straight to print().
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)

    # Python 2's chr() only accepts 0-255; unichr() is its counterpart for
    # arbitrary code points, matching Python 3's chr().
    chr = unichr

    # Python 2's bytes is an alias of str and takes no encoding argument,
    # so provide a Python 3 style bytes(source, encoding) call.
    def bytes(source, encoding='ascii', errors='strict'):
        """Return source encoded to a byte string using encoding/errors."""
        return source.encode(encoding=encoding, errors=errors)
43+ # END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
2644
2745import re
2846import argparse
3957 (0x0391 , 0x03a9 )) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
4058
def print_record(codepoint, letter):
    """Print one rule line for a character and its replacement.

    codepoint -- integer Unicode code point of the source character
    letter    -- replacement text written after the separator
    """
    # print() is a function here and returns None, so the text must not be
    # encoded after the call; encoding is left to sys.stdout.  On Python 2
    # the compatibility section at the top of the file rebinds chr to unichr.
    print(chr(codepoint) + "\t " + letter)
4361
4462class Codepoint :
4563 def __init__ (self , id , general_category , combining_ids ):
@@ -116,7 +134,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
116134 charactersSet = set ()
117135
118136 # RegEx to parse rules
119- rulePattern = re .compile (ur '^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;' )
137+ rulePattern = re .compile (r '^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;' )
120138
121139 # construct tree from XML
122140 transliterationTree = ET .parse (latinAsciiFilePath )
@@ -134,7 +152,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
134152 # Group 3: "trg" text between single quotes. None if group 4 matched.
135153 # Group 4: plain, unquoted "trg" text. None if group 3 matched.
136154 if matches is not None :
137- src = matches .group (1 ) if matches .group (1 ) is not None else matches .group (2 ).decode ('unicode-escape' )
155+ src = matches .group (1 ) if matches .group (1 ) is not None else bytes ( matches .group (2 ), 'UTF-8' ).decode ('unicode-escape' )
138156 trg = matches .group (3 ) if matches .group (3 ) is not None else matches .group (4 )
139157
140158 # "'" and """ are escaped
@@ -195,10 +213,10 @@ def main(args):
195213 len (codepoint .combining_ids ) > 1 :
196214 if is_letter_with_marks (codepoint , table ):
197215 charactersSet .add ((codepoint .id ,
198- unichr (get_plain_letter (codepoint , table ).id )))
216+ chr (get_plain_letter (codepoint , table ).id )))
199217 elif args .noLigaturesExpansion is False and is_ligature (codepoint , table ):
200218 charactersSet .add ((codepoint .id ,
201- "" .join (unichr (combining_codepoint .id )
219+ "" .join (chr (combining_codepoint .id )
202220 for combining_codepoint \
203221 in get_plain_letters (codepoint , table ))))
204222
0 commit comments