|
32 | 32 | # The approach is to be Python3 compatible with Python2 "backports". |
33 | 33 | from __future__ import print_function |
34 | 34 | from __future__ import unicode_literals |
| 35 | +# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped |
| 36 | + |
| 37 | +import argparse |
35 | 38 | import codecs |
| 39 | +import re |
36 | 40 | import sys |
| 41 | +import xml.etree.ElementTree as ET |
37 | 42 |
|
| 43 | +# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped |
38 | 44 | if sys.version_info[0] <= 2: |
39 | 45 | # Encode stdout as UTF-8, so we can just print to it |
40 | 46 | sys.stdout = codecs.getwriter('utf8')(sys.stdout) |
|
45 | 51 | # Python 2 and 3 compatible bytes call |
46 | 52 | def bytes(source, encoding='ascii', errors='strict'): |
47 | 53 | return source.encode(encoding=encoding, errors=errors) |
| 54 | +else: |
48 | 55 | # END: Python 2/3 compatibility - remove when Python 2 compatibility dropped |
49 | | - |
50 | | -import re |
51 | | -import argparse |
52 | | -import sys |
53 | | -import xml.etree.ElementTree as ET |
| 56 | + sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer) |
54 | 57 |
|
55 | 58 | # The ranges of Unicode characters that we consider to be "plain letters". |
56 | 59 | # For now we are being conservative by including only Latin and Greek. This |
@@ -233,21 +236,22 @@ def main(args): |
233 | 236 | charactersSet = set() |
234 | 237 |
|
235 | 238 | # read file UnicodeData.txt |
236 | | - unicodeDataFile = open(args.unicodeDataFilePath, 'r') |
237 | | - |
238 | | - # read everything we need into memory |
239 | | - for line in unicodeDataFile: |
240 | | - fields = line.split(";") |
241 | | - if len(fields) > 5: |
242 | | - # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt |
243 | | - general_category = fields[2] |
244 | | - decomposition = fields[5] |
245 | | - decomposition = re.sub(decomposition_type_pattern, ' ', decomposition) |
246 | | - id = int(fields[0], 16) |
247 | | - combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""] |
248 | | - codepoint = Codepoint(id, general_category, combining_ids) |
249 | | - table[id] = codepoint |
250 | | - all.append(codepoint) |
| 239 | + with codecs.open( |
| 240 | + args.unicodeDataFilePath, mode='r', encoding='UTF-8', |
| 241 | + ) as unicodeDataFile: |
| 242 | + # read everything we need into memory |
| 243 | + for line in unicodeDataFile: |
| 244 | + fields = line.split(";") |
| 245 | + if len(fields) > 5: |
| 246 | + # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt |
| 247 | + general_category = fields[2] |
| 248 | + decomposition = fields[5] |
| 249 | + decomposition = re.sub(decomposition_type_pattern, ' ', decomposition) |
| 250 | + id = int(fields[0], 16) |
| 251 | + combining_ids = [int(s, 16) for s in decomposition.split(" ") if s != ""] |
| 252 | + codepoint = Codepoint(id, general_category, combining_ids) |
| 253 | + table[id] = codepoint |
| 254 | + all.append(codepoint) |
251 | 255 |
|
252 | 256 | # walk through all the codepoints looking for interesting mappings |
253 | 257 | for codepoint in all: |
|
0 commit comments