|
10 | 10 | import locale |
11 | 11 | import os |
12 | 12 | import sys |
13 | | -import codecs |
14 | 13 |
|
15 | 14 |
|
16 | 15 | from gitdb.utils.compat import ( |
@@ -91,181 +90,3 @@ def __str__(self): |
91 | 90 | else: # Python 2 |
92 | 91 | def __str__(self): |
93 | 92 | return self.__unicode__().encode(defenc) |
94 | | - |
95 | | - |
96 | | -""" |
97 | | -This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error |
98 | | -handler of Python 3. |
99 | | -Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc |
100 | | -""" |
101 | | - |
102 | | -# This code is released under the Python license and the BSD 2-clause license |
103 | | - |
104 | | - |
105 | | -FS_ERRORS = 'surrogateescape' |
106 | | - |
107 | | -# # -- Python 2/3 compatibility ------------------------------------- |
108 | | -# FS_ERRORS = 'my_surrogateescape' |
109 | | - |
110 | | -def u(text): |
111 | | - if PY3: |
112 | | - return text |
113 | | - return text.decode('unicode_escape') |
114 | | - |
115 | | -def b(data): |
116 | | - if PY3: |
117 | | - return data.encode('latin1') |
118 | | - return data |
119 | | - |
120 | | -def surrogateescape_handler(exc): |
121 | | - """ |
122 | | - Pure Python implementation of the PEP 383: the "surrogateescape" error |
123 | | - handler of Python 3. Undecodable bytes will be replaced by a Unicode |
124 | | - character U+DCxx on decoding, and these are translated into the |
125 | | - original bytes on encoding. |
126 | | - """ |
127 | | - mystring = exc.object[exc.start:exc.end] |
128 | | - |
129 | | - try: |
130 | | - if isinstance(exc, UnicodeDecodeError): |
131 | | - # mystring is a byte-string in this case |
132 | | - decoded = replace_surrogate_decode(mystring) |
133 | | - elif isinstance(exc, UnicodeEncodeError): |
134 | | - # In the case of u'\udcc3'.encode('ascii', |
135 | | - # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an |
136 | | - # exception anyway after this function is called, even though I think |
137 | | - # it's doing what it should. It seems that the strict encoder is called |
138 | | - # to encode the unicode string that this function returns ... |
139 | | - decoded = replace_surrogate_encode(mystring, exc) |
140 | | - else: |
141 | | - raise exc |
142 | | - except NotASurrogateError: |
143 | | - raise exc |
144 | | - return (decoded, exc.end) |
145 | | - |
146 | | - |
147 | | -class NotASurrogateError(Exception): |
148 | | - pass |
149 | | - |
150 | | - |
151 | | -def replace_surrogate_encode(mystring, exc): |
152 | | - """ |
153 | | - Returns a (unicode) string, not the more logical bytes, because the codecs |
154 | | - register_error functionality expects this. |
155 | | - """ |
156 | | - decoded = [] |
157 | | - for ch in mystring: |
158 | | - # if PY3: |
159 | | - # code = ch |
160 | | - # else: |
161 | | - code = ord(ch) |
162 | | - |
163 | | - # The following magic comes from Py3.3's Python/codecs.c file: |
164 | | - if not 0xD800 <= code <= 0xDCFF: |
165 | | - # Not a surrogate. Fail with the original exception. |
166 | | - raise exc |
167 | | - # mybytes = [0xe0 | (code >> 12), |
168 | | - # 0x80 | ((code >> 6) & 0x3f), |
169 | | - # 0x80 | (code & 0x3f)] |
170 | | - # Is this a good idea? |
171 | | - if 0xDC00 <= code <= 0xDC7F: |
172 | | - decoded.append(chr(code - 0xDC00)) |
173 | | - elif code <= 0xDCFF: |
174 | | - decoded.append(chr(code - 0xDC00)) |
175 | | - else: |
176 | | - raise NotASurrogateError |
177 | | - return str().join(decoded) |
178 | | - |
179 | | - |
180 | | -def replace_surrogate_decode(mybytes): |
181 | | - """ |
182 | | - Returns a (unicode) string |
183 | | - """ |
184 | | - decoded = [] |
185 | | - for ch in mybytes: |
186 | | - # We may be parsing newbytes (in which case ch is an int) or a native |
187 | | - # str on Py2 |
188 | | - if isinstance(ch, int): |
189 | | - code = ch |
190 | | - else: |
191 | | - code = ord(ch) |
192 | | - if 0x80 <= code <= 0xFF: |
193 | | - decoded.append(chr(0xDC00 + code)) |
194 | | - elif code <= 0x7F: |
195 | | - decoded.append(chr(code)) |
196 | | - else: |
197 | | - # # It may be a bad byte |
198 | | - # # Try swallowing it. |
199 | | - # continue |
200 | | - # print("RAISE!") |
201 | | - raise NotASurrogateError |
202 | | - return str().join(decoded) |
203 | | - |
204 | | - |
205 | | -def encodefilename(fn): |
206 | | - if FS_ENCODING == 'ascii': |
207 | | - # ASCII encoder of Python 2 expects that the error handler returns a |
208 | | - # Unicode string encodable to ASCII, whereas our surrogateescape error |
209 | | - # handler has to return bytes in 0x80-0xFF range. |
210 | | - encoded = [] |
211 | | - for index, ch in enumerate(fn): |
212 | | - code = ord(ch) |
213 | | - if code < 128: |
214 | | - ch = bytes((code,)) |
215 | | - elif 0xDC80 <= code <= 0xDCFF: |
216 | | - ch = bytes((code - 0xDC00,)) |
217 | | - else: |
218 | | - raise UnicodeEncodeError(FS_ENCODING, |
219 | | - fn, index, index+1, |
220 | | - 'ordinal not in range(128)') |
221 | | - encoded.append(ch) |
222 | | - return bytes().join(encoded) |
223 | | - elif FS_ENCODING == 'utf-8': |
224 | | - # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF |
225 | | - # doesn't go through our error handler |
226 | | - encoded = [] |
227 | | - for index, ch in enumerate(fn): |
228 | | - code = ord(ch) |
229 | | - if 0xD800 <= code <= 0xDFFF: |
230 | | - if 0xDC80 <= code <= 0xDCFF: |
231 | | - ch = bytes((code - 0xDC00,)) |
232 | | - encoded.append(ch) |
233 | | - else: |
234 | | - raise UnicodeEncodeError( |
235 | | - FS_ENCODING, |
236 | | - fn, index, index+1, 'surrogates not allowed') |
237 | | - else: |
238 | | - ch_utf8 = ch.encode('utf-8') |
239 | | - encoded.append(ch_utf8) |
240 | | - return bytes().join(encoded) |
241 | | - return fn.encode(FS_ENCODING, FS_ERRORS) |
242 | | - |
243 | | -def decodefilename(fn): |
244 | | - return fn.decode(FS_ENCODING, FS_ERRORS) |
245 | | - |
246 | | -FS_ENCODING = 'ascii'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]') |
247 | | -# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]') |
248 | | -# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]') |
249 | | - |
250 | | - |
251 | | -# normalize the filesystem encoding name. |
252 | | -# For example, we expect "utf-8", not "UTF8". |
253 | | -FS_ENCODING = codecs.lookup(FS_ENCODING).name |
254 | | - |
255 | | - |
256 | | -def register_surrogateescape(): |
257 | | - """ |
258 | | - Registers the surrogateescape error handler on Python 2 (only) |
259 | | - """ |
260 | | - if PY3: |
261 | | - return |
262 | | - try: |
263 | | - codecs.lookup_error(FS_ERRORS) |
264 | | - except LookupError: |
265 | | - codecs.register_error(FS_ERRORS, surrogateescape_handler) |
266 | | - |
267 | | - |
268 | | -try: |
269 | | - b"100644 \x9f\0aaa".decode(defenc, "surrogateescape") |
270 | | -except Exception: |
271 | | - register_surrogateescape() |
0 commit comments