1010import locale
1111import os
1212import sys
13+ import codecs
14+
1315
1416from gitdb .utils .compat import (
1517 xrange ,
@@ -67,7 +69,7 @@ def safe_decode(s):
6769 if isinstance (s , unicode ):
6870 return s
6971 elif isinstance (s , bytes ):
70- return s .decode (defenc , 'replace ' )
72+ return s .decode (defenc , 'surrogateescape ' )
7173 elif s is not None :
7274 raise TypeError ('Expected bytes or text, but got %r' % (s ,))
7375
@@ -121,3 +123,191 @@ def __str__(self):
121123 else : # Python 2
122124 def __str__ (self ):
123125 return self .__unicode__ ().encode (defenc )
126+
127+
128+ """
129+ This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
130+ handler of Python 3.
131+ Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
132+ """
133+
134+ # This code is released under the Python license and the BSD 2-clause license
135+
136+
# Name of the codec error handler passed to encode()/decode() by
# encodefilename() and decodefilename(), and the name under which
# register_surrogateescape() registers surrogateescape_handler().
FS_ERRORS = 'surrogateescape'

# # -- Python 2/3 compatibility -------------------------------------
# FS_ERRORS = 'my_surrogateescape'
141+
def u(text):
    """
    Return ``text`` as a native text string.

    On Python 3 the argument is handed back untouched.  Otherwise it is
    decoded with the 'unicode_escape' codec, so escape sequences written in
    a plain string literal are turned into the code points they denote.
    """
    return text if PY3 else text.decode('unicode_escape')
147+
def b(data):
    """
    Return ``data`` as a native byte string.

    When PY3 is false the argument is returned as-is; otherwise it is
    encoded with 'latin1', which maps each code point below 256 to the byte
    of the same value.
    """
    if not PY3:
        return data
    return data.encode('latin1')
153+
# Version-specific constructors used by the helpers below:
#   _unichr(code)   -> one-character text string for a code point
#   bytes_chr(code) -> one-byte byte string for an integer
if not PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr
    bytes_chr = lambda code: bytes((code,))
160+
def surrogateescape_handler(exc):
    """
    Codec error handler implementing PEP 383 ("surrogateescape") in pure
    Python.

    When decoding, bytes that cannot be decoded are turned into U+DCxx
    characters; when encoding, those characters are translated back.

    :param exc: the UnicodeDecodeError or UnicodeEncodeError handed over by
        the codec machinery.
    :return: a ``(replacement, resume_position)`` tuple, where
        ``resume_position`` is ``exc.end``.
    :raise: ``exc`` itself when it is neither of the two Unicode error types
        or when the offending slice cannot be translated.
    """
    offending = exc.object[exc.start:exc.end]

    # Pick the translator matching the direction of the failed operation.
    if isinstance(exc, UnicodeDecodeError):
        # The offending slice is a byte string in this case.
        translate = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # Note: for u'\udcc3'.encode('ascii', <this handler>) both Python 2.x
        # and 3.x still raise after this handler returns; it seems the strict
        # encoder is run over the unicode string returned from here.
        translate = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = translate(offending)
    except NotASurrogateError:
        # Untranslatable input: report the codec's original error.
        raise exc
    return (replacement, exc.end)
186+
187+
class NotASurrogateError(Exception):
    """
    Raised by replace_surrogate_encode() / replace_surrogate_decode() when a
    code unit cannot be translated.  surrogateescape_handler() catches it and
    re-raises the original codec exception in its place.
    """
    pass
190+
191+
def replace_surrogate_encode(mystring):
    """
    Translate escaped surrogates back to the code points of their bytes.

    Returns a (unicode) string, not the more logical bytes, because the
    codecs register_error functionality expects this.

    :param mystring: the text slice the encoder could not handle.
    :return: a string in which every U+DCxx character has been replaced by
        the character with code ``xx``.
    :raise NotASurrogateError: if any character lies outside U+DC00-U+DCFF.
        surrogateescape_handler() turns this into the original codec error.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)
        # Only U+DC00-U+DCFF can be mapped back.  Anything else, including
        # the high surrogates U+D800-U+DBFF (which would produce a negative
        # code point below), is reported as "not a surrogate" instead of
        # failing with an unrelated NameError/ValueError.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
219+
220+
def replace_surrogate_decode(mybytes):
    """
    Map undecodable bytes to text, returning a (unicode) string.

    Values up to 0x7F are kept as the character with the same code; values
    0x80-0xFF become the characters U+DC80-U+DCFF.

    :param mybytes: the byte slice the decoder could not handle.
    :raise NotASurrogateError: if an item has a value above 0xFF.
    """
    chars = []
    for item in mybytes:
        # Iterating bytes yields ints on Python 3 but one-character native
        # strings on Python 2.
        value = item if isinstance(item, int) else ord(item)
        if value <= 0x7F:
            chars.append(_unichr(value))
        elif value <= 0xFF:
            chars.append(_unichr(0xDC00 + value))
        else:
            raise NotASurrogateError
    return str().join(chars)
244+
245+
def encodefilename(fn):
    """
    Encode the text filename ``fn`` to bytes using FS_ENCODING.

    Characters U+DC80-U+DCFF are written out as the single bytes 0x80-0xFF.
    The 'ascii' and 'utf-8' encodings are handled by hand; every other
    encoding goes through ``fn.encode(FS_ENCODING, FS_ERRORS)``.

    :raise UnicodeEncodeError: for a character that the hand-written 'ascii'
        or 'utf-8' paths cannot represent.
    """
    if FS_ENCODING == 'ascii':
        # ASCII encoder of Python 2 expects that the error handler returns a
        # Unicode string encodable to ASCII, whereas our surrogateescape error
        # handler has to return bytes in 0x80-0xFF range.
        chunks = []
        for position, char in enumerate(fn):
            codepoint = ord(char)
            if 0xDC80 <= codepoint <= 0xDCFF:
                codepoint -= 0xDC00
            elif codepoint >= 128:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, position, position + 1,
                                         'ordinal not in range(128)')
            chunks.append(bytes_chr(codepoint))
        return bytes().join(chunks)

    if FS_ENCODING == 'utf-8':
        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
        # doesn't go through our error handler
        chunks = []
        for position, char in enumerate(fn):
            codepoint = ord(char)
            if not 0xD800 <= codepoint <= 0xDFFF:
                chunks.append(char.encode('utf-8'))
            elif 0xDC80 <= codepoint <= 0xDCFF:
                chunks.append(bytes_chr(codepoint - 0xDC00))
            else:
                raise UnicodeEncodeError(
                    FS_ENCODING,
                    fn, position, position + 1, 'surrogates not allowed')
        return bytes().join(chunks)

    return fn.encode(FS_ENCODING, FS_ERRORS)
284+
def decodefilename(fn):
    """
    Decode the byte filename ``fn`` to text with the FS_ENCODING codec and
    the FS_ERRORS error handler.
    """
    codec_name, error_handler = FS_ENCODING, FS_ERRORS
    return fn.decode(codec_name, error_handler)
287+
# Filesystem encoding used by encodefilename() / decodefilename(), together
# with a sample byte string and its text counterpart.
FS_ENCODING = 'ascii'
fn = b('[abc\xff ]')
encoded = u('[abc\udcff ]')
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# normalize the filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
296+
297+
def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only)

    Nothing happens when PY3 is true, or when a handler is already
    registered under the FS_ERRORS name.
    """
    if not PY3:
        try:
            # Succeeds only if a handler with this name already exists.
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
308+
309+
# Probe whether decoding with the "surrogateescape" error handler works in
# this interpreter; if the attempt fails, fall back to registering the
# pure-Python handler defined above.
try:
    b"100644 \x9f \0 aaa".decode(defenc, "surrogateescape")
except Exception:
    # The expected failure is a LookupError for an unknown error handler
    # name.  Catching Exception keeps the probe best-effort without also
    # swallowing KeyboardInterrupt / SystemExit as a bare ``except:`` would.
    register_surrogateescape()
0 commit comments