Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit ec731f4

Browse filesBrowse files
committed
Merge with gitpython-developers#532, fix unicode filenames with surrogateescape
2 parents b2efa1b + 9e4a454 commit ec731f4
Copy full SHA for ec731f4

7 files changed

+209-18Lines changed: 209 additions & 18 deletions

File tree

Expand file treeCollapse file tree
Open diff view settings
Filter options
Expand file treeCollapse file tree
Open diff view settings
Collapse file

‎VERSION‎

Copy file name to clipboard
+1-1Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.0.9dev0
1+
2.0.10dev0
Collapse file

‎git/compat.py‎

Copy file name to clipboardExpand all lines: git/compat.py
+191-1Lines changed: 191 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
import locale
1111
import os
1212
import sys
13+
import codecs
14+
1315

1416
from gitdb.utils.compat import (
1517
xrange,
@@ -67,7 +69,7 @@ def safe_decode(s):
6769
if isinstance(s, unicode):
6870
return s
6971
elif isinstance(s, bytes):
70-
return s.decode(defenc, 'replace')
72+
return s.decode(defenc, 'surrogateescape')
7173
elif s is not None:
7274
raise TypeError('Expected bytes or text, but got %r' % (s,))
7375

@@ -121,3 +123,191 @@ def __str__(self):
121123
else: # Python 2
122124
def __str__(self):
123125
return self.__unicode__().encode(defenc)
126+
127+
128+
"""
129+
This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
130+
handler of Python 3.
131+
Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
132+
"""
133+
134+
# This code is released under the Python license and the BSD 2-clause license
135+
136+
137+
# Name of the codec error handler passed to encode()/decode() for filenames
# and used as the key when the pure-Python handler is registered.
FS_ERRORS = 'surrogateescape'
138+
139+
# # -- Python 2/3 compatibility -------------------------------------
140+
# FS_ERRORS = 'my_surrogateescape'
141+
142+
def u(text):
    """
    Return ``text`` unchanged when PY3 is set, otherwise the result of decoding
    it with the ``unicode_escape`` codec.
    """
    return text if PY3 else text.decode('unicode_escape')
147+
148+
def b(data):
    """
    Return ``data`` encoded as ``latin1`` when PY3 is set, otherwise return it
    unchanged.
    """
    return data.encode('latin1') if PY3 else data
153+
154+
# Pick the constructors for a one-character text string (_unichr) and a
# one-byte byte string (bytes_chr) that match the running interpreter.
if not PY3:
    _unichr = unichr
    bytes_chr = chr
else:
    _unichr = chr
    bytes_chr = lambda code: bytes((code,))
160+
161+
def surrogateescape_handler(exc):
    """
    Pure Python implementation of PEP 383: the "surrogateescape" error
    handler of Python 3. Undecodable bytes are replaced by a Unicode
    character U+DCxx on decoding, and these are translated back into the
    original bytes on encoding.

    ``exc`` is the codec exception being handled. The return value is the
    tuple ``(replacement, exc.end)``. ``exc`` itself is re-raised when it is
    neither a UnicodeDecodeError nor a UnicodeEncodeError, or when the
    offending slice cannot be translated.
    """
    offending = exc.object[exc.start:exc.end]

    if isinstance(exc, UnicodeDecodeError):
        # The offending slice is a byte string in this case.
        translate = replace_surrogate_decode
    elif isinstance(exc, UnicodeEncodeError):
        # In the case of u'\udcc3'.encode('ascii', <this handler>), both
        # Python 2.x and 3.x raise an exception anyway after this function is
        # called: it seems the strict encoder is then applied to the unicode
        # string returned here.
        translate = replace_surrogate_encode
    else:
        raise exc

    try:
        replacement = translate(offending)
    except NotASurrogateError:
        raise exc
    return (replacement, exc.end)
186+
187+
188+
class NotASurrogateError(Exception):
    """Raised by the replace_surrogate_* helpers for input they cannot translate."""
190+
191+
192+
def replace_surrogate_encode(mystring):
    """
    Translate escaped characters back to the code points they stand for.

    Every character of ``mystring`` must lie in the range U+DC00-U+DCFF and is
    mapped to the character with code point ``code - 0xDC00``.

    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    Raises NotASurrogateError if a character lies outside U+DC00-U+DCFF.
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)
        # Only U+DC00-U+DCFF can be mapped back. Everything else, including
        # the high surrogates U+D800-U+DBFF (for which ``code - 0xDC00`` would
        # be negative), is reported with NotASurrogateError so that
        # surrogateescape_handler can re-raise the original codec exception.
        if not 0xDC00 <= code <= 0xDCFF:
            raise NotASurrogateError
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)
219+
220+
221+
def replace_surrogate_decode(mybytes):
    """
    Returns a (unicode) string in which every byte 0x80-0xFF of ``mybytes`` is
    replaced by the character U+DC80-U+DCFF and smaller values are kept as the
    character with the same code point.

    Raises NotASurrogateError for a value above 0xFF.
    """
    pieces = []
    for item in mybytes:
        # Iterating bytes yields ints on Python 3 but one-character native
        # strings on Python 2.
        value = item if isinstance(item, int) else ord(item)
        if value > 0xFF:
            raise NotASurrogateError
        if value >= 0x80:
            pieces.append(_unichr(0xDC00 + value))
        else:
            pieces.append(_unichr(value))
    return str().join(pieces)
244+
245+
246+
def encodefilename(fn):
    """
    Encode the text ``fn`` to bytes using FS_ENCODING, turning the escape
    characters U+DC80-U+DCFF back into the single bytes 0x80-0xFF.

    The 'ascii' and 'utf-8' encodings are handled by hand; any other encoding
    is delegated to ``fn.encode(FS_ENCODING, FS_ERRORS)``.

    Raises UnicodeEncodeError for a character that cannot be represented.
    """
    if FS_ENCODING not in ('ascii', 'utf-8'):
        return fn.encode(FS_ENCODING, FS_ERRORS)

    # ASCII encoder of Python 2 expects that the error handler returns a
    # Unicode string encodable to ASCII, whereas our surrogateescape error
    # handler has to return bytes in 0x80-0xFF range.
    # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
    # doesn't go through our error handler.
    ascii_only = FS_ENCODING == 'ascii'
    chunks = []
    for pos, char in enumerate(fn):
        point = ord(char)
        if 0xDC80 <= point <= 0xDCFF:
            # Escaped byte: restore the original value.
            chunks.append(bytes_chr(point - 0xDC00))
        elif ascii_only:
            if point >= 128:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, pos, pos + 1,
                                         'ordinal not in range(128)')
            chunks.append(bytes_chr(point))
        elif 0xD800 <= point <= 0xDFFF:
            raise UnicodeEncodeError(
                FS_ENCODING,
                fn, pos, pos + 1, 'surrogates not allowed')
        else:
            chunks.append(char.encode('utf-8'))
    return bytes().join(chunks)
284+
285+
def decodefilename(fn):
    """Decode the byte string ``fn`` with FS_ENCODING and the FS_ERRORS handler."""
    decoded = fn.decode(FS_ENCODING, FS_ERRORS)
    return decoded
287+
288+
# Encoding used by encodefilename()/decodefilename().
FS_ENCODING = 'ascii'
# NOTE(review): fn/encoded look like sample data (raw bytes and their escaped
# text form) carried over from the original script - confirm they are needed.
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# Disabled alternatives:
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# Normalize the filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name
296+
297+
298+
def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only).

    Nothing happens when PY3 is set or when a handler is already known under
    the name FS_ERRORS.
    """
    if not PY3:
        try:
            codecs.lookup_error(FS_ERRORS)
        except LookupError:
            # No handler is known under this name yet, so install ours.
            codecs.register_error(FS_ERRORS, surrogateescape_handler)
308+
309+
310+
# Probe for a usable "surrogateescape" error handler by decoding a byte string
# that contains the byte 0x9f. An interpreter without that handler reports the
# unknown handler name with LookupError, in which case the pure-Python
# implementation is registered. Only LookupError is caught so that
# KeyboardInterrupt, SystemExit and unrelated errors are not swallowed.
try:
    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
except LookupError:
    register_surrogateescape()
Collapse file

‎git/objects/fun.py‎

Copy file name to clipboardExpand all lines: git/objects/fun.py
+2-5Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from stat import S_ISDIR
33
from git.compat import (
44
byte_ord,
5+
safe_decode,
56
defenc,
67
xrange,
78
text_type,
@@ -76,11 +77,7 @@ def tree_entries_from_data(data):
7677
# default encoding for strings in git is utf8
7778
# Only use the respective unicode object if the byte stream was encoded
7879
name = data[ns:i]
79-
try:
80-
name = name.decode(defenc)
81-
except UnicodeDecodeError:
82-
pass
83-
# END handle encoding
80+
name = safe_decode(name)
8481

8582
# byte is NULL, get next 20
8683
i += 1
Collapse file

‎git/test/performance/test_commit.py‎

Copy file name to clipboardExpand all lines: git/test/performance/test_commit.py
+1-1Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def test_iteration(self):
5252
# END for each object
5353
# END for each commit
5454
elapsed_time = time() - st
55-
print("Traversed %i Trees and a total of %i unchached objects in %s [s] ( %f objs/s )"
55+
print("Traversed %i Trees and a total of %i uncached objects in %s [s] ( %f objs/s )"
5656
% (nc, no, elapsed_time, no / elapsed_time), file=sys.stderr)
5757

5858
def test_commit_traversal(self):
Collapse file

‎git/test/test_fun.py‎

Copy file name to clipboardExpand all lines: git/test/test_fun.py
+11-7Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
from io import BytesIO
2-
from stat import (
3-
S_IFDIR,
4-
S_IFREG,
5-
S_IFLNK
6-
)
2+
from stat import S_IFDIR, S_IFREG, S_IFLNK
3+
from unittest.case import skipIf
74

5+
from git.compat import PY3
86
from git.index import IndexFile
97
from git.index.fun import (
108
aggressive_tree_merge
@@ -253,6 +251,12 @@ def test_tree_traversal_single(self):
253251
assert entries
254252
# END for each commit
255253

256-
def test_tree_entries_from_data_with_failing_name_decode(self):
254+
@skipIf(PY3, 'odd types returned ... maybe figure it out one day')
255+
def test_tree_entries_from_data_with_failing_name_decode_py2(self):
256+
r = tree_entries_from_data(b'100644 \x9f\0aaa')
257+
assert r == [('aaa', 33188, u'\udc9f')], r
258+
259+
@skipIf(not PY3, 'odd types returned ... maybe figure it out one day')
260+
def test_tree_entries_from_data_with_failing_name_decode_py3(self):
257261
r = tree_entries_from_data(b'100644 \x9f\0aaa')
258-
assert r == [(b'aaa', 33188, b'\x9f')], r
262+
assert r == [(b'aaa', 33188, '\udc9f')], r
Collapse file

‎setup.py‎

Copy file name to clipboardExpand all lines: setup.py
+2-2Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def _stamp_version(filename):
6464
else:
6565
print("WARNING: Couldn't find version line in file %s" % filename, file=sys.stderr)
6666

67-
install_requires = ['gitdb >= 0.6.4']
67+
install_requires = ['gitdb2 >= 2.0.0']
6868
extras_require = {
6969
':python_version == "2.6"': ['ordereddict'],
7070
}
@@ -100,7 +100,7 @@ def _stamp_version(filename):
100100
package_data={'git.test': ['fixtures/*']},
101101
package_dir={'git': 'git'},
102102
license="BSD License",
103-
requires=['gitdb (>=0.6.4)'],
103+
requires=['gitdb2 (>=2.0.0)'],
104104
install_requires=install_requires,
105105
test_requirements=test_requires + install_requires,
106106
zip_safe=False,

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.