Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit b51828b

Browse files
committed
Allow for Python implementations that don't support lone surrogates (read: Jython).
This is based on earlier work by Jim Baker (thanks!). The two major parts of this are:

* Avoiding having lone surrogates in any string literals, and
* Avoiding tests that contain lone surrogates.

As part of this, the decoder for double-escaped tokenizer tests is rewritten to avoid the unicode_escape codec, as that has bogus behaviour with non-ASCII characters.
1 parent b293489 commit b51828b
Copy full SHA for b51828b

File tree

5 files changed

+87
-14
lines changed
Filter options

5 files changed

+87
-14
lines changed

‎AUTHORS.rst

Copy file name to clipboardExpand all lines: AUTHORS.rst
+1Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,4 @@ Patches and suggestions
3232
- Juan Carlos Garcia Segovia
3333
- Mike West
3434
- Marc DM
35+
- Jim Baker

‎CHANGES.rst

Copy file name to clipboardExpand all lines: CHANGES.rst
+3-2Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@ Change Log
44
0.9999
55
~~~~~~
66

7-
Released on XXX, 2014
7+
Released on XXX, 2015
88

9-
* XXX
9+
* Add support for Python implementations that don't support lone surrogates
10+
(read: Jython).
1011

1112

1213
0.999

‎html5lib/inputstream.py

Copy file name to clipboardExpand all lines: html5lib/inputstream.py
+26-9Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,18 @@ class BufferedIOBase(object):
2828
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
2929
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
3030

31-
invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
31+
32+
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
33+
34+
if utils.supports_lone_surrogates:
35+
# Use one extra step of indirection and create surrogates with
36+
# eval. Not using this indirection would introduce an illegal
37+
# unicode literal on platforms not supporting such lone
38+
# surrogates.
39+
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
40+
eval('"\\uD800-\\uDFFF"'))
41+
else:
42+
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
3243

3344
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
3445
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@@ -164,13 +175,18 @@ def __init__(self, source):
164175
165176
"""
166177

167-
# Craziness
168-
if len("\U0010FFFF") == 1:
178+
if not utils.supports_lone_surrogates:
179+
# Such platforms will have already checked for such
180+
# surrogate errors, so no need to do this checking.
181+
self.reportCharacterErrors = None
182+
self.replaceCharactersRegexp = None
183+
elif len("\U0010FFFF") == 1:
169184
self.reportCharacterErrors = self.characterErrorsUCS4
170-
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
185+
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
171186
else:
172187
self.reportCharacterErrors = self.characterErrorsUCS2
173-
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
188+
self.replaceCharactersRegexp = re.compile(
189+
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
174190

175191
# List of where new lines occur
176192
self.newLines = [0]
@@ -265,11 +281,12 @@ def readChunk(self, chunkSize=None):
265281
self._bufferedCharacter = data[-1]
266282
data = data[:-1]
267283

268-
self.reportCharacterErrors(data)
284+
if self.reportCharacterErrors:
285+
self.reportCharacterErrors(data)
269286

270-
# Replace invalid characters
271-
# Note U+0000 is dealt with in the tokenizer
272-
data = self.replaceCharactersRegexp.sub("\ufffd", data)
287+
# Replace invalid characters
288+
# Note U+0000 is dealt with in the tokenizer
289+
data = self.replaceCharactersRegexp.sub("\ufffd", data)
273290

274291
data = data.replace("\r\n", "\n")
275292
data = data.replace("\r", "\n")

‎html5lib/tests/test_tokenizer.py

Copy file name to clipboardExpand all lines: html5lib/tests/test_tokenizer.py
+35-2Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
import warnings
55
import re
66

7+
from six import unichr
8+
79
from .support import get_data_files
810

911
from html5lib.tokenizer import HTMLTokenizer
10-
from html5lib import constants
12+
from html5lib import constants, utils
1113

1214

1315
class TokenizerTestParser(object):
@@ -122,9 +124,38 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
122124
return tokens["expected"] == tokens["received"]
123125

124126

127+
_surrogateRe = re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")
128+
129+
125130
def unescape(test):
126131
def decode(inp):
127-
return inp.encode("utf-8").decode("unicode-escape")
132+
"""Decode \\uXXXX escapes
133+
134+
This decodes \\uXXXX escapes, possibly into non-BMP characters when
135+
two surrogate character escapes are adjacent to each other.
136+
"""
137+
# This cannot be implemented using the unicode_escape codec
138+
# because that requires its input be ISO-8859-1, and we need
139+
# arbitrary unicode as input.
140+
def repl(m):
141+
if m.group(2) is not None:
142+
high = int(m.group(1), 16)
143+
low = int(m.group(2), 16)
144+
if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
145+
cp = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x10000
146+
return unichr(cp)
147+
else:
148+
return unichr(high) + unichr(low)
149+
else:
150+
return unichr(int(m.group(1), 16))
151+
try:
152+
return _surrogateRe.sub(repl, inp)
153+
except ValueError:
154+
# This occurs when unichr throws ValueError, which should
155+
# only be for a lone-surrogate.
156+
if utils.supports_lone_surrogates:
157+
raise
158+
return None
128159

129160
test["input"] = decode(test["input"])
130161
for token in test["output"]:
@@ -183,6 +214,8 @@ def testTokenizer():
183214
test["initialStates"] = ["Data state"]
184215
if 'doubleEscaped' in test:
185216
test = unescape(test)
217+
if test["input"] is None:
218+
continue # Not valid input for this platform
186219
for initialState in test["initialStates"]:
187220
test["initialState"] = capitalize(initialState)
188221
yield runTokenizerTest, test

‎html5lib/utils.py

Copy file name to clipboardExpand all lines: html5lib/utils.py
+22-1Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,35 @@
22

33
from types import ModuleType
44

5+
from six import text_type
6+
57
try:
68
import xml.etree.cElementTree as default_etree
79
except ImportError:
810
import xml.etree.ElementTree as default_etree
911

1012

1113
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
12-
"surrogatePairToCodepoint", "moduleFactoryFactory"]
14+
"surrogatePairToCodepoint", "moduleFactoryFactory",
15+
"supports_lone_surrogates"]
16+
17+
18+
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
19+
# caught by the below test. In general this would be any platform
20+
# using UTF-16 as its encoding of unicode strings, such as
21+
# Jython. This is because UTF-16 itself is based on the use of such
22+
# surrogates, and there is no mechanism to further escape such
23+
# escapes.
24+
try:
25+
_x = eval('"\\uD800"')
26+
if not isinstance(_x, text_type):
27+
# We need this with u"" because of http://bugs.jython.org/issue2039
28+
_x = eval('u"\\uD800"')
29+
assert isinstance(_x, text_type)
30+
except:
31+
supports_lone_surrogates = False
32+
else:
33+
supports_lone_surrogates = True
1334

1435

1536
class MethodDispatcher(dict):

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.