Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit fc9f63b

Browse files
committed
Fix #120: introduce keyword arguments for encodings by source
1 parent 6464fc4 commit fc9f63b
Copy full SHA for fc9f63b

File tree

7 files changed

+133
-65
lines changed
Filter options

7 files changed

+133
-65
lines changed

‎CHANGES.rst

Copy file name to clipboardExpand all lines: CHANGES.rst
+4Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ Released on XXX
4646

4747
* **Drop support of charade, now that chardet is supported once more.**
4848

49+
* **Replace the charset keyword argument on parse and related methods
50+
with a set of keyword arguments: override_encoding, transport_encoding,
51+
same_origin_parent_encoding, likely_encoding, and default_encoding.**
52+
4953

5054
0.9999999/1.0b8
5155
~~~~~~~~~~~~~~~

‎README.rst

Copy file name to clipboardExpand all lines: README.rst
+2-2Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ pass into html5lib as follows:
5151
import html5lib
5252
5353
with closing(urlopen("http://example.com/")) as f:
54-
document = html5lib.parse(f, encoding=f.info().getparam("charset"))
54+
document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
5555
5656
When using with ``urllib.request`` (Python 3), the charset from HTTP
5757
should be passed into html5lib as follows:
@@ -62,7 +62,7 @@ should be pass into html5lib as follows:
6262
import html5lib
6363
6464
with urlopen("http://example.com/") as f:
65-
document = html5lib.parse(f, encoding=f.info().get_content_charset())
65+
document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
6666
6767
To have more control over the parser, create a parser object explicitly.
6868
For instance, to make the parser raise exceptions on parse errors, use:

‎html5lib/html5parser.py

Copy file name to clipboardExpand all lines: html5lib/html5parser.py
+10-20Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,17 @@
2828
)
2929

3030

31-
def parse(doc, treebuilder="etree", encoding=None,
32-
namespaceHTMLElements=True, scripting=False):
31+
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
3332
"""Parse a string or file-like object into a tree"""
3433
tb = treebuilders.getTreeBuilder(treebuilder)
3534
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
36-
return p.parse(doc, encoding=encoding, scripting=scripting)
35+
return p.parse(doc, **kwargs)
3736

3837

39-
def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
40-
namespaceHTMLElements=True, scripting=False):
38+
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
4139
tb = treebuilders.getTreeBuilder(treebuilder)
4240
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
43-
return p.parseFragment(doc, container=container, encoding=encoding, scripting=scripting)
41+
return p.parseFragment(doc, container=container, **kwargs)
4442

4543

4644
def method_decorator_metaclass(function):
@@ -79,15 +77,12 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa
7977
self.phases = dict([(name, cls(self, self.tree)) for name, cls in
8078
getPhases(debug).items()])
8179

82-
def _parse(self, stream, innerHTML=False, container="div", encoding=None,
83-
useChardet=True, scripting=False, **kwargs):
80+
def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
8481

8582
self.innerHTMLMode = innerHTML
8683
self.container = container
8784
self.scripting = scripting
88-
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding=encoding,
89-
useChardet=useChardet,
90-
parser=self, **kwargs)
85+
self.tokenizer = tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
9186
self.reset()
9287

9388
try:
@@ -225,8 +220,7 @@ def normalizedTokens(self):
225220
for token in self.tokenizer:
226221
yield self.normalizeToken(token)
227222

228-
def parse(self, stream, encoding=None,
229-
useChardet=True, scripting=False):
223+
def parse(self, stream, *args, **kwargs):
230224
"""Parse a HTML document into a well-formed tree
231225
232226
stream - a filelike object or string containing the HTML to be parsed
@@ -238,13 +232,10 @@ def parse(self, stream, encoding=None,
238232
239233
scripting - treat noscript elements as if javascript was turned on
240234
"""
241-
self._parse(stream, innerHTML=False, encoding=encoding,
242-
useChardet=useChardet, scripting=scripting)
235+
self._parse(stream, False, None, *args, **kwargs)
243236
return self.tree.getDocument()
244237

245-
def parseFragment(self, stream, container="div", encoding=None,
246-
useChardet=True, scripting=False):
247-
# pylint:disable=unused-argument
238+
def parseFragment(self, stream, *args, **kwargs):
248239
"""Parse a HTML fragment into a well-formed tree fragment
249240
250241
container - name of the element we're setting the innerHTML property
@@ -259,8 +250,7 @@ def parseFragment(self, stream, container="div", encoding=None,
259250
260251
scripting - treat noscript elements as if javascript was turned on
261252
"""
262-
self._parse(stream, True, container=container,
263-
encoding=encoding, scripting=scripting)
253+
self._parse(stream, True, *args, **kwargs)
264254
return self.tree.getFragment()
265255

266256
def parseError(self, errorcode="XXX-undefined-error", datavars=None):

‎html5lib/inputstream.py

Copy file name to clipboardExpand all lines: html5lib/inputstream.py
+62-36Lines changed: 62 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128128
return b"".join(rv)
129129

130130

131-
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
131+
def HTMLInputStream(source, **kwargs):
132132
# Work around Python bug #20007: read(0) closes the connection.
133133
# http://bugs.python.org/issue20007
134134
if (isinstance(source, http_client.HTTPResponse) or
@@ -142,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142142
isUnicode = isinstance(source, text_type)
143143

144144
if isUnicode:
145-
if encoding is not None:
146-
raise TypeError("Cannot explicitly set an encoding with a unicode string")
145+
encodings = [x for x in kwargs if x.endswith("_encoding")]
146+
if encodings:
147+
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
147148

148-
return HTMLUnicodeInputStream(source)
149+
return HTMLUnicodeInputStream(source, **kwargs)
149150
else:
150-
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
151+
return HTMLBinaryInputStream(source, **kwargs)
151152

152153

153154
class HTMLUnicodeInputStream(object):
@@ -173,8 +174,6 @@ def __init__(self, source):
173174
regardless of any BOM or later declaration (such as in a meta
174175
element)
175176
176-
parseMeta - Look for a <meta> element containing encoding information
177-
178177
"""
179178

180179
if not utils.supports_lone_surrogates:
@@ -390,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390389
391390
"""
392391

393-
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
392+
def __init__(self, source, override_encoding=None, transport_encoding=None,
393+
same_origin_parent_encoding=None, likely_encoding=None,
394+
default_encoding="windows-1252", useChardet=True):
394395
"""Initialises the HTMLInputStream.
395396
396397
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +404,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403404
regardless of any BOM or later declaration (such as in a meta
404405
element)
405406
406-
parseMeta - Look for a <meta> element containing encoding information
407-
408407
"""
409408
# Raw Stream - for unicode objects this will encode to utf-8 and set
410409
# self.charEncoding as appropriate
411410
self.rawStream = self.openStream(source)
412411

413412
HTMLUnicodeInputStream.__init__(self, self.rawStream)
414413

415-
self.charEncoding = (lookupEncoding(encoding), "certain")
416-
417414
# Encoding Information
418415
# Number of bytes to use when looking for a meta element with
419416
# encoding information
420417
self.numBytesMeta = 1024
421418
# Number of bytes to use when detecting encoding using chardet
422419
self.numBytesChardet = 100
423-
# Encoding to use if no other information can be found
424-
self.defaultEncoding = "windows-1252"
420+
# Things from args
421+
self.override_encoding = override_encoding
422+
self.transport_encoding = transport_encoding
423+
self.same_origin_parent_encoding = same_origin_parent_encoding
424+
self.likely_encoding = likely_encoding
425+
self.default_encoding = default_encoding
425426

426-
# Detect encoding iff no explicit "transport level" encoding is supplied
427-
if (self.charEncoding[0] is None):
428-
self.charEncoding = self.detectEncoding(parseMeta, chardet)
429-
assert self.charEncoding[0] is not None
427+
# Determine encoding
428+
self.charEncoding = self.determineEncoding(useChardet)
429+
assert self.charEncoding[0] is not None
430430

431431
# Call superclass
432432
self.reset()
@@ -454,21 +454,45 @@ def openStream(self, source):
454454

455455
return stream
456456

457-
def detectEncoding(self, parseMeta=True, chardet=True):
458-
# First look for a BOM
457+
def determineEncoding(self, chardet=True):
458+
# BOMs take precedence over everything
459459
# This will also read past the BOM if present
460-
encoding = self.detectBOM()
461-
confidence = "certain"
462-
# If there is no BOM need to look for meta elements with encoding
463-
# information
464-
if encoding is None and parseMeta:
465-
encoding = self.detectEncodingMeta()
466-
confidence = "tentative"
460+
charEncoding = self.detectBOM(), "certain"
461+
if charEncoding[0] is not None:
462+
return charEncoding
463+
464+
# If we've been overridden, we've been overridden
465+
charEncoding = lookupEncoding(self.override_encoding), "certain"
466+
if charEncoding[0] is not None:
467+
return charEncoding
468+
469+
# Now check the transport layer
470+
charEncoding = lookupEncoding(self.transport_encoding), "certain"
471+
if charEncoding[0] is not None:
472+
return charEncoding
473+
474+
# Look for meta elements with encoding information
475+
charEncoding = self.detectEncodingMeta(), "tentative"
476+
if charEncoding[0] is not None:
477+
return charEncoding
478+
479+
# Parent document encoding
480+
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
481+
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
482+
return charEncoding
483+
484+
# "likely" encoding
485+
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
486+
if charEncoding[0] is not None:
487+
return charEncoding
488+
467489
# Guess with chardet, if available
468-
if encoding is None and chardet:
469-
confidence = "tentative"
490+
if chardet:
470491
try:
471492
from chardet.universaldetector import UniversalDetector
493+
except ImportError:
494+
pass
495+
else:
472496
buffers = []
473497
detector = UniversalDetector()
474498
while not detector.done:
@@ -481,14 +505,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481505
detector.close()
482506
encoding = lookupEncoding(detector.result['encoding'])
483507
self.rawStream.seek(0)
484-
except ImportError:
485-
pass
486-
# If all else fails use the default encoding
487-
if encoding is None:
488-
confidence = "tentative"
489-
encoding = lookupEncoding(self.defaultEncoding)
508+
if encoding is not None:
509+
return encoding, "tentative"
510+
511+
# Try the default encoding
512+
charEncoding = lookupEncoding(self.default_encoding), "tentative"
513+
if charEncoding[0] is not None:
514+
return charEncoding
490515

491-
return encoding, confidence
516+
# Fallback to html5lib's default if even that hasn't worked
517+
return lookupEncoding("windows-1252"), "tentative"
492518

493519
def changeEncoding(self, newEncoding):
494520
assert self.charEncoding[1] != "certain"

‎html5lib/tests/test_encoding.py

Copy file name to clipboardExpand all lines: html5lib/tests/test_encoding.py
+51-3Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import os
44

5+
import pytest
6+
57
from .support import get_data_files, test_dir, errorMessage, TestData as _TestData
68
from html5lib import HTMLParser, inputstream
79

@@ -11,7 +13,7 @@ def test_basic_prescan_length():
1113
pad = 1024 - len(data) + 1
1214
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
1315
assert len(data) == 1024 # Sanity
14-
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
16+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
1517
assert 'utf-8' == stream.charEncoding[0].name
1618

1719

@@ -20,14 +22,59 @@ def test_parser_reparse():
2022
pad = 10240 - len(data) + 1
2123
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
2224
assert len(data) == 10240 # Sanity
23-
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
25+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
2426
assert 'windows-1252' == stream.charEncoding[0].name
2527
p = HTMLParser(namespaceHTMLElements=False)
2628
doc = p.parse(data, useChardet=False)
2729
assert 'utf-8' == p.documentEncoding
2830
assert doc.find(".//title").text == "Caf\u00E9"
2931

3032

33+
@pytest.mark.parametrize("expected,data,kwargs", [
34+
("utf-16le", b"\xFF\xFE", {"override_encoding": "iso-8859-2"}),
35+
("utf-16be", b"\xFE\xFF", {"override_encoding": "iso-8859-2"}),
36+
("utf-8", b"\xEF\xBB\xBF", {"override_encoding": "iso-8859-2"}),
37+
("iso-8859-2", b"", {"override_encoding": "iso-8859-2", "transport_encoding": "iso-8859-3"}),
38+
("iso-8859-2", b"<meta charset=iso-8859-3>", {"transport_encoding": "iso-8859-2"}),
39+
("iso-8859-2", b"<meta charset=iso-8859-2>", {"same_origin_parent_encoding": "iso-8859-3"}),
40+
("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-2", "likely_encoding": "iso-8859-3"}),
41+
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16", "likely_encoding": "iso-8859-2"}),
42+
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16be", "likely_encoding": "iso-8859-2"}),
43+
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16le", "likely_encoding": "iso-8859-2"}),
44+
("iso-8859-2", b"", {"likely_encoding": "iso-8859-2", "default_encoding": "iso-8859-3"}),
45+
("iso-8859-2", b"", {"default_encoding": "iso-8859-2"}),
46+
("windows-1252", b"", {"default_encoding": "totally-bogus-string"}),
47+
("windows-1252", b"", {}),
48+
])
49+
def test_parser_args(expected, data, kwargs):
50+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
51+
assert expected == stream.charEncoding[0].name
52+
p = HTMLParser()
53+
p.parse(data, useChardet=False, **kwargs)
54+
assert expected == p.documentEncoding
55+
56+
57+
@pytest.mark.parametrize("kwargs", [
58+
{"override_encoding": "iso-8859-2"},
59+
{"override_encoding": None},
60+
{"transport_encoding": "iso-8859-2"},
61+
{"transport_encoding": None},
62+
{"same_origin_parent_encoding": "iso-8859-2"},
63+
{"same_origin_parent_encoding": None},
64+
{"likely_encoding": "iso-8859-2"},
65+
{"likely_encoding": None},
66+
{"default_encoding": "iso-8859-2"},
67+
{"default_encoding": None},
68+
{"foo_encoding": "iso-8859-2"},
69+
{"foo_encoding": None},
70+
])
71+
def test_parser_args_raises(kwargs):
72+
with pytest.raises(TypeError) as exc_info:
73+
p = HTMLParser()
74+
p.parse("", useChardet=False, **kwargs)
75+
assert exc_info.value.args[0].startswith("Cannot set an encoding with a unicode input")
76+
77+
3178
def runParserEncodingTest(data, encoding):
3279
p = HTMLParser()
3380
assert p.documentEncoding is None
@@ -38,7 +85,7 @@ def runParserEncodingTest(data, encoding):
3885

3986

4087
def runPreScanEncodingTest(data, encoding):
41-
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
88+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
4289
encoding = encoding.lower().decode("ascii")
4390

4491
# Very crude way to ignore irrelevant tests
@@ -55,6 +102,7 @@ def test_encoding():
55102
yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
56103
yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
57104

105+
58106
# pylint:disable=wrong-import-position
59107
try:
60108
import chardet # noqa

‎html5lib/tests/test_stream.py

Copy file name to clipboardExpand all lines: html5lib/tests/test_stream.py
+2-2Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,13 +99,13 @@ class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream):
9999

100100

101101
def test_char_ascii():
102-
stream = HTMLInputStream(b"'", encoding='ascii')
102+
stream = HTMLInputStream(b"'", override_encoding='ascii')
103103
assert stream.charEncoding[0].name == 'windows-1252'
104104
assert stream.char() == "'"
105105

106106

107107
def test_char_utf8():
108-
stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8')
108+
stream = HTMLInputStream('\u2018'.encode('utf-8'), override_encoding='utf-8')
109109
assert stream.charEncoding[0].name == 'utf-8'
110110
assert stream.char() == '\u2018'
111111

‎html5lib/tokenizer.py

Copy file name to clipboardExpand all lines: html5lib/tokenizer.py
+2-2Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ class HTMLTokenizer(object):
3131
Points to HTMLInputStream object.
3232
"""
3333

34-
def __init__(self, stream, encoding=None, useChardet=True, parser=None):
34+
def __init__(self, stream, parser=None, **kwargs):
3535

36-
self.stream = HTMLInputStream(stream, encoding, True, useChardet)
36+
self.stream = HTMLInputStream(stream, **kwargs)
3737
self.parser = parser
3838

3939
# Setup the initial tokenizer state

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.