Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 77b14a6

Browse files
authored
gh-69426: HTMLParser: only unescape properly terminated character entities in attribute values (GH-95215)
According to the HTML5 spec, named character references in attribute values that are not terminated by a semicolon should only be processed if they are not followed by an ASCII alphanumeric or an equals sign. Semicolon-terminated named references and numeric references are always processed. See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
1 parent 3dfed23 commit 77b14a6
Copy full SHA for 77b14a6

File tree

Expand file tree / Collapse file tree

3 files changed

+57
-9
lines changed
Filter options
Expand file tree / Collapse file tree

3 files changed

+57
-9
lines changed

‎Lib/html/parser.py

Copy file name to clipboard | Expand all lines: Lib/html/parser.py
+19 -1 | Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import _markupbase
1313

1414
from html import unescape
15+
from html.entities import html5 as html5_entities
1516

1617

1718
__all__ = ['HTMLParser']
@@ -23,6 +24,7 @@
2324

2425
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
2526
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
27+
attr_charref = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*)[;=]?')
2628

2729
starttagopen = re.compile('<[a-zA-Z]')
2830
piclose = re.compile('>')
@@ -57,6 +59,22 @@
5759
# </ and the tag name, so maybe this should be fixed
5860
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
5961

62+
# Character reference processing logic specific to attribute values
63+
# See: https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
64+
def _replace_attr_charref(match):
65+
ref = match.group(0)
66+
# Numeric / hex char refs must always be unescaped
67+
if ref.startswith('&#'):
68+
return unescape(ref)
69+
# Named character / entity references must only be unescaped
70+
# if they are an exact match, and they are not followed by an equals sign
71+
if not ref.endswith('=') and ref[1:] in html5_entities:
72+
return unescape(ref)
73+
# Otherwise do not unescape
74+
return ref
75+
76+
def _unescape_attrvalue(s):
77+
return attr_charref.sub(_replace_attr_charref, s)
6078

6179

6280
class HTMLParser(_markupbase.ParserBase):
@@ -323,7 +341,7 @@ def parse_starttag(self, i):
323341
attrvalue[:1] == '"' == attrvalue[-1:]:
324342
attrvalue = attrvalue[1:-1]
325343
if attrvalue:
326-
attrvalue = unescape(attrvalue)
344+
attrvalue = _unescape_attrvalue(attrvalue)
327345
attrs.append((attrname.lower(), attrvalue))
328346
k = m.end()
329347

‎Lib/test/test_htmlparser.py

Copy file name to clipboard | Expand all lines: Lib/test/test_htmlparser.py
+35 -8 | Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -348,18 +348,16 @@ def test_convert_charrefs(self):
348348
collector = lambda: EventCollectorCharrefs()
349349
self.assertTrue(collector().convert_charrefs)
350350
charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
351-
# check charrefs in the middle of the text/attributes
352-
expected = [('starttag', 'a', [('href', 'foo"zar')]),
353-
('data', 'a"z'), ('endtag', 'a')]
351+
# check charrefs in the middle of the text
352+
expected = [('starttag', 'a', []), ('data', 'a"z'), ('endtag', 'a')]
354353
for charref in charrefs:
355-
self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
354+
self._run_check('<a>a{0}z</a>'.format(charref),
356355
expected, collector=collector())
357-
# check charrefs at the beginning/end of the text/attributes
358-
expected = [('data', '"'),
359-
('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
356+
# check charrefs at the beginning/end of the text
357+
expected = [('data', '"'), ('starttag', 'a', []),
360358
('data', '"'), ('endtag', 'a'), ('data', '"')]
361359
for charref in charrefs:
362-
self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
360+
self._run_check('{0}<a>'
363361
'{0}</a>{0}'.format(charref),
364362
expected, collector=collector())
365363
# check charrefs in <script>/<style> elements
@@ -382,6 +380,35 @@ def test_convert_charrefs(self):
382380
self._run_check('no charrefs here', [('data', 'no charrefs here')],
383381
collector=collector())
384382

383+
def test_convert_charrefs_in_attribute_values(self):
384+
# default value for convert_charrefs is now True
385+
collector = lambda: EventCollectorCharrefs()
386+
self.assertTrue(collector().convert_charrefs)
387+
388+
# always unescape terminated entity refs, numeric and hex char refs:
389+
# - regardless whether they are at start, middle, end of attribute
390+
# - or followed by alphanumeric, non-alphanumeric, or equals char
391+
charrefs = ['&cent;', '&#xa2;', '&#xa2', '&#162;', '&#162']
392+
expected = [('starttag', 'a',
393+
[('x', '¢'), ('x', 'z¢'), ('x', '¢z'),
394+
('x', 'z¢z'), ('x', '¢ z'), ('x', '¢=z')]),
395+
('endtag', 'a')]
396+
for charref in charrefs:
397+
self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
398+
' x="z{0}z" x="{0} z" x="{0}=z"></a>'
399+
.format(charref), expected, collector=collector())
400+
401+
# only unescape unterminated entity matches if they are not followed by
402+
# an alphanumeric or an equals sign
403+
charref = '&cent'
404+
expected = [('starttag', 'a',
405+
[('x', '¢'), ('x', 'z¢'), ('x', '&centz'),
406+
('x', 'z&centz'), ('x', '¢ z'), ('x', '&cent=z')]),
407+
('endtag', 'a')]
408+
self._run_check('<a x="{0}" x="z{0}" x="{0}z" '
409+
' x="z{0}z" x="{0} z" x="{0}=z"></a>'
410+
.format(charref), expected, collector=collector())
411+
385412
# the remaining tests were for the "tolerant" parser (which is now
386413
# the default), and check various kind of broken markup
387414
def test_tolerant_parsing(self):
+3 -0 | Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Fix :class:`html.parser.HTMLParser` to not unescape named character
2+
references in attribute values if they are not terminated by a semicolon and
3+
are followed by an ASCII alphanumeric or an equals sign.

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.