Commit f644865

Drew Hubl authored and gsnedders committed

Allow the data URI scheme, a whitelist for content types, and update tests to correctly check URIs

1 parent b51828b commit f644865
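In practice, the change means that data: URIs are no longer stripped outright; they survive sanitization only when their content type is on the new whitelist. A minimal sketch of that behaviour, assuming the html5lib API of this era in which HTMLParser accepts a tokenizer argument (the HTML snippets are illustrative examples, not taken from the test suite):

import html5lib
from html5lib import sanitizer

parser = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)

# data: URI with a whitelisted content type (image/png): the src
# attribute is expected to survive sanitization.
allowed = '<img src="data:image/png;base64,aGVsbG8gd29ybGQ=">'

# data: URI with a content type outside the whitelist (text/html):
# the src attribute is expected to be removed.
blocked = '<img src="data:text/html;base64,PHNjcmlwdD4=">'

for fragment in (allowed, blocked):
    print(parser.parseFragment(fragment))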

3 files changed, 49 additions and 12 deletions

AUTHORS.rst (2 additions, 0 deletions)
@@ -32,4 +32,6 @@ Patches and suggestions
 - Juan Carlos Garcia Segovia
 - Mike West
 - Marc DM
+- Drew Hubl
+- Austin Kumbera
 - Jim Baker

html5lib/sanitizer.py (30 additions, 5 deletions)
@@ -2,11 +2,26 @@
 
 import re
 from xml.sax.saxutils import escape, unescape
+from six.moves import urllib_parse as urlparse
 
 from .tokenizer import HTMLTokenizer
 from .constants import tokenTypes
 
 
+content_type_rgx = re.compile(r'''
+                               ^
+                               # Match a content type <application>/<type>
+                               (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
+                               # Match any character set and encoding
+                               (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
+                                 |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
+                               # Assume the rest is data
+                               ,.*
+                               $
+                               ''',
+                              re.VERBOSE)
+
+
 class HTMLSanitizerMixin(object):
     """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
 
@@ -138,7 +153,9 @@ class HTMLSanitizerMixin(object):
     acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
                             'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
                             'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
-                            'ssh', 'sftp', 'rtsp', 'afs']
+                            'ssh', 'sftp', 'rtsp', 'afs', 'data']
+
+    acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
 
     # subclasses may define their own versions of these constants
     allowed_elements = acceptable_elements + mathml_elements + svg_elements
@@ -147,6 +164,7 @@ class HTMLSanitizerMixin(object):
     allowed_css_keywords = acceptable_css_keywords
     allowed_svg_properties = acceptable_svg_properties
     allowed_protocols = acceptable_protocols
+    allowed_content_types = acceptable_content_types
 
     # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
     # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
@@ -189,10 +207,17 @@ def allowed_token(self, token, token_type):
                                        unescape(attrs[attr])).lower()
                 # remove replacement characters from unescaped characters
                 val_unescaped = val_unescaped.replace("\ufffd", "")
-                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
-                        (val_unescaped.split(':')[0] not in
-                         self.allowed_protocols)):
-                    del attrs[attr]
+                uri = urlparse.urlparse(val_unescaped)
+                if uri:
+                    if uri.scheme not in self.allowed_protocols:
+                        del attrs[attr]
+                    if uri.scheme == 'data':
+                        m = content_type_rgx.match(uri.path)
+                        if not m:
+                            del attrs[attr]
+                        if m.group('content_type') not in self.allowed_content_types:
+                            del attrs[attr]
+
             for attr in self.svg_attr_val_allows_ref:
                 if attr in attrs:
                     attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
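To make the new check easier to follow, here is a standalone sketch that mirrors the data-URI branch of allowed_token using only the standard library; the patch itself imports six.moves.urllib_parse for Python 2/3 compatibility, and the sample values below are illustrative:

import re
from urllib.parse import urlparse  # the patch uses six.moves.urllib_parse instead

content_type_rgx = re.compile(r'''
    ^
    # Match a content type <application>/<type>
    (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
    # Match any character set and encoding
    (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
      |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
    # Assume the rest is data
    ,.*
    $
    ''', re.VERBOSE)

allowed_content_types = ['image/png', 'image/jpeg', 'image/gif',
                         'image/webp', 'image/bmp', 'text/plain']

for value in ('data:image/png;base64,aGVsbG8gd29ybGQ=',   # whitelisted type
              'data:text/html;base64,PHNjcmlwdD4=',       # type not on the whitelist
              'data:image/xyz'):                          # no comma, so the regex rejects it
    uri = urlparse(value)
    m = content_type_rgx.match(uri.path)
    kept = bool(m) and m.group('content_type') in allowed_content_types
    print(value, '->', 'kept' if kept else 'attribute removed')

Because 'data' is not one of the schemes urlparse treats as having parameters or a network location, everything after "data:" ends up in uri.path, which is what the regular expression is matched against.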

html5lib/tests/test_sanitizer.py (17 additions, 7 deletions)

@@ -80,9 +80,12 @@ def test_sanitizer():
             continue  # TODO
         if attribute_name == 'style':
             continue
+        attribute_value = 'foo'
+        if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
+            attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
         yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
-               "<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
-               "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name,
+               "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
+               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
                toxml)
 
     for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
@@ -93,13 +96,20 @@ def test_sanitizer():
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
-        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
-               "<a href=\"%s\">foo</a>" % protocol,
-               """<a href="%s">foo</a>""" % protocol,
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
+               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)
 
     for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
+        rest_of_uri = '//sub.domain.tld/path/object.ext'
+        if protocol == 'data':
+            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
+        protocol = protocol.upper()
         yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
-               "<a href=\"%s\">foo</a>" % protocol,
-               """<a href="%s">foo</a>""" % protocol,
+               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
+               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
                toxml)
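For the new data entry in allowed_protocols, the generated allow-test expands to roughly the following input/expected pair; runSanitizerTest and toxml are helpers defined elsewhere in test_sanitizer.py and are not shown in this diff:

protocol = 'data'
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='   # base64 of "hello world"

input_html = "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri)
expected = """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri)

print(input_html)  # <img src="data:image/png;base64,aGVsbG8gd29ybGQ=">foo</a>
print(expected)    # identical here: the sanitizer is expected to pass the URI through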
