Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

bpo-43882 - urllib.parse should sanitize urls containing ASCII newline and tabs. #25595

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 29, 2021
Merged
Prev Previous commit
Next Next commit
Address Code Review Comments.
  • Loading branch information
orsenthil committed Apr 28, 2021
commit ee779d61c8f70f4fd68847f9820cf25e06306f94
11 changes: 8 additions & 3 deletions 11 Doc/library/urllib.parse.rst
Original file line number Diff line number Diff line change
Expand Up @@ -312,9 +312,8 @@ or on combining URL components into a URL string.
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
decomposed before parsing, no error will be raised.

Following the specification in WHATWG which updates RFC 3986, ASCII newline
``\n``, ``\r`` or ``\r\n`` and tab ``\t`` characters are stripped from
the url.
Following the specification in `WHATWG`_ which updates RFC 3986, ASCII newline
``\n``, ``\r`` and tab ``\t`` characters are stripped from the url.
orsenthil marked this conversation as resolved.
Show resolved Hide resolved

.. versionchanged:: 3.6
Out-of-range port numbers now raise :exc:`ValueError`, instead of
Expand Down Expand Up @@ -680,6 +679,10 @@ task isn't already covered by the URL parsing functions above.

.. seealso::

`WHATWG`_ - URL Living standard
Working Group for the URL Standard that defines URLs, domains, IP addresses, the
application/x-www-form-urlencoded format, and their API.

:rfc:`3986` - Uniform Resource Identifiers
This is the current standard (STD66). Any changes to urllib.parse module
should conform to this. Certain deviations could be observed, which are
Expand All @@ -703,3 +706,5 @@ task isn't already covered by the URL parsing functions above.

:rfc:`1738` - Uniform Resource Locators (URL)
This specifies the formal syntax and semantics of absolute URLs.

.. _WHATWG: https://url.spec.whatwg.org/#concept-basic-url-parser
43 changes: 22 additions & 21 deletions 43 Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,21 +566,6 @@ def test_urlsplit_attributes(self):
self.assertEqual(p.port, 80)
self.assertEqual(p.geturl(), url)

# Remove ASCII tabs and newlines from input

url = "http://www.python.org/java\nscript:\talert('msg\r\n')/#frag"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.scheme, "http")
self.assertEqual(p.netloc, "www.python.org")
self.assertEqual(p.path, "/javascript:alert('msg')/")
self.assertEqual(p.query, "")
self.assertEqual(p.fragment, "frag")
self.assertEqual(p.username, None)
self.assertEqual(p.password, None)
self.assertEqual(p.hostname, "www.python.org")
self.assertEqual(p.port, None)
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/#frag")

# And check them all again, only with bytes this time
url = b"HTTP://WWW.PYTHON.ORG/doc/#frag"
p = urllib.parse.urlsplit(url)
Expand Down Expand Up @@ -621,6 +606,28 @@ def test_urlsplit_attributes(self):
self.assertEqual(p.port, 80)
self.assertEqual(p.geturl(), url)

# Verify an illegal port raises ValueError
url = b"HTTP://WWW.PYTHON.ORG:65536/doc/#frag"
p = urllib.parse.urlsplit(url)
with self.assertRaisesRegex(ValueError, "out of range"):
p.port

def test_urlsplit_remove_unsafe_bytes(self):
# Remove ASCII tabs and newlines from input
url = "http://www.python.org/java\nscript:\talert('msg\r\n')/#frag"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.scheme, "http")
self.assertEqual(p.netloc, "www.python.org")
self.assertEqual(p.path, "/javascript:alert('msg')/")
self.assertEqual(p.query, "")
self.assertEqual(p.fragment, "frag")
self.assertEqual(p.username, None)
self.assertEqual(p.password, None)
self.assertEqual(p.hostname, "www.python.org")
self.assertEqual(p.port, None)
self.assertEqual(p.geturl(), "http://www.python.org/javascript:alert('msg')/#frag")

# Remove ASCII tabs and newlines from input as bytes.
url = b"http://www.python.org/java\nscript:\talert('msg\r\n')/#frag"
p = urllib.parse.urlsplit(url)
self.assertEqual(p.scheme, b"http")
Expand All @@ -634,12 +641,6 @@ def test_urlsplit_attributes(self):
self.assertEqual(p.port, None)
self.assertEqual(p.geturl(), b"http://www.python.org/javascript:alert('msg')/#frag")

# Verify an illegal port raises ValueError
url = b"HTTP://WWW.PYTHON.ORG:65536/doc/#frag"
p = urllib.parse.urlsplit(url)
with self.assertRaisesRegex(ValueError, "out of range"):
p.port

def test_attributes_bad_port(self):
"""Check handling of invalid ports."""
for bytes in (False, True):
Expand Down
7 changes: 5 additions & 2 deletions 7 Lib/urllib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@
'0123456789'
'+-.')

# Unsafe bytes to be removed per WHATWG spec
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
gpshead marked this conversation as resolved.
Show resolved Hide resolved

# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}
Expand Down Expand Up @@ -469,8 +472,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
else:
scheme, url = url[:i].lower(), url[i+1:]

_unsafe_chars_to_remove = ['\t', '\r', '\n']
url = url.translate({ord(c): None for c in _unsafe_chars_to_remove})
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
url = url.replace(b, "")

if url[:2] == '//':
netloc, url = _splitnetloc(url, 2)
Expand Down
Morty Proxy This is a proxified and sanitized view of the page, visit original site.