From 05730400495a56514d16c40c60ac6749ee0fa49a Mon Sep 17 00:00:00 2001 From: Tim Burke Date: Wed, 13 Mar 2019 13:13:19 -0700 Subject: [PATCH] bpo-36274: Encode request lines with surrogate escapes While this is out of spec according to RFC 7230 (which limits expected octets to some subset of ASCII), it is often useful to be able to mimic an out-of-spec client when testing a server or application. Don't use Latin-1 (though that would be in keeping with how we handle headers and bodies) to encourage callers to write RFC-compliant clients. Rather, use surrogate escape sequences ('\udc80' - '\udcff') to increase friction while still allowing out-of-spec requests to be expressible. https://bugs.python.org/issue36274 --- Lib/http/client.py | 5 +-- Lib/test/test_httplib.py | 34 +++++++++++++++++-- .../2019-07-08-09-20-10.bpo-36274.8XicsH.rst | 3 ++ 3 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2019-07-08-09-20-10.bpo-36274.8XicsH.rst diff --git a/Lib/http/client.py b/Lib/http/client.py index f61267e108a524..eb7a55c9e8d7d5 100644 --- a/Lib/http/client.py +++ b/Lib/http/client.py @@ -1095,8 +1095,9 @@ def putrequest(self, method, url, skip_host=False, f"(found at least {match.group()!r})") request = '%s %s %s' % (method, url, self._http_vsn_str) - # Non-ASCII characters should have been eliminated earlier - self._output(request.encode('ascii')) + # Encode with surrogate escapes, to allow non-ascii bytes without + # making it too easy to write an out-of-spec client + self._output(request.encode('ascii', errors='surrogateescape')) if self._http_vsn == 11: # Issue some standard headers for better HTTP/1.1 compliance diff --git a/Lib/test/test_httplib.py b/Lib/test/test_httplib.py index 9148169cc7c2e5..9b7d2942fd7c51 100644 --- a/Lib/test/test_httplib.py +++ b/Lib/test/test_httplib.py @@ -275,6 +275,37 @@ def test_ipv6host_header(self): conn.request('GET', '/foo') self.assertTrue(sock.data.startswith(expected)) + def 
test_request_path_handling(self): + happy_cases = ( + ('/', b'/'), + ('', b'/'), + ('/\udce4\udcbd\udca0\udce5\udca5\udcbd', + b'/\xe4\xbd\xa0\xe5\xa5\xbd'), + ) + for caller_path, expected_path in happy_cases: + with self.subTest((caller_path, expected_path)): + conn = client.HTTPConnection('server.fqdn') + sock = FakeSocket('') + conn.sock = sock + conn.request('GET', caller_path) + expected = (b'GET ' + expected_path + b' HTTP/1.1\r\n' + b'Host: server.fqdn\r\n' + b'Accept-Encoding: identity\r\n\r\n') + self.assertEqual(sock.data, expected) + + error_cases = ( + '/\xe4\xbd\xa0\xe5\xa5\xbd', + '/\u4f60\u597d', + ) + for caller_path in error_cases: + with self.subTest(caller_path): + conn = client.HTTPConnection('server.fqdn') + sock = FakeSocket('') + conn.sock = sock + with self.assertRaises(UnicodeEncodeError): + conn.request('GET', caller_path) + self.assertEqual(sock.data, b'') + def test_malformed_headers_coped_with(self): # Issue 19996 body = "HTTP/1.1 200 OK\r\nFirst: val\r\n: nval\r\nSecond: val\r\n\r\n" @@ -720,8 +751,7 @@ def test_send_file(self): sock = FakeSocket(body) conn.sock = sock conn.request('GET', '/foo', body) - self.assertTrue(sock.data.startswith(expected), '%r != %r' % - (sock.data[:len(expected)], expected)) + self.assertEqual(sock.data[:len(expected)], expected) def test_send(self): expected = b'this is a test this is only a test' diff --git a/Misc/NEWS.d/next/Library/2019-07-08-09-20-10.bpo-36274.8XicsH.rst b/Misc/NEWS.d/next/Library/2019-07-08-09-20-10.bpo-36274.8XicsH.rst new file mode 100644 index 00000000000000..979a60f6c66220 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-07-08-09-20-10.bpo-36274.8XicsH.rst @@ -0,0 +1,3 @@ +``http.client`` can now make requests with non-ASCII request-targets using +surrogate escape sequences. Callers are still encouraged to URL-quote the +request-target instead so as to comply with the RFC.