Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 7d1f50c

Browse files
sethmlarson, Eclips4, and gpshead authored
[3.8] gh-121285: Remove backtracking when parsing tarfile headers (GH-121286) (#123642)
* Remove backtracking when parsing tarfile headers
* Rewrite PAX header parsing to be stricter
* Optimize parsing of GNU extended sparse headers v0.0

(cherry picked from commit 34ddb64)

Co-authored-by: Seth Michael Larson <seth@python.org>
Co-authored-by: Kirill Podoprigora <kirill.bast9@mail.ru>
Co-authored-by: Gregory P. Smith <greg@krypto.org>
1 parent 7bc367e commit 7d1f50c
Copy full SHA for 7d1f50c

File tree

3 files changed

+111
-38
lines changed
Filter options

3 files changed

+111
-38
lines changed

‎Lib/tarfile.py

Copy file name to clipboard | Expand all lines: Lib/tarfile.py
+67 -38 | Lines changed: 67 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -840,6 +840,9 @@ def data_filter(member, dest_path):
840840
# Sentinel for replace() defaults, meaning "don't change the attribute"
841841
_KEEP = object()
842842

843+
# Header length is digits followed by a space.
844+
_header_length_prefix_re = re.compile(br"([0-9]{1,20}) ")
845+
843846
class TarInfo(object):
844847
"""Informational class which holds the details about an
845848
archive member given by a tar header block.
@@ -1390,59 +1393,76 @@ def _proc_pax(self, tarfile):
13901393
else:
13911394
pax_headers = tarfile.pax_headers.copy()
13921395

1393-
# Check if the pax header contains a hdrcharset field. This tells us
1394-
# the encoding of the path, linkpath, uname and gname fields. Normally,
1395-
# these fields are UTF-8 encoded but since POSIX.1-2008 tar
1396-
# implementations are allowed to store them as raw binary strings if
1397-
# the translation to UTF-8 fails.
1398-
match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1399-
if match is not None:
1400-
pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1401-
1402-
# For the time being, we don't care about anything other than "BINARY".
1403-
# The only other value that is currently allowed by the standard is
1404-
# "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1405-
hdrcharset = pax_headers.get("hdrcharset")
1406-
if hdrcharset == "BINARY":
1407-
encoding = tarfile.encoding
1408-
else:
1409-
encoding = "utf-8"
1410-
14111396
# Parse pax header information. A record looks like that:
14121397
# "%d %s=%s\n" % (length, keyword, value). length is the size
14131398
# of the complete record including the length field itself and
1414-
# the newline. keyword and value are both UTF-8 encoded strings.
1415-
regex = re.compile(br"(\d+) ([^=]+)=")
1399+
# the newline.
14161400
pos = 0
1417-
while True:
1418-
match = regex.match(buf, pos)
1419-
if not match:
1420-
break
1401+
encoding = None
1402+
raw_headers = []
1403+
while len(buf) > pos and buf[pos] != 0x00:
1404+
if not (match := _header_length_prefix_re.match(buf, pos)):
1405+
raise InvalidHeaderError("invalid header")
1406+
try:
1407+
length = int(match.group(1))
1408+
except ValueError:
1409+
raise InvalidHeaderError("invalid header")
1410+
# Headers must be at least 5 bytes, shortest being '5 x=\n'.
1411+
# Value is allowed to be empty.
1412+
if length < 5:
1413+
raise InvalidHeaderError("invalid header")
1414+
if pos + length > len(buf):
1415+
raise InvalidHeaderError("invalid header")
14211416

1422-
length, keyword = match.groups()
1423-
length = int(length)
1424-
if length == 0:
1417+
header_value_end_offset = match.start(1) + length - 1 # Last byte of the header
1418+
keyword_and_value = buf[match.end(1) + 1:header_value_end_offset]
1419+
raw_keyword, equals, raw_value = keyword_and_value.partition(b"=")
1420+
1421+
# Check the framing of the header. The last character must be '\n' (0x0A)
1422+
if not raw_keyword or equals != b"=" or buf[header_value_end_offset] != 0x0A:
14251423
raise InvalidHeaderError("invalid header")
1426-
value = buf[match.end(2) + 1:match.start(1) + length - 1]
1424+
raw_headers.append((length, raw_keyword, raw_value))
1425+
1426+
# Check if the pax header contains a hdrcharset field. This tells us
1427+
# the encoding of the path, linkpath, uname and gname fields. Normally,
1428+
# these fields are UTF-8 encoded but since POSIX.1-2008 tar
1429+
# implementations are allowed to store them as raw binary strings if
1430+
# the translation to UTF-8 fails. For the time being, we don't care about
1431+
# anything other than "BINARY". The only other value that is currently
1432+
# allowed by the standard is "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1433+
# Note that we only follow the initial 'hdrcharset' setting to preserve
1434+
# the initial behavior of the 'tarfile' module.
1435+
if raw_keyword == b"hdrcharset" and encoding is None:
1436+
if raw_value == b"BINARY":
1437+
encoding = tarfile.encoding
1438+
else: # This branch ensures only the first 'hdrcharset' header is used.
1439+
encoding = "utf-8"
1440+
1441+
pos += length
14271442

1443+
# If no explicit hdrcharset is set, we use UTF-8 as a default.
1444+
if encoding is None:
1445+
encoding = "utf-8"
1446+
1447+
# After parsing the raw headers we can decode them to text.
1448+
for length, raw_keyword, raw_value in raw_headers:
14281449
# Normally, we could just use "utf-8" as the encoding and "strict"
14291450
# as the error handler, but we better not take the risk. For
14301451
# example, GNU tar <= 1.23 is known to store filenames it cannot
14311452
# translate to UTF-8 as raw strings (unfortunately without a
14321453
# hdrcharset=BINARY header).
14331454
# We first try the strict standard encoding, and if that fails we
14341455
# fall back on the user's encoding and error handler.
1435-
keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1456+
keyword = self._decode_pax_field(raw_keyword, "utf-8", "utf-8",
14361457
tarfile.errors)
14371458
if keyword in PAX_NAME_FIELDS:
1438-
value = self._decode_pax_field(value, encoding, tarfile.encoding,
1459+
value = self._decode_pax_field(raw_value, encoding, tarfile.encoding,
14391460
tarfile.errors)
14401461
else:
1441-
value = self._decode_pax_field(value, "utf-8", "utf-8",
1462+
value = self._decode_pax_field(raw_value, "utf-8", "utf-8",
14421463
tarfile.errors)
14431464

14441465
pax_headers[keyword] = value
1445-
pos += length
14461466

14471467
# Fetch the next header.
14481468
try:
@@ -1457,7 +1477,7 @@ def _proc_pax(self, tarfile):
14571477

14581478
elif "GNU.sparse.size" in pax_headers:
14591479
# GNU extended sparse format version 0.0.
1460-
self._proc_gnusparse_00(next, pax_headers, buf)
1480+
self._proc_gnusparse_00(next, raw_headers)
14611481

14621482
elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
14631483
# GNU extended sparse format version 1.0.
@@ -1479,15 +1499,24 @@ def _proc_pax(self, tarfile):
14791499

14801500
return next
14811501

1482-
def _proc_gnusparse_00(self, next, pax_headers, buf):
1502+
def _proc_gnusparse_00(self, next, raw_headers):
14831503
"""Process a GNU tar extended sparse header, version 0.0.
14841504
"""
14851505
offsets = []
1486-
for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1487-
offsets.append(int(match.group(1)))
14881506
numbytes = []
1489-
for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1490-
numbytes.append(int(match.group(1)))
1507+
for _, keyword, value in raw_headers:
1508+
if keyword == b"GNU.sparse.offset":
1509+
try:
1510+
offsets.append(int(value.decode()))
1511+
except ValueError:
1512+
raise InvalidHeaderError("invalid header")
1513+
1514+
elif keyword == b"GNU.sparse.numbytes":
1515+
try:
1516+
numbytes.append(int(value.decode()))
1517+
except ValueError:
1518+
raise InvalidHeaderError("invalid header")
1519+
14911520
next.sparse = list(zip(offsets, numbytes))
14921521

14931522
def _proc_gnusparse_01(self, next, pax_headers):

‎Lib/test/test_tarfile.py

Copy file name to clipboard | Expand all lines: Lib/test/test_tarfile.py
+42 | Lines changed: 42 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1047,6 +1047,48 @@ def test_pax_number_fields(self):
10471047
finally:
10481048
tar.close()
10491049

1050+
def test_pax_header_bad_formats(self):
1051+
# The fields from the pax header have priority over the
1052+
# TarInfo.
1053+
pax_header_replacements = (
1054+
b" foo=bar\n",
1055+
b"0 \n",
1056+
b"1 \n",
1057+
b"2 \n",
1058+
b"3 =\n",
1059+
b"4 =a\n",
1060+
b"1000000 foo=bar\n",
1061+
b"0 foo=bar\n",
1062+
b"-12 foo=bar\n",
1063+
b"000000000000000000000000036 foo=bar\n",
1064+
)
1065+
pax_headers = {"foo": "bar"}
1066+
1067+
for replacement in pax_header_replacements:
1068+
with self.subTest(header=replacement):
1069+
tar = tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT,
1070+
encoding="iso8859-1")
1071+
try:
1072+
t = tarfile.TarInfo()
1073+
t.name = "pax" # non-ASCII
1074+
t.uid = 1
1075+
t.pax_headers = pax_headers
1076+
tar.addfile(t)
1077+
finally:
1078+
tar.close()
1079+
1080+
with open(tmpname, "rb") as f:
1081+
data = f.read()
1082+
self.assertIn(b"11 foo=bar\n", data)
1083+
data = data.replace(b"11 foo=bar\n", replacement)
1084+
1085+
with open(tmpname, "wb") as f:
1086+
f.truncate()
1087+
f.write(data)
1088+
1089+
with self.assertRaisesRegex(tarfile.ReadError, r"file could not be opened successfully"):
1090+
tarfile.open(tmpname, encoding="iso8859-1")
1091+
10501092

10511093
class WriteTestBase(TarTest):
10521094
# Put all write tests in here that are supposed to be tested
+2Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Remove backtracking from tarfile header parsing for ``hdrcharset``, PAX, and
2+
GNU sparse headers.

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.