Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 743acbe

Browse files
sethmlarson, Eclips4, and gpshead
authored
[3.10] gh-121285: Remove backtracking when parsing tarfile headers (GH-121286) (#123640)
* Remove backtracking when parsing tarfile headers
* Rewrite PAX header parsing to be stricter
* Optimize parsing of GNU extended sparse headers v0.0

(cherry picked from commit 34ddb64)

Co-authored-by: Kirill Podoprigora <kirill.bast9@mail.ru>
Co-authored-by: Gregory P. Smith <greg@krypto.org>
1 parent e0264a6 commit 743acbe
Copy full SHA for 743acbe

File tree

3 files changed

+111
-38
lines changed
Filter options

3 files changed

+111
-38
lines changed

‎Lib/tarfile.py

Copy file name to clipboardExpand all lines: Lib/tarfile.py
+67 -38 | Lines changed: 67 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -841,6 +841,9 @@ def data_filter(member, dest_path):
841841
# Sentinel for replace() defaults, meaning "don't change the attribute"
842842
_KEEP = object()
843843

844+
# Header length is digits followed by a space.
845+
_header_length_prefix_re = re.compile(br"([0-9]{1,20}) ")
846+
844847
class TarInfo(object):
845848
"""Informational class which holds the details about an
846849
archive member given by a tar header block.
@@ -1410,59 +1413,76 @@ def _proc_pax(self, tarfile):
14101413
else:
14111414
pax_headers = tarfile.pax_headers.copy()
14121415

1413-
# Check if the pax header contains a hdrcharset field. This tells us
1414-
# the encoding of the path, linkpath, uname and gname fields. Normally,
1415-
# these fields are UTF-8 encoded but since POSIX.1-2008 tar
1416-
# implementations are allowed to store them as raw binary strings if
1417-
# the translation to UTF-8 fails.
1418-
match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1419-
if match is not None:
1420-
pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1421-
1422-
# For the time being, we don't care about anything other than "BINARY".
1423-
# The only other value that is currently allowed by the standard is
1424-
# "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1425-
hdrcharset = pax_headers.get("hdrcharset")
1426-
if hdrcharset == "BINARY":
1427-
encoding = tarfile.encoding
1428-
else:
1429-
encoding = "utf-8"
1430-
14311416
# Parse pax header information. A record looks like that:
14321417
# "%d %s=%s\n" % (length, keyword, value). length is the size
14331418
# of the complete record including the length field itself and
1434-
# the newline. keyword and value are both UTF-8 encoded strings.
1435-
regex = re.compile(br"(\d+) ([^=]+)=")
1419+
# the newline.
14361420
pos = 0
1437-
while True:
1438-
match = regex.match(buf, pos)
1439-
if not match:
1440-
break
1421+
encoding = None
1422+
raw_headers = []
1423+
while len(buf) > pos and buf[pos] != 0x00:
1424+
if not (match := _header_length_prefix_re.match(buf, pos)):
1425+
raise InvalidHeaderError("invalid header")
1426+
try:
1427+
length = int(match.group(1))
1428+
except ValueError:
1429+
raise InvalidHeaderError("invalid header")
1430+
# Headers must be at least 5 bytes, shortest being '5 x=\n'.
1431+
# Value is allowed to be empty.
1432+
if length < 5:
1433+
raise InvalidHeaderError("invalid header")
1434+
if pos + length > len(buf):
1435+
raise InvalidHeaderError("invalid header")
14411436

1442-
length, keyword = match.groups()
1443-
length = int(length)
1444-
if length == 0:
1437+
header_value_end_offset = match.start(1) + length - 1 # Last byte of the header
1438+
keyword_and_value = buf[match.end(1) + 1:header_value_end_offset]
1439+
raw_keyword, equals, raw_value = keyword_and_value.partition(b"=")
1440+
1441+
# Check the framing of the header. The last character must be '\n' (0x0A)
1442+
if not raw_keyword or equals != b"=" or buf[header_value_end_offset] != 0x0A:
14451443
raise InvalidHeaderError("invalid header")
1446-
value = buf[match.end(2) + 1:match.start(1) + length - 1]
1444+
raw_headers.append((length, raw_keyword, raw_value))
1445+
1446+
# Check if the pax header contains a hdrcharset field. This tells us
1447+
# the encoding of the path, linkpath, uname and gname fields. Normally,
1448+
# these fields are UTF-8 encoded but since POSIX.1-2008 tar
1449+
# implementations are allowed to store them as raw binary strings if
1450+
# the translation to UTF-8 fails. For the time being, we don't care about
1451+
# anything other than "BINARY". The only other value that is currently
1452+
# allowed by the standard is "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
1453+
# Note that we only follow the initial 'hdrcharset' setting to preserve
1454+
# the initial behavior of the 'tarfile' module.
1455+
if raw_keyword == b"hdrcharset" and encoding is None:
1456+
if raw_value == b"BINARY":
1457+
encoding = tarfile.encoding
1458+
else: # This branch ensures only the first 'hdrcharset' header is used.
1459+
encoding = "utf-8"
1460+
1461+
pos += length
14471462

1463+
# If no explicit hdrcharset is set, we use UTF-8 as a default.
1464+
if encoding is None:
1465+
encoding = "utf-8"
1466+
1467+
# After parsing the raw headers we can decode them to text.
1468+
for length, raw_keyword, raw_value in raw_headers:
14481469
# Normally, we could just use "utf-8" as the encoding and "strict"
14491470
# as the error handler, but we better not take the risk. For
14501471
# example, GNU tar <= 1.23 is known to store filenames it cannot
14511472
# translate to UTF-8 as raw strings (unfortunately without a
14521473
# hdrcharset=BINARY header).
14531474
# We first try the strict standard encoding, and if that fails we
14541475
# fall back on the user's encoding and error handler.
1455-
keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1476+
keyword = self._decode_pax_field(raw_keyword, "utf-8", "utf-8",
14561477
tarfile.errors)
14571478
if keyword in PAX_NAME_FIELDS:
1458-
value = self._decode_pax_field(value, encoding, tarfile.encoding,
1479+
value = self._decode_pax_field(raw_value, encoding, tarfile.encoding,
14591480
tarfile.errors)
14601481
else:
1461-
value = self._decode_pax_field(value, "utf-8", "utf-8",
1482+
value = self._decode_pax_field(raw_value, "utf-8", "utf-8",
14621483
tarfile.errors)
14631484

14641485
pax_headers[keyword] = value
1465-
pos += length
14661486

14671487
# Fetch the next header.
14681488
try:
@@ -1477,7 +1497,7 @@ def _proc_pax(self, tarfile):
14771497

14781498
elif "GNU.sparse.size" in pax_headers:
14791499
# GNU extended sparse format version 0.0.
1480-
self._proc_gnusparse_00(next, pax_headers, buf)
1500+
self._proc_gnusparse_00(next, raw_headers)
14811501

14821502
elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
14831503
# GNU extended sparse format version 1.0.
@@ -1499,15 +1519,24 @@ def _proc_pax(self, tarfile):
14991519

15001520
return next
15011521

1502-
def _proc_gnusparse_00(self, next, pax_headers, buf):
1522+
def _proc_gnusparse_00(self, next, raw_headers):
15031523
"""Process a GNU tar extended sparse header, version 0.0.
15041524
"""
15051525
offsets = []
1506-
for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1507-
offsets.append(int(match.group(1)))
15081526
numbytes = []
1509-
for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1510-
numbytes.append(int(match.group(1)))
1527+
for _, keyword, value in raw_headers:
1528+
if keyword == b"GNU.sparse.offset":
1529+
try:
1530+
offsets.append(int(value.decode()))
1531+
except ValueError:
1532+
raise InvalidHeaderError("invalid header")
1533+
1534+
elif keyword == b"GNU.sparse.numbytes":
1535+
try:
1536+
numbytes.append(int(value.decode()))
1537+
except ValueError:
1538+
raise InvalidHeaderError("invalid header")
1539+
15111540
next.sparse = list(zip(offsets, numbytes))
15121541

15131542
def _proc_gnusparse_01(self, next, pax_headers):

‎Lib/test/test_tarfile.py

Copy file name to clipboardExpand all lines: Lib/test/test_tarfile.py
+42 | Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1139,6 +1139,48 @@ def test_pax_number_fields(self):
11391139
finally:
11401140
tar.close()
11411141

1142+
def test_pax_header_bad_formats(self):
1143+
# The fields from the pax header have priority over the
1144+
# TarInfo.
1145+
pax_header_replacements = (
1146+
b" foo=bar\n",
1147+
b"0 \n",
1148+
b"1 \n",
1149+
b"2 \n",
1150+
b"3 =\n",
1151+
b"4 =a\n",
1152+
b"1000000 foo=bar\n",
1153+
b"0 foo=bar\n",
1154+
b"-12 foo=bar\n",
1155+
b"000000000000000000000000036 foo=bar\n",
1156+
)
1157+
pax_headers = {"foo": "bar"}
1158+
1159+
for replacement in pax_header_replacements:
1160+
with self.subTest(header=replacement):
1161+
tar = tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT,
1162+
encoding="iso8859-1")
1163+
try:
1164+
t = tarfile.TarInfo()
1165+
t.name = "pax" # non-ASCII
1166+
t.uid = 1
1167+
t.pax_headers = pax_headers
1168+
tar.addfile(t)
1169+
finally:
1170+
tar.close()
1171+
1172+
with open(tmpname, "rb") as f:
1173+
data = f.read()
1174+
self.assertIn(b"11 foo=bar\n", data)
1175+
data = data.replace(b"11 foo=bar\n", replacement)
1176+
1177+
with open(tmpname, "wb") as f:
1178+
f.truncate()
1179+
f.write(data)
1180+
1181+
with self.assertRaisesRegex(tarfile.ReadError, r"method tar: ReadError\('invalid header'\)"):
1182+
tarfile.open(tmpname, encoding="iso8859-1")
1183+
11421184

11431185
class WriteTestBase(TarTest):
11441186
# Put all write tests in here that are supposed to be tested
+2 | Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Remove backtracking from tarfile header parsing for ``hdrcharset``, PAX, and
2+
GNU sparse headers.

0 commit comments

Comments
0 (0)
Morty Proxy: This is a proxified and sanitized view of the page; visit the original site.