From 1256651ec3b5554d2f18a9b37d3e5a2976e0ddcd Mon Sep 17 00:00:00 2001 From: Noa Date: Fri, 18 Apr 2025 13:42:41 -0500 Subject: [PATCH 1/2] Update tarfile to 3.12.3 --- Lib/tarfile.py | 436 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 361 insertions(+), 75 deletions(-) diff --git a/Lib/tarfile.py b/Lib/tarfile.py index dea150e8db..3bbbcaa621 100755 --- a/Lib/tarfile.py +++ b/Lib/tarfile.py @@ -46,6 +46,7 @@ import struct import copy import re +import warnings try: import pwd @@ -57,19 +58,19 @@ grp = None # os.symlink on Windows prior to 6.0 raises NotImplementedError -symlink_exception = (AttributeError, NotImplementedError) -try: - # OSError (winerror=1314) will be raised if the caller does not hold the - # SeCreateSymbolicLinkPrivilege privilege - symlink_exception += (OSError,) -except NameError: - pass +# OSError (winerror=1314) will be raised if the caller does not hold the +# SeCreateSymbolicLinkPrivilege privilege +symlink_exception = (AttributeError, NotImplementedError, OSError) # from tarfile import * __all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError", "CompressionError", "StreamError", "ExtractError", "HeaderError", "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT", - "DEFAULT_FORMAT", "open"] + "DEFAULT_FORMAT", "open","fully_trusted_filter", "data_filter", + "tar_filter", "FilterError", "AbsoluteLinkError", + "OutsideDestinationError", "SpecialFileError", "AbsolutePathError", + "LinkOutsideDestinationError"] + #--------------------------------------------------------- # tar constants @@ -158,6 +159,8 @@ def stn(s, length, encoding, errors): """Convert a string to a null-terminated bytes object. """ + if s is None: + raise ValueError("metadata cannot contain None") s = s.encode(encoding, errors) return s[:length] + (length - len(s)) * NUL @@ -328,15 +331,17 @@ def write(self, s): class _Stream: """Class that serves as an adapter between TarFile and a stream-like object. The stream-like object only - needs to have a read() or write() method and is accessed - blockwise. Use of gzip or bzip2 compression is possible. - A stream-like object could be for example: sys.stdin, - sys.stdout, a socket, a tape device etc. + needs to have a read() or write() method that works with bytes, + and the method is accessed blockwise. + Use of gzip or bzip2 compression is possible. + A stream-like object could be for example: sys.stdin.buffer, + sys.stdout.buffer, a socket, a tape device etc. _Stream is intended to be used only internally. """ - def __init__(self, name, mode, comptype, fileobj, bufsize): + def __init__(self, name, mode, comptype, fileobj, bufsize, + compresslevel): """Construct a _Stream object. """ self._extfileobj = True @@ -368,10 +373,10 @@ def __init__(self, name, mode, comptype, fileobj, bufsize): self.zlib = zlib self.crc = zlib.crc32(b"") if mode == "r": - self._init_read_gz() self.exception = zlib.error + self._init_read_gz() else: - self._init_write_gz() + self._init_write_gz(compresslevel) elif comptype == "bz2": try: @@ -383,7 +388,7 @@ def __init__(self, name, mode, comptype, fileobj, bufsize): self.cmp = bz2.BZ2Decompressor() self.exception = OSError else: - self.cmp = bz2.BZ2Compressor() + self.cmp = bz2.BZ2Compressor(compresslevel) elif comptype == "xz": try: @@ -410,13 +415,14 @@ def __del__(self): if hasattr(self, "closed") and not self.closed: self.close() - def _init_write_gz(self): + def _init_write_gz(self, compresslevel): """Initialize for writing with gzip compression. """ - self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED, - -self.zlib.MAX_WBITS, - self.zlib.DEF_MEM_LEVEL, - 0) + self.cmp = self.zlib.compressobj(compresslevel, + self.zlib.DEFLATED, + -self.zlib.MAX_WBITS, + self.zlib.DEF_MEM_LEVEL, + 0) timestamp = struct.pack("" % (self.__class__.__name__,self.name,id(self)) + def replace(self, *, + name=_KEEP, mtime=_KEEP, mode=_KEEP, linkname=_KEEP, + uid=_KEEP, gid=_KEEP, uname=_KEEP, gname=_KEEP, + deep=True, _KEEP=_KEEP): + """Return a deep copy of self with the given attributes replaced. + """ + if deep: + result = copy.deepcopy(self) + else: + result = copy.copy(self) + if name is not _KEEP: + result.name = name + if mtime is not _KEEP: + result.mtime = mtime + if mode is not _KEEP: + result.mode = mode + if linkname is not _KEEP: + result.linkname = linkname + if uid is not _KEEP: + result.uid = uid + if gid is not _KEEP: + result.gid = gid + if uname is not _KEEP: + result.uname = uname + if gname is not _KEEP: + result.gname = gname + return result + def get_info(self): """Return the TarInfo's attributes as a dictionary. """ + if self.mode is None: + mode = None + else: + mode = self.mode & 0o7777 info = { "name": self.name, - "mode": self.mode & 0o7777, + "mode": mode, "uid": self.uid, "gid": self.gid, "size": self.size, @@ -820,6 +983,9 @@ def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescap """Return a tar header as a string of 512 byte blocks. """ info = self.get_info() + for name, value in info.items(): + if value is None: + raise ValueError("%s may not be None" % name) if format == USTAR_FORMAT: return self.create_ustar_header(info, encoding, errors) @@ -950,6 +1116,12 @@ def _create_header(info, format, encoding, errors): devmajor = stn("", 8, encoding, errors) devminor = stn("", 8, encoding, errors) + # None values in metadata should cause ValueError. + # itn()/stn() do this for all fields except type. + filetype = info.get("type", REGTYPE) + if filetype is None: + raise ValueError("TarInfo.type must not be None") + parts = [ stn(info.get("name", ""), 100, encoding, errors), itn(info.get("mode", 0) & 0o7777, 8, format), @@ -958,7 +1130,7 @@ def _create_header(info, format, encoding, errors): itn(info.get("size", 0), 12, format), itn(info.get("mtime", 0), 12, format), b" ", # checksum field - info.get("type", REGTYPE), + filetype, stn(info.get("linkname", ""), 100, encoding, errors), info.get("magic", POSIX_MAGIC), stn(info.get("uname", ""), 32, encoding, errors), @@ -1264,11 +1436,7 @@ def _proc_pax(self, tarfile): # the newline. keyword and value are both UTF-8 encoded strings. regex = re.compile(br"(\d+) ([^=]+)=") pos = 0 - while True: - match = regex.match(buf, pos) - if not match: - break - + while match := regex.match(buf, pos): length, keyword = match.groups() length = int(length) if length == 0: @@ -1468,6 +1636,8 @@ class TarFile(object): fileobject = ExFileObject # The file-object for extractfile(). + extraction_filter = None # The default filter for extraction. + def __init__(self, name=None, mode="r", fileobj=None, format=None, tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, errors="surrogateescape", pax_headers=None, debug=None, @@ -1659,7 +1829,9 @@ def not_compressed(comptype): if filemode not in ("r", "w"): raise ValueError("mode must be 'r' or 'w'") - stream = _Stream(name, filemode, comptype, fileobj, bufsize) + compresslevel = kwargs.pop("compresslevel", 9) + stream = _Stream(name, filemode, comptype, fileobj, bufsize, + compresslevel) try: t = cls(name, filemode, stream, **kwargs) except: @@ -1940,7 +2112,10 @@ def list(self, verbose=True, *, members=None): members = self for tarinfo in members: if verbose: - _safe_print(stat.filemode(tarinfo.mode)) + if tarinfo.mode is None: + _safe_print("??????????") + else: + _safe_print(stat.filemode(tarinfo.mode)) _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid, tarinfo.gname or tarinfo.gid)) if tarinfo.ischr() or tarinfo.isblk(): @@ -1948,8 +2123,11 @@ def list(self, verbose=True, *, members=None): ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor))) else: _safe_print("%10d" % tarinfo.size) - _safe_print("%d-%02d-%02d %02d:%02d:%02d" \ - % time.localtime(tarinfo.mtime)[:6]) + if tarinfo.mtime is None: + _safe_print("????-??-?? ??:??:??") + else: + _safe_print("%d-%02d-%02d %02d:%02d:%02d" \ + % time.localtime(tarinfo.mtime)[:6]) _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else "")) @@ -2036,32 +2214,63 @@ def addfile(self, tarinfo, fileobj=None): self.members.append(tarinfo) - def extractall(self, path=".", members=None, *, numeric_owner=False): + def _get_filter_function(self, filter): + if filter is None: + filter = self.extraction_filter + if filter is None: + warnings.warn( + 'Python 3.14 will, by default, filter extracted tar ' + + 'archives and reject files or modify their metadata. ' + + 'Use the filter argument to control this behavior.', + DeprecationWarning) + return fully_trusted_filter + if isinstance(filter, str): + raise TypeError( + 'String names are not supported for ' + + 'TarFile.extraction_filter. Use a function such as ' + + 'tarfile.data_filter directly.') + return filter + if callable(filter): + return filter + try: + return _NAMED_FILTERS[filter] + except KeyError: + raise ValueError(f"filter {filter!r} not found") from None + + def extractall(self, path=".", members=None, *, numeric_owner=False, + filter=None): """Extract all members from the archive to the current working directory and set owner, modification time and permissions on directories afterwards. `path' specifies a different directory to extract to. `members' is optional and must be a subset of the list returned by getmembers(). If `numeric_owner` is True, only the numbers for user/group names are used and not the names. + + The `filter` function will be called on each member just + before extraction. + It can return a changed TarInfo or None to skip the member. + String names of common filters are accepted. """ directories = [] + filter_function = self._get_filter_function(filter) if members is None: members = self - for tarinfo in members: + for member in members: + tarinfo = self._get_extract_tarinfo(member, filter_function, path) + if tarinfo is None: + continue if tarinfo.isdir(): - # Extract directories with a safe mode. + # For directories, delay setting attributes until later, + # since permissions can interfere with extraction and + # extracting contents can reset mtime. directories.append(tarinfo) - tarinfo = copy.copy(tarinfo) - tarinfo.mode = 0o700 - # Do not set_attrs directories, as we will do that further down - self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), - numeric_owner=numeric_owner) + self._extract_one(tarinfo, path, set_attrs=not tarinfo.isdir(), + numeric_owner=numeric_owner) # Reverse sort directories. - directories.sort(key=lambda a: a.name) - directories.reverse() + directories.sort(key=lambda a: a.name, reverse=True) # Set correct owner, mtime and filemode on directories. for tarinfo in directories: @@ -2071,12 +2280,10 @@ def extractall(self, path=".", members=None, *, numeric_owner=False): self.utime(tarinfo, dirpath) self.chmod(tarinfo, dirpath) except ExtractError as e: - if self.errorlevel > 1: - raise - else: - self._dbg(1, "tarfile: %s" % e) + self._handle_nonfatal_error(e) - def extract(self, member, path="", set_attrs=True, *, numeric_owner=False): + def extract(self, member, path="", set_attrs=True, *, numeric_owner=False, + filter=None): """Extract a member from the archive to the current working directory, using its full name. Its file information is extracted as accurately as possible. `member' may be a filename or a TarInfo object. You can @@ -2084,35 +2291,70 @@ def extract(self, member, path="", set_attrs=True, *, numeric_owner=False): mtime, mode) are set unless `set_attrs' is False. If `numeric_owner` is True, only the numbers for user/group names are used and not the names. + + The `filter` function will be called before extraction. + It can return a changed TarInfo or None to skip the member. + String names of common filters are accepted. """ - self._check("r") + filter_function = self._get_filter_function(filter) + tarinfo = self._get_extract_tarinfo(member, filter_function, path) + if tarinfo is not None: + self._extract_one(tarinfo, path, set_attrs, numeric_owner) + def _get_extract_tarinfo(self, member, filter_function, path): + """Get filtered TarInfo (or None) from member, which might be a str""" if isinstance(member, str): tarinfo = self.getmember(member) else: tarinfo = member + unfiltered = tarinfo + try: + tarinfo = filter_function(tarinfo, path) + except (OSError, FilterError) as e: + self._handle_fatal_error(e) + except ExtractError as e: + self._handle_nonfatal_error(e) + if tarinfo is None: + self._dbg(2, "tarfile: Excluded %r" % unfiltered.name) + return None # Prepare the link target for makelink(). if tarinfo.islnk(): + tarinfo = copy.copy(tarinfo) tarinfo._link_target = os.path.join(path, tarinfo.linkname) + return tarinfo + + def _extract_one(self, tarinfo, path, set_attrs, numeric_owner): + """Extract from filtered tarinfo to disk""" + self._check("r") try: self._extract_member(tarinfo, os.path.join(path, tarinfo.name), set_attrs=set_attrs, numeric_owner=numeric_owner) except OSError as e: - if self.errorlevel > 0: - raise - else: - if e.filename is None: - self._dbg(1, "tarfile: %s" % e.strerror) - else: - self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename)) + self._handle_fatal_error(e) except ExtractError as e: - if self.errorlevel > 1: - raise + self._handle_nonfatal_error(e) + + def _handle_nonfatal_error(self, e): + """Handle non-fatal error (ExtractError) according to errorlevel""" + if self.errorlevel > 1: + raise + else: + self._dbg(1, "tarfile: %s" % e) + + def _handle_fatal_error(self, e): + """Handle "fatal" error according to self.errorlevel""" + if self.errorlevel > 0: + raise + elif isinstance(e, OSError): + if e.filename is None: + self._dbg(1, "tarfile: %s" % e.strerror) else: - self._dbg(1, "tarfile: %s" % e) + self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename)) + else: + self._dbg(1, "tarfile: %s %s" % (type(e).__name__, e)) def extractfile(self, member): """Extract a member from the archive as a file object. `member' may be @@ -2199,11 +2441,16 @@ def makedir(self, tarinfo, targetpath): """Make a directory called targetpath. """ try: - # Use a safe mode for the directory, the real mode is set - # later in _extract_member(). - os.mkdir(targetpath, 0o700) + if tarinfo.mode is None: + # Use the system's default mode + os.mkdir(targetpath) + else: + # Use a safe mode for the directory, the real mode is set + # later in _extract_member(). + os.mkdir(targetpath, 0o700) except FileExistsError: - pass + if not os.path.isdir(targetpath): + raise def makefile(self, tarinfo, targetpath): """Make a file called targetpath. @@ -2244,6 +2491,9 @@ def makedev(self, tarinfo, targetpath): raise ExtractError("special devices not supported by system") mode = tarinfo.mode + if mode is None: + # Use mknod's default + mode = 0o600 if tarinfo.isblk(): mode |= stat.S_IFBLK else: @@ -2265,7 +2515,6 @@ def makelink(self, tarinfo, targetpath): os.unlink(targetpath) os.symlink(tarinfo.linkname, targetpath) else: - # See extract(). if os.path.exists(tarinfo._link_target): os.link(tarinfo._link_target, targetpath) else: @@ -2290,15 +2539,19 @@ def chown(self, tarinfo, targetpath, numeric_owner): u = tarinfo.uid if not numeric_owner: try: - if grp: + if grp and tarinfo.gname: g = grp.getgrnam(tarinfo.gname)[2] except KeyError: pass try: - if pwd: + if pwd and tarinfo.uname: u = pwd.getpwnam(tarinfo.uname)[2] except KeyError: pass + if g is None: + g = -1 + if u is None: + u = -1 try: if tarinfo.issym() and hasattr(os, "lchown"): os.lchown(targetpath, u, g) @@ -2310,6 +2563,8 @@ def chown(self, tarinfo, targetpath, numeric_owner): def chmod(self, tarinfo, targetpath): """Set file permissions of targetpath according to tarinfo. """ + if tarinfo.mode is None: + return try: os.chmod(targetpath, tarinfo.mode) except OSError as e: @@ -2318,10 +2573,13 @@ def chmod(self, tarinfo, targetpath): def utime(self, tarinfo, targetpath): """Set modification time of targetpath according to tarinfo. """ + mtime = tarinfo.mtime + if mtime is None: + return if not hasattr(os, 'utime'): return try: - os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime)) + os.utime(targetpath, (mtime, mtime)) except OSError as e: raise ExtractError("could not change modification time") from e @@ -2339,6 +2597,8 @@ def next(self): # Advance the file pointer. if self.offset != self.fileobj.tell(): + if self.offset == 0: + return None self.fileobj.seek(self.offset - 1) if not self.fileobj.read(1): raise ReadError("unexpected end of data") @@ -2397,13 +2657,26 @@ def _getmember(self, name, tarinfo=None, normalize=False): members = self.getmembers() # Limit the member search list up to tarinfo. + skipping = False if tarinfo is not None: - members = members[:members.index(tarinfo)] + try: + index = members.index(tarinfo) + except ValueError: + # The given starting point might be a (modified) copy. + # We'll later skip members until we find an equivalent. + skipping = True + else: + # Happy fast path + members = members[:index] if normalize: name = os.path.normpath(name) for member in reversed(members): + if skipping: + if tarinfo.offset == member.offset: + skipping = False + continue if normalize: member_name = os.path.normpath(member.name) else: @@ -2412,14 +2685,16 @@ def _getmember(self, name, tarinfo=None, normalize=False): if name == member_name: return member + if skipping: + # Starting point was not found + raise ValueError(tarinfo) + def _load(self): """Read through the entire archive file and look for readable members. """ - while True: - tarinfo = self.next() - if tarinfo is None: - break + while self.next() is not None: + pass self._loaded = True def _check(self, mode=None): @@ -2504,6 +2779,7 @@ def __exit__(self, type, value, traceback): #-------------------- # exported functions #-------------------- + def is_tarfile(name): """Return True if name points to a tar archive that we are able to handle, else return False. @@ -2512,7 +2788,9 @@ def is_tarfile(name): """ try: if hasattr(name, "read"): + pos = name.tell() t = open(fileobj=name) + name.seek(pos) else: t = open(name) t.close() @@ -2530,6 +2808,10 @@ def main(): parser = argparse.ArgumentParser(description=description) parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Verbose output') + parser.add_argument('--filter', metavar='', + choices=_NAMED_FILTERS, + help='Filter for extraction') + group = parser.add_mutually_exclusive_group(required=True) group.add_argument('-l', '--list', metavar='', help='Show listing of a tarfile') @@ -2541,8 +2823,12 @@ def main(): help='Create tarfile from sources') group.add_argument('-t', '--test', metavar='', help='Test if a tarfile is valid') + args = parser.parse_args() + if args.filter and args.extract is None: + parser.exit(1, '--filter is only valid for extraction\n') + if args.test is not None: src = args.test if is_tarfile(src): @@ -2573,7 +2859,7 @@ def main(): if is_tarfile(src): with TarFile.open(src, 'r:*') as tf: - tf.extractall(path=curdir) + tf.extractall(path=curdir, filter=args.filter) if args.verbose: if curdir == '.': msg = '{!r} file is extracted.'.format(src) From d6f468e9008c04f768e2c20358f6433038b1fe23 Mon Sep 17 00:00:00 2001 From: Noa Date: Fri, 18 Apr 2025 13:42:54 -0500 Subject: [PATCH 2/2] Unmark tests --- Lib/test/test_shutil.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Lib/test/test_shutil.py b/Lib/test/test_shutil.py index 16416547c1..b64ccb37a5 100644 --- a/Lib/test/test_shutil.py +++ b/Lib/test/test_shutil.py @@ -2000,13 +2000,9 @@ def check_unpack_tarball(self, format): ('Python 3.14', DeprecationWarning)): self.check_unpack_archive(format) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_unpack_archive_tar(self): self.check_unpack_tarball('tar') - # TODO: RUSTPYTHON - @unittest.expectedFailure @support.requires_zlib() def test_unpack_archive_gztar(self): self.check_unpack_tarball('gztar')