diff --git a/Lib/test/test_zipfile/_path/test_path.py b/Lib/test/test_zipfile/_path/test_path.py index 2d1c06cd968..5c69c77f7d8 100644 --- a/Lib/test/test_zipfile/_path/test_path.py +++ b/Lib/test/test_zipfile/_path/test_path.py @@ -275,7 +275,8 @@ def test_pathlike_construction(self, alpharep): """ zipfile_ondisk = self.zipfile_ondisk(alpharep) pathlike = FakePath(str(zipfile_ondisk)) - zipfile.Path(pathlike) + root = zipfile.Path(pathlike) + root.root.close() @pass_alpharep def test_traverse_pathlike(self, alpharep): @@ -374,6 +375,7 @@ def test_root_on_disk(self, alpharep): root = zipfile.Path(self.zipfile_ondisk(alpharep)) assert root.name == 'alpharep.zip' == root.filename.name assert root.stem == 'alpharep' == root.filename.stem + root.root.close() @pass_alpharep def test_suffix(self, alpharep): @@ -565,7 +567,7 @@ def test_inheritance(self, alpharep): file = cls(alpharep).joinpath('some dir').parent assert isinstance(file, cls) - @unittest.skipIf(sys.platform == 'win32', "TODO: RUSTPYTHON, fails on Windows") + @unittest.skipIf(sys.platform == 'win32', 'TODO: RUSTPYTHON; fails on Windows') @parameterize( ['alpharep', 'path_type', 'subpath'], itertools.product( @@ -576,11 +578,13 @@ def test_inheritance(self, alpharep): ) def test_pickle(self, alpharep, path_type, subpath): zipfile_ondisk = path_type(str(self.zipfile_ondisk(alpharep))) - - saved_1 = pickle.dumps(zipfile.Path(zipfile_ondisk, at=subpath)) + root = zipfile.Path(zipfile_ondisk, at=subpath) + saved_1 = pickle.dumps(root) + root.root.close() restored_1 = pickle.loads(saved_1) first, *rest = restored_1.iterdir() assert first.read_text(encoding='utf-8').startswith('content of ') + restored_1.root.close() @pass_alpharep def test_extract_orig_with_implied_dirs(self, alpharep): @@ -592,6 +596,7 @@ def test_extract_orig_with_implied_dirs(self, alpharep): # wrap the zipfile for its side effect zipfile.Path(zf) zf.extractall(source_path.parent) + zf.close() @pass_alpharep def test_getinfo_missing(self, alpharep): diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py index fa1feef00cd..63413d7b944 100644 --- a/Lib/test/test_zipfile/test_core.py +++ b/Lib/test/test_zipfile/test_core.py @@ -302,26 +302,26 @@ def test_low_compression(self): self.assertEqual(openobj.read(1), b'2') def test_writestr_compression(self): - zipfp = zipfile.ZipFile(TESTFN2, "w") - zipfp.writestr("b.txt", "hello world", compress_type=self.compression) - info = zipfp.getinfo('b.txt') - self.assertEqual(info.compress_type, self.compression) + with zipfile.ZipFile(TESTFN2, "w") as zipfp: + zipfp.writestr("b.txt", "hello world", compress_type=self.compression) + info = zipfp.getinfo('b.txt') + self.assertEqual(info.compress_type, self.compression) def test_writestr_compresslevel(self): - zipfp = zipfile.ZipFile(TESTFN2, "w", compresslevel=1) - zipfp.writestr("a.txt", "hello world", compress_type=self.compression) - zipfp.writestr("b.txt", "hello world", compress_type=self.compression, - compresslevel=2) + with zipfile.ZipFile(TESTFN2, "w", compresslevel=1) as zipfp: + zipfp.writestr("a.txt", "hello world", compress_type=self.compression) + zipfp.writestr("b.txt", "hello world", compress_type=self.compression, + compresslevel=2) - # Compression level follows the constructor. - a_info = zipfp.getinfo('a.txt') - self.assertEqual(a_info.compress_type, self.compression) - self.assertEqual(a_info.compress_level, 1) + # Compression level follows the constructor. + a_info = zipfp.getinfo('a.txt') + self.assertEqual(a_info.compress_type, self.compression) + self.assertEqual(a_info.compress_level, 1) - # Compression level is overridden. - b_info = zipfp.getinfo('b.txt') - self.assertEqual(b_info.compress_type, self.compression) - self.assertEqual(b_info._compresslevel, 2) + # Compression level is overridden. + b_info = zipfp.getinfo('b.txt') + self.assertEqual(b_info.compress_type, self.compression) + self.assertEqual(b_info._compresslevel, 2) def test_read_return_size(self): # Issue #9837: ZipExtFile.read() shouldn't return more bytes @@ -884,6 +884,8 @@ def make_zip64_file( self, file_size_64_set=False, file_size_extra=False, compress_size_64_set=False, compress_size_extra=False, header_offset_64_set=False, header_offset_extra=False, + extensible_data=b'', + end_of_central_dir_size=None, offset_to_end_of_central_dir=None, ): """Generate bytes sequence for a zip with (incomplete) zip64 data. @@ -937,6 +939,12 @@ def make_zip64_file( central_dir_size = struct.pack('", "exec"), NOW, len(src)) - files = {TESTMOD + pyc_ext: (NOW, pyc), - "some.data": (NOW, "some data")} - self.doTest(pyc_ext, files, TESTMOD) + files = {TESTMOD + pyc_ext: pyc, + "some.data": "some data"} + self.doTest(pyc_ext, files, TESTMOD, prefix='') def testDefaultOptimizationLevel(self): # zipimport should use the default optimization level (#28131) @@ -648,17 +667,20 @@ def testDefaultOptimizationLevel(self): def test(val): assert(val) return val\n""" - files = {TESTMOD + '.py': (NOW, src)} + files = {TESTMOD + '.py': src} self.makeZip(files) sys.path.insert(0, TEMP_ZIP) mod = importlib.import_module(TESTMOD) self.assertEqual(mod.test(1), 1) - self.assertRaises(AssertionError, mod.test, False) + if __debug__: + self.assertRaises(AssertionError, mod.test, False) + else: + self.assertEqual(mod.test(0), 0) def testImport_WithStuff(self): # try importing from a zipfile which contains additional # stuff at the beginning of the file - files = {TESTMOD + ".py": (NOW, test_src)} + files = {TESTMOD + ".py": test_src} self.doTest(".py", files, TESTMOD, stuff=b"Some Stuff"*31) @@ -666,18 +688,18 @@ def assertModuleSource(self, module): self.assertEqual(inspect.getsource(module), test_src) def testGetSource(self): - files = {TESTMOD + ".py": (NOW, test_src)} + files = {TESTMOD + ".py": test_src} self.doTest(".py", files, TESTMOD, call=self.assertModuleSource) def testGetCompiledSource(self): pyc = make_pyc(compile(test_src, "", "exec"), NOW, len(test_src)) - files = {TESTMOD + ".py": (NOW, test_src), - TESTMOD + pyc_ext: (NOW, pyc)} + files = {TESTMOD + ".py": test_src, + TESTMOD + pyc_ext: pyc} self.doTest(pyc_ext, files, TESTMOD, call=self.assertModuleSource) def runDoctest(self, callback): - files = {TESTMOD + ".py": (NOW, test_src), - "xyz.txt": (NOW, ">>> log.append(True)\n")} + files = {TESTMOD + ".py": test_src, + "xyz.txt": ">>> log.append(True)\n"} self.doTest(".py", files, TESTMOD, call=callback) def doDoctestFile(self, module): @@ -720,56 +742,177 @@ def doTraceback(self, module): s = io.StringIO() print_tb(tb, 1, s) - self.assertTrue(s.getvalue().endswith(raise_src)) + self.assertTrue(s.getvalue().endswith( + ' def do_raise(): raise TypeError\n' + '' if support.has_no_debug_ranges() else + ' ^^^^^^^^^^^^^^^\n' + )) else: raise AssertionError("This ought to be impossible") - # TODO: RUSTPYTHON; empty caret lines from equal col/end_col - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON; empty caret lines from equal col/end_col def testTraceback(self): - files = {TESTMOD + ".py": (NOW, raise_src)} + files = {TESTMOD + ".py": raise_src} self.doTest(None, files, TESTMOD, call=self.doTraceback) @unittest.skipIf(os_helper.TESTFN_UNENCODABLE is None, "need an unencodable filename") def testUnencodable(self): filename = os_helper.TESTFN_UNENCODABLE + ".zip" - self.addCleanup(os_helper.unlink, filename) - with ZipFile(filename, "w") as z: - zinfo = ZipInfo(TESTMOD + ".py", time.localtime(NOW)) - zinfo.compress_type = self.compression - z.writestr(zinfo, test_src) + self.makeZip({TESTMOD + ".py": test_src}, filename) spec = zipimport.zipimporter(filename).find_spec(TESTMOD) mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) def testBytesPath(self): filename = os_helper.TESTFN + ".zip" - self.addCleanup(os_helper.unlink, filename) - with ZipFile(filename, "w") as z: - zinfo = ZipInfo(TESTMOD + ".py", time.localtime(NOW)) - zinfo.compress_type = self.compression - z.writestr(zinfo, test_src) + self.makeZip({TESTMOD + ".py": test_src}, filename) zipimport.zipimporter(filename) - zipimport.zipimporter(os.fsencode(filename)) + with self.assertRaises(TypeError): + zipimport.zipimporter(os.fsencode(filename)) with self.assertRaises(TypeError): zipimport.zipimporter(bytearray(os.fsencode(filename))) with self.assertRaises(TypeError): zipimport.zipimporter(memoryview(os.fsencode(filename))) def testComment(self): - files = {TESTMOD + ".py": (NOW, test_src)} + files = {TESTMOD + ".py": test_src} self.doTest(".py", files, TESTMOD, comment=b"comment") def testBeginningCruftAndComment(self): - files = {TESTMOD + ".py": (NOW, test_src)} + files = {TESTMOD + ".py": test_src} self.doTest(".py", files, TESTMOD, stuff=b"cruft" * 64, comment=b"hi") def testLargestPossibleComment(self): - files = {TESTMOD + ".py": (NOW, test_src)} + files = {TESTMOD + ".py": test_src} self.doTest(".py", files, TESTMOD, comment=b"c" * ((1 << 16) - 1)) + def testZip64(self): + files = self.getZip64Files() + self.doTest(".py", files, "f6") + + def testZip64CruftAndComment(self): + files = self.getZip64Files() + self.doTest(".py", files, "f65536", comment=b"c" * ((1 << 16) - 1)) + + @unittest.skip('TODO: RUSTPYTHON; (intermittent success/failures); ValueError: name="RustPython/crates/pylib/Lib/test/zipimport_data/sparse-zip64-c0-0x000000000.part" does not fit expected pattern.') + def testZip64LargeFile(self): + support.requires( + "largefile", + f"test generates files >{0xFFFFFFFF} bytes and takes a long time " + "to run" + ) + + # N.B.: We do a lot of gymnastics below in the ZIP_STORED case to save + # and reconstruct a sparse zip on systems that support sparse files. + # Instead of creating a ~8GB zip file mainly consisting of null bytes + # for every run of the test, we create the zip once and save off the + # non-null portions of the resulting file as data blobs with offsets + # that allow re-creating the zip file sparsely. This drops disk space + # usage to ~9KB for the ZIP_STORED case and drops that test time by ~2 + # orders of magnitude. For the ZIP_DEFLATED case, however, we bite the + # bullet. The resulting zip file is ~8MB of non-null data; so the sparse + # trick doesn't work and would result in that full ~8MB zip data file + # being checked in to source control. + parts_glob = f"sparse-zip64-c{self.compression:d}-0x*.part" + full_parts_glob = os.path.join(TEST_DATA_DIR, parts_glob) + pre_built_zip_parts = glob.glob(full_parts_glob) + + self.addCleanup(os_helper.unlink, TEMP_ZIP) + if not pre_built_zip_parts: + if self.compression != ZIP_STORED: + support.requires( + "cpu", + "test requires a lot of CPU for compression." + ) + self.addCleanup(os_helper.unlink, os_helper.TESTFN) + with open(os_helper.TESTFN, "wb") as f: + f.write(b"data") + f.write(os.linesep.encode()) + f.seek(0xffff_ffff, os.SEEK_CUR) + f.write(os.linesep.encode()) + os.utime(os_helper.TESTFN, (0.0, 0.0)) + with ZipFile( + TEMP_ZIP, + "w", + compression=self.compression, + strict_timestamps=False + ) as z: + z.write(os_helper.TESTFN, "data1") + z.writestr( + ZipInfo("module.py", (1980, 1, 1, 0, 0, 0)), test_src + ) + z.write(os_helper.TESTFN, "data2") + + # This "works" but relies on the zip format having a non-empty + # final page due to the trailing central directory to wind up with + # the correct length file. + def make_sparse_zip_parts(name): + empty_page = b"\0" * 4096 + with open(name, "rb") as f: + part = None + try: + while True: + offset = f.tell() + data = f.read(len(empty_page)) + if not data: + break + if data != empty_page: + if not part: + part_fullname = os.path.join( + TEST_DATA_DIR, + f"sparse-zip64-c{self.compression:d}-" + f"{offset:#011x}.part", + ) + os.makedirs( + os.path.dirname(part_fullname), + exist_ok=True + ) + part = open(part_fullname, "wb") + print("Created", part_fullname) + part.write(data) + else: + if part: + part.close() + part = None + finally: + if part: + part.close() + + if self.compression == ZIP_STORED: + print(f"Creating sparse parts to check in into {TEST_DATA_DIR}:") + make_sparse_zip_parts(TEMP_ZIP) + + else: + def extract_offset(name): + if m := re.search(r"-(0x[0-9a-f]{9})\.part$", name): + return int(m.group(1), base=16) + raise ValueError(f"{name=} does not fit expected pattern.") + offset_parts = [(extract_offset(n), n) for n in pre_built_zip_parts] + with open(TEMP_ZIP, "wb") as f: + for offset, part_fn in sorted(offset_parts): + with open(part_fn, "rb") as part: + f.seek(offset, os.SEEK_SET) + f.write(part.read()) + # Confirm that the reconstructed zip file works and looks right. + with ZipFile(TEMP_ZIP, "r") as z: + self.assertEqual( + z.getinfo("module.py").date_time, (1980, 1, 1, 0, 0, 0) + ) + self.assertEqual( + z.read("module.py"), test_src.encode(), + msg=f"Recreate {full_parts_glob}, unexpected contents." + ) + def assertDataEntry(name): + zinfo = z.getinfo(name) + self.assertEqual(zinfo.date_time, (1980, 1, 1, 0, 0, 0)) + self.assertGreater(zinfo.file_size, 0xffff_ffff) + assertDataEntry("data1") + assertDataEntry("data2") + + self.doTestWithPreBuiltZip(".py", "module") + @support.requires_zlib() class CompressedZipImportTestCase(UncompressedZipImportTestCase): @@ -801,6 +944,7 @@ def testEmptyFile(self): os_helper.create_empty_file(TESTMOD) self.assertZipFailure(TESTMOD) + @unittest.skipIf(support.is_wasi, "mode 000 not supported.") def testFileUnreadable(self): os_helper.unlink(TESTMOD) fd = os.open(TESTMOD, os.O_CREAT, 000) @@ -844,7 +988,6 @@ def _testBogusZipFile(self): self.assertRaises(TypeError, z.get_source, None) error = zipimport.ZipImportError - self.assertIsNone(z.find_module('abc')) self.assertIsNone(z.find_spec('abc')) with warnings.catch_warnings(): diff --git a/Lib/test/zipimport_data/sparse-zip64-c0-0x000000000.part b/Lib/test/zipimport_data/sparse-zip64-c0-0x000000000.part new file mode 100644 index 00000000000..c6beae8e255 Binary files /dev/null and b/Lib/test/zipimport_data/sparse-zip64-c0-0x000000000.part differ diff --git a/Lib/test/zipimport_data/sparse-zip64-c0-0x100000000.part b/Lib/test/zipimport_data/sparse-zip64-c0-0x100000000.part new file mode 100644 index 00000000000..74ab03b4648 Binary files /dev/null and b/Lib/test/zipimport_data/sparse-zip64-c0-0x100000000.part differ diff --git a/Lib/test/zipimport_data/sparse-zip64-c0-0x200000000.part b/Lib/test/zipimport_data/sparse-zip64-c0-0x200000000.part new file mode 100644 index 00000000000..9769a404f67 Binary files /dev/null and b/Lib/test/zipimport_data/sparse-zip64-c0-0x200000000.part differ diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py index 05f387a950b..c01f13729e1 100644 --- a/Lib/zipfile/__init__.py +++ b/Lib/zipfile/__init__.py @@ -245,7 +245,7 @@ def is_zipfile(filename): else: with open(filename, "rb") as fp: result = _check_zipfile(fp) - except OSError: + except (OSError, BadZipFile): pass return result @@ -253,16 +253,15 @@ def _EndRecData64(fpin, offset, endrec): """ Read the ZIP64 end-of-archive records and use that to update endrec """ - try: - fpin.seek(offset - sizeEndCentDir64Locator, 2) - except OSError: - # If the seek fails, the file is not large enough to contain a ZIP64 + offset -= sizeEndCentDir64Locator + if offset < 0: + # The file is not large enough to contain a ZIP64 # end-of-archive record, so just return the end record we were given. return endrec - + fpin.seek(offset) data = fpin.read(sizeEndCentDir64Locator) if len(data) != sizeEndCentDir64Locator: - return endrec + raise OSError("Unknown I/O error") sig, diskno, reloff, disks = struct.unpack(structEndArchive64Locator, data) if sig != stringEndArchive64Locator: return endrec @@ -270,16 +269,33 @@ def _EndRecData64(fpin, offset, endrec): if diskno != 0 or disks > 1: raise BadZipFile("zipfiles that span multiple disks are not supported") - # Assume no 'zip64 extensible data' - fpin.seek(offset - sizeEndCentDir64Locator - sizeEndCentDir64, 2) + offset -= sizeEndCentDir64 + if reloff > offset: + raise BadZipFile("Corrupt zip64 end of central directory locator") + # First, check the assumption that there is no prepended data. + fpin.seek(reloff) + extrasz = offset - reloff data = fpin.read(sizeEndCentDir64) if len(data) != sizeEndCentDir64: - return endrec + raise OSError("Unknown I/O error") + if not data.startswith(stringEndArchive64) and reloff != offset: + # Since we already have seen the Zip64 EOCD Locator, it's + # possible we got here because there is prepended data. + # Assume no 'zip64 extensible data' + fpin.seek(offset) + extrasz = 0 + data = fpin.read(sizeEndCentDir64) + if len(data) != sizeEndCentDir64: + raise OSError("Unknown I/O error") + if not data.startswith(stringEndArchive64): + raise BadZipFile("Zip64 end of central directory record not found") + sig, sz, create_version, read_version, disk_num, disk_dir, \ dircount, dircount2, dirsize, diroffset = \ struct.unpack(structEndArchive64, data) - if sig != stringEndArchive64: - return endrec + if (diroffset + dirsize != reloff or + sz + 12 != sizeEndCentDir64 + extrasz): + raise BadZipFile("Corrupt zip64 end of central directory record") # Update the original endrec using data from the ZIP64 record endrec[_ECD_SIGNATURE] = sig @@ -289,6 +305,7 @@ def _EndRecData64(fpin, offset, endrec): endrec[_ECD_ENTRIES_TOTAL] = dircount2 endrec[_ECD_SIZE] = dirsize endrec[_ECD_OFFSET] = diroffset + endrec[_ECD_LOCATION] = offset - extrasz return endrec @@ -322,7 +339,7 @@ def _EndRecData(fpin): endrec.append(filesize - sizeEndCentDir) # Try to read the "Zip64 end of central directory" structure - return _EndRecData64(fpin, -sizeEndCentDir, endrec) + return _EndRecData64(fpin, filesize - sizeEndCentDir, endrec) # Either this is not a ZIP file, or it is a ZIP file with an archive # comment. Search the end of the file for the "end of central directory" @@ -346,8 +363,7 @@ def _EndRecData(fpin): endrec.append(maxCommentStart + start) # Try to read the "Zip64 end of central directory" structure - return _EndRecData64(fpin, maxCommentStart + start - filesize, - endrec) + return _EndRecData64(fpin, maxCommentStart + start, endrec) # Unable to find a valid end of central directory structure return None @@ -1458,9 +1474,6 @@ def _RealGetContents(self): # "concat" is zero, unless zip was concatenated to another file concat = endrec[_ECD_LOCATION] - size_cd - offset_cd - if endrec[_ECD_SIGNATURE] == stringEndArchive64: - # If Zip64 extension structures are present, account for them - concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator) if self.debug > 2: inferred = concat + offset_cd @@ -2082,7 +2095,7 @@ def _write_end_record(self): " would require ZIP64 extensions") zip64endrec = struct.pack( structEndArchive64, stringEndArchive64, - 44, 45, 45, 0, 0, centDirCount, centDirCount, + sizeEndCentDir64 - 12, 45, 45, 0, 0, centDirCount, centDirCount, centDirSize, centDirOffset) self.fp.write(zip64endrec) diff --git a/Lib/zipimport.py b/Lib/zipimport.py index 25eaee9c0f2..fb312be115e 100644 --- a/Lib/zipimport.py +++ b/Lib/zipimport.py @@ -1,11 +1,9 @@ """zipimport provides support for importing Python modules from Zip archives. -This module exports three objects: +This module exports two objects: - zipimporter: a class; its constructor takes a path to a Zip archive. - ZipImportError: exception raised by zipimporter objects. It's a subclass of ImportError, so it can be caught as ImportError, too. -- _zip_directory_cache: a dict, mapping archive paths to zip directory - info dicts, as used in zipimporter._files. It is usually not needed to use the zipimport module explicitly; it is used by the builtin import mechanism for sys.path items that are paths @@ -15,7 +13,7 @@ #from importlib import _bootstrap_external #from importlib import _bootstrap # for _verbose_message import _frozen_importlib_external as _bootstrap_external -from _frozen_importlib_external import _unpack_uint16, _unpack_uint32 +from _frozen_importlib_external import _unpack_uint16, _unpack_uint32, _unpack_uint64 import _frozen_importlib as _bootstrap # for _verbose_message import _imp # for check_hash_based_pycs import _io # for open @@ -40,8 +38,14 @@ class ZipImportError(ImportError): _module_type = type(sys) END_CENTRAL_DIR_SIZE = 22 -STRING_END_ARCHIVE = b'PK\x05\x06' +END_CENTRAL_DIR_SIZE_64 = 56 +END_CENTRAL_DIR_LOCATOR_SIZE_64 = 20 +STRING_END_ARCHIVE = b'PK\x05\x06' # standard EOCD signature +STRING_END_LOCATOR_64 = b'PK\x06\x07' # Zip64 EOCD Locator signature +STRING_END_ZIP_64 = b'PK\x06\x06' # Zip64 EOCD signature MAX_COMMENT_LEN = (1 << 16) - 1 +MAX_UINT32 = 0xffffffff +ZIP64_EXTRA_TAG = 0x1 class zipimporter(_bootstrap_external._LoaderBasics): """zipimporter(archivepath) -> zipimporter object @@ -63,8 +67,7 @@ class zipimporter(_bootstrap_external._LoaderBasics): # if found, or else read it from the archive. def __init__(self, path): if not isinstance(path, str): - import os - path = os.fsdecode(path) + raise TypeError(f"expected str, not {type(path)!r}") if not path: raise ZipImportError('archive path is empty', path=path) if alt_path_sep: @@ -89,12 +92,8 @@ def __init__(self, path): raise ZipImportError('not a Zip file', path=path) break - try: - files = _zip_directory_cache[path] - except KeyError: - files = _read_directory(path) - _zip_directory_cache[path] = files - self._files = files + if path not in _zip_directory_cache: + _zip_directory_cache[path] = _read_directory(path) self.archive = path # a prefix directory following the ZIP file path. self.prefix = _bootstrap_external._path_join(*prefix[::-1]) @@ -102,64 +101,6 @@ def __init__(self, path): self.prefix += path_sep - # Check whether we can satisfy the import of the module named by - # 'fullname', or whether it could be a portion of a namespace - # package. Return self if we can load it, a string containing the - # full path if it's a possible namespace portion, None if we - # can't load it. - def find_loader(self, fullname, path=None): - """find_loader(fullname, path=None) -> self, str or None. - - Search for a module specified by 'fullname'. 'fullname' must be the - fully qualified (dotted) module name. It returns the zipimporter - instance itself if the module was found, a string containing the - full path name if it's possibly a portion of a namespace package, - or None otherwise. The optional 'path' argument is ignored -- it's - there for compatibility with the importer protocol. - - Deprecated since Python 3.10. Use find_spec() instead. - """ - _warnings.warn("zipimporter.find_loader() is deprecated and slated for " - "removal in Python 3.12; use find_spec() instead", - DeprecationWarning) - mi = _get_module_info(self, fullname) - if mi is not None: - # This is a module or package. - return self, [] - - # Not a module or regular package. See if this is a directory, and - # therefore possibly a portion of a namespace package. - - # We're only interested in the last path component of fullname - # earlier components are recorded in self.prefix. - modpath = _get_module_path(self, fullname) - if _is_dir(self, modpath): - # This is possibly a portion of a namespace - # package. Return the string representing its path, - # without a trailing separator. - return None, [f'{self.archive}{path_sep}{modpath}'] - - return None, [] - - - # Check whether we can satisfy the import of the module named by - # 'fullname'. Return self if we can, None if we can't. - def find_module(self, fullname, path=None): - """find_module(fullname, path=None) -> self or None. - - Search for a module specified by 'fullname'. 'fullname' must be the - fully qualified (dotted) module name. It returns the zipimporter - instance itself if the module was found, or None if it wasn't. - The optional 'path' argument is ignored -- it's there for compatibility - with the importer protocol. - - Deprecated since Python 3.10. Use find_spec() instead. - """ - _warnings.warn("zipimporter.find_module() is deprecated and slated for " - "removal in Python 3.12; use find_spec() instead", - DeprecationWarning) - return self.find_loader(fullname, path)[0] - def find_spec(self, fullname, target=None): """Create a ModuleSpec for the specified module. @@ -211,7 +152,7 @@ def get_data(self, pathname): key = pathname[len(self.archive + path_sep):] try: - toc_entry = self._files[key] + toc_entry = self._get_files()[key] except KeyError: raise OSError(0, '', key) return _get_data(self.archive, toc_entry) @@ -248,7 +189,7 @@ def get_source(self, fullname): fullpath = f'{path}.py' try: - toc_entry = self._files[fullpath] + toc_entry = self._get_files()[fullpath] except KeyError: # we have the module, but no source return None @@ -313,28 +254,28 @@ def load_module(self, fullname): def get_resource_reader(self, fullname): - """Return the ResourceReader for a package in a zip file. - - If 'fullname' is a package within the zip file, return the - 'ResourceReader' object for the package. Otherwise return None. - """ - try: - if not self.is_package(fullname): - return None - except ZipImportError: - return None + """Return the ResourceReader for a module in a zip file.""" from importlib.readers import ZipReader + return ZipReader(self, fullname) - def invalidate_caches(self): - """Reload the file data of the archive path.""" + def _get_files(self): + """Return the files within the archive path.""" try: - self._files = _read_directory(self.archive) - _zip_directory_cache[self.archive] = self._files - except ZipImportError: - _zip_directory_cache.pop(self.archive, None) - self._files = {} + files = _zip_directory_cache[self.archive] + except KeyError: + try: + files = _zip_directory_cache[self.archive] = _read_directory(self.archive) + except ZipImportError: + files = {} + + return files + + + def invalidate_caches(self): + """Invalidates the cache of file data of the archive path.""" + _zip_directory_cache.pop(self.archive, None) def __repr__(self): @@ -364,15 +305,15 @@ def _is_dir(self, path): # of a namespace package. We test by seeing if the name, with an # appended path separator, exists. dirpath = path + path_sep - # If dirpath is present in self._files, we have a directory. - return dirpath in self._files + # If dirpath is present in self._get_files(), we have a directory. + return dirpath in self._get_files() # Return some information about a module. def _get_module_info(self, fullname): path = _get_module_path(self, fullname) for suffix, isbytecode, ispackage in _zip_searchorder: fullpath = path + suffix - if fullpath in self._files: + if fullpath in self._get_files(): return ispackage return None @@ -406,16 +347,11 @@ def _read_directory(archive): raise ZipImportError(f"can't open Zip file: {archive!r}", path=archive) with fp: + # GH-87235: On macOS all file descriptors for /dev/fd/N share the same + # file offset, reset the file offset after scanning the zipfile directory + # to not cause problems when some runs 'python3 /dev/fd/9 9= 0 and pos64+END_CENTRAL_DIR_SIZE_64+END_CENTRAL_DIR_LOCATOR_SIZE_64==pos): + # Zip64 at "correct" offset from standard EOCD + buffer = data[pos64:pos64 + END_CENTRAL_DIR_SIZE_64] + if len(buffer) != END_CENTRAL_DIR_SIZE_64: + raise ZipImportError( + f"corrupt Zip64 file: Expected {END_CENTRAL_DIR_SIZE_64} byte " + f"zip64 central directory, but read {len(buffer)} bytes.", + path=archive) + header_position = file_size - len(data) + pos64 + + central_directory_size = _unpack_uint64(buffer[40:48]) + central_directory_position = _unpack_uint64(buffer[48:56]) + num_entries = _unpack_uint64(buffer[24:32]) + elif pos >= 0: + buffer = data[pos:pos+END_CENTRAL_DIR_SIZE] + if len(buffer) != END_CENTRAL_DIR_SIZE: + raise ZipImportError(f"corrupt Zip file: {archive!r}", + path=archive) + + header_position = file_size - len(data) + pos + + # Buffer now contains a valid EOCD, and header_position gives the + # starting position of it. + central_directory_size = _unpack_uint32(buffer[12:16]) + central_directory_position = _unpack_uint32(buffer[16:20]) + num_entries = _unpack_uint16(buffer[8:10]) + + # N.b. if someday you want to prefer the standard (non-zip64) EOCD, + # you need to adjust position by 76 for arc to be 0. + else: raise ZipImportError(f'not a Zip file: {archive!r}', path=archive) - buffer = data[pos:pos+END_CENTRAL_DIR_SIZE] - if len(buffer) != END_CENTRAL_DIR_SIZE: - raise ZipImportError(f"corrupt Zip file: {archive!r}", - path=archive) - header_position = file_size - len(data) + pos - - header_size = _unpack_uint32(buffer[12:16]) - header_offset = _unpack_uint32(buffer[16:20]) - if header_position < header_size: - raise ZipImportError(f'bad central directory size: {archive!r}', path=archive) - if header_position < header_offset: - raise ZipImportError(f'bad central directory offset: {archive!r}', path=archive) - header_position -= header_size - arc_offset = header_position - header_offset - if arc_offset < 0: - raise ZipImportError(f'bad central directory size or offset: {archive!r}', path=archive) - - files = {} - # Start of Central Directory - count = 0 - try: - fp.seek(header_position) - except OSError: - raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive) - while True: - buffer = fp.read(46) - if len(buffer) < 4: - raise EOFError('EOF read where not expected') - # Start of file header - if buffer[:4] != b'PK\x01\x02': - break # Bad: Central Dir File Header - if len(buffer) != 46: - raise EOFError('EOF read where not expected') - flags = _unpack_uint16(buffer[8:10]) - compress = _unpack_uint16(buffer[10:12]) - time = _unpack_uint16(buffer[12:14]) - date = _unpack_uint16(buffer[14:16]) - crc = _unpack_uint32(buffer[16:20]) - data_size = _unpack_uint32(buffer[20:24]) - file_size = _unpack_uint32(buffer[24:28]) - name_size = _unpack_uint16(buffer[28:30]) - extra_size = _unpack_uint16(buffer[30:32]) - comment_size = _unpack_uint16(buffer[32:34]) - file_offset = _unpack_uint32(buffer[42:46]) - header_size = name_size + extra_size + comment_size - if file_offset > header_offset: - raise ZipImportError(f'bad local header offset: {archive!r}', path=archive) - file_offset += arc_offset + # Buffer now contains a valid EOCD, and header_position gives the + # starting position of it. + # XXX: These are cursory checks but are not as exact or strict as they + # could be. Checking the arc-adjusted value is probably good too. + if header_position < central_directory_size: + raise ZipImportError(f'bad central directory size: {archive!r}', path=archive) + if header_position < central_directory_position: + raise ZipImportError(f'bad central directory offset: {archive!r}', path=archive) + header_position -= central_directory_size + # On just-a-zipfile these values are the same and arc_offset is zero; if + # the file has some bytes prepended, `arc_offset` is the number of such + # bytes. This is used for pex as well as self-extracting .exe. + arc_offset = header_position - central_directory_position + if arc_offset < 0: + raise ZipImportError(f'bad central directory size or offset: {archive!r}', path=archive) + + files = {} + # Start of Central Directory + count = 0 try: - name = fp.read(name_size) - except OSError: - raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive) - if len(name) != name_size: - raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive) - # On Windows, calling fseek to skip over the fields we don't use is - # slower than reading the data because fseek flushes stdio's - # internal buffers. See issue #8745. - try: - if len(fp.read(header_size - name_size)) != header_size - name_size: - raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive) + fp.seek(header_position) except OSError: raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive) + while True: + buffer = fp.read(46) + if len(buffer) < 4: + raise EOFError('EOF read where not expected') + # Start of file header + if buffer[:4] != b'PK\x01\x02': + if count != num_entries: + raise ZipImportError( + f"mismatched num_entries: {count} should be {num_entries} in {archive!r}", + path=archive, + ) + break # Bad: Central Dir File Header + if len(buffer) != 46: + raise EOFError('EOF read where not expected') + flags = _unpack_uint16(buffer[8:10]) + compress = _unpack_uint16(buffer[10:12]) + time = _unpack_uint16(buffer[12:14]) + date = _unpack_uint16(buffer[14:16]) + crc = _unpack_uint32(buffer[16:20]) + data_size = _unpack_uint32(buffer[20:24]) + file_size = _unpack_uint32(buffer[24:28]) + name_size = _unpack_uint16(buffer[28:30]) + extra_size = _unpack_uint16(buffer[30:32]) + comment_size = _unpack_uint16(buffer[32:34]) + file_offset = _unpack_uint32(buffer[42:46]) + header_size = name_size + extra_size + comment_size - if flags & 0x800: - # UTF-8 file names extension - name = name.decode() - else: - # Historical ZIP filename encoding try: - name = name.decode('ascii') - except UnicodeDecodeError: - name = name.decode('latin1').translate(cp437_table) - - name = name.replace('/', path_sep) - path = _bootstrap_external._path_join(archive, name) - t = (path, compress, data_size, file_size, file_offset, time, date, crc) - files[name] = t - count += 1 + name = fp.read(name_size) + except OSError: + raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive) + if len(name) != name_size: + raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive) + # On Windows, calling fseek to skip over the fields we don't use is + # slower than reading the data because fseek flushes stdio's + # internal buffers. See issue #8745. + try: + extra_data_len = header_size - name_size + extra_data = memoryview(fp.read(extra_data_len)) + + if len(extra_data) != extra_data_len: + raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive) + except OSError: + raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive) + + if flags & 0x800: + # UTF-8 file names extension + name = name.decode() + else: + # Historical ZIP filename encoding + try: + name = name.decode('ascii') + except UnicodeDecodeError: + name = name.decode('latin1').translate(cp437_table) + + name = name.replace('/', path_sep) + path = _bootstrap_external._path_join(archive, name) + + # Ordering matches unpacking below. + if ( + file_size == MAX_UINT32 or + data_size == MAX_UINT32 or + file_offset == MAX_UINT32 + ): + # need to decode extra_data looking for a zip64 extra (which might not + # be present) + while extra_data: + if len(extra_data) < 4: + raise ZipImportError(f"can't read header extra: {archive!r}", path=archive) + tag = _unpack_uint16(extra_data[:2]) + size = _unpack_uint16(extra_data[2:4]) + if len(extra_data) < 4 + size: + raise ZipImportError(f"can't read header extra: {archive!r}", path=archive) + if tag == ZIP64_EXTRA_TAG: + if (len(extra_data) - 4) % 8 != 0: + raise ZipImportError(f"can't read header extra: {archive!r}", path=archive) + num_extra_values = (len(extra_data) - 4) // 8 + if num_extra_values > 3: + raise ZipImportError(f"can't read header extra: {archive!r}", path=archive) + import struct + values = list(struct.unpack_from(f"<{min(num_extra_values, 3)}Q", + extra_data, offset=4)) + + # N.b. Here be dragons: the ordering of these is different than + # the header fields, and it's really easy to get it wrong since + # naturally-occuring zips that use all 3 are >4GB + if file_size == MAX_UINT32: + file_size = values.pop(0) + if data_size == MAX_UINT32: + data_size = values.pop(0) + if file_offset == MAX_UINT32: + file_offset = values.pop(0) + + break + + # For a typical zip, this bytes-slicing only happens 2-3 times, on + # small data like timestamps and filesizes. + extra_data = extra_data[4+size:] + else: + _bootstrap._verbose_message( + "zipimport: suspected zip64 but no zip64 extra for {!r}", + path, + ) + # XXX These two statements seem swapped because `central_directory_position` + # is a position within the actual file, but `file_offset` (when compared) is + # as encoded in the entry, not adjusted for this file. + # N.b. this must be after we've potentially read the zip64 extra which can + # change `file_offset`. + if file_offset > central_directory_position: + raise ZipImportError(f'bad local header offset: {archive!r}', path=archive) + file_offset += arc_offset + + t = (path, compress, data_size, file_size, file_offset, time, date, crc) + files[name] = t + count += 1 + finally: + fp.seek(start_offset) _bootstrap._verbose_message('zipimport: found {} names in {!r}', count, archive) return files @@ -708,7 +739,7 @@ def _get_mtime_and_size_of_source(self, path): # strip 'c' or 'o' from *.py[co] assert path[-1:] in ('c', 'o') path = path[:-1] - toc_entry = self._files[path] + toc_entry = self._get_files()[path] # fetch the time stamp of the .py file for comparison # with an embedded pyc time stamp time = toc_entry[5] @@ -728,7 +759,7 @@ def _get_pyc_source(self, path): path = path[:-1] try: - toc_entry = self._files[path] + toc_entry = self._get_files()[path] except KeyError: return None else: @@ -744,7 +775,7 @@ def _get_module_code(self, fullname): fullpath = path + suffix _bootstrap._verbose_message('trying {}{}{}', self.archive, path_sep, fullpath, verbosity=2) try: - toc_entry = self._files[fullpath] + toc_entry = self._get_files()[fullpath] except KeyError: pass else: