@cmpute
Created November 13, 2020 00:22
Patch Python built-in ZipFile class for faster file reading
"""
This moduled provide patched version of builtin Zipfile class as in https://github.com/ThomasPinna/python_zipfile_improvement
You can have better speed when read several files from a zip file containing a large number of files
Modifications are marked with '===== PATCH ====='
"""
import io
import struct
from zipfile import *
from zipfile import (_CD_COMMENT_LENGTH, _CD_EXTRA_FIELD_LENGTH,
                     _CD_FILENAME_LENGTH, _CD_LOCAL_HEADER_OFFSET,
                     _CD_SIGNATURE, _ECD_COMMENT, _ECD_LOCATION, _ECD_OFFSET,
                     _ECD_SIGNATURE, _ECD_SIZE, _EndRecData, sizeCentralDir,
                     sizeEndCentDir64, sizeEndCentDir64Locator,
                     stringCentralDir, stringEndArchive64, structCentralDir,
                     MAX_EXTRACT_VERSION)

__all__ = ["PatchedZipFile"]


class PatchedZipFile(ZipFile):
    def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True,
                 to_extract=[]):
        # Only the members listed in `to_extract` get their central directory
        # entries parsed; all other entries are skipped over.
        self.to_extract = set(to_extract)
        super().__init__(file=file, mode=mode,
                         compression=compression,
                         allowZip64=allowZip64)
    # patched implementation to reduce open time
    def _RealGetContents(self):
        """Read in the table of contents for the ZIP file."""
        fp = self.fp
        try:
            endrec = _EndRecData(fp)
        except OSError:
            raise BadZipFile("File is not a zip file")
        if not endrec:
            raise BadZipFile("File is not a zip file")
        if self.debug > 1:
            print(endrec)
        size_cd = endrec[_ECD_SIZE]             # bytes in central directory
        offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
        self._comment = endrec[_ECD_COMMENT]    # archive comment

        # "concat" is zero, unless zip was concatenated to another file
        concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
        if endrec[_ECD_SIGNATURE] == stringEndArchive64:
            # If Zip64 extension structures are present, account for them
            concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)

        if self.debug > 2:
            inferred = concat + offset_cd
            print("given, inferred, offset", offset_cd, inferred, concat)
        # self.start_dir: Position of start of central directory
        self.start_dir = offset_cd + concat
        fp.seek(self.start_dir, 0)
        data = fp.read(size_cd)
        fp = io.BytesIO(data)
        total = 0
        while total < size_cd:
            centdir = fp.read(sizeCentralDir)
            if len(centdir) != sizeCentralDir:
                raise BadZipFile("Truncated central directory")
            centdir = struct.unpack(structCentralDir, centdir)
            if centdir[_CD_SIGNATURE] != stringCentralDir:
                raise BadZipFile("Bad magic number for central directory")
            if self.debug > 2:
                print(centdir)
            filename = fp.read(centdir[_CD_FILENAME_LENGTH])
            flags = centdir[5]
            if flags & 0x800:
                # UTF-8 file names extension
                filename = filename.decode('utf-8')
            else:
                # Historical ZIP filename encoding
                filename = filename.decode('cp437')
            # ===== PATCH =====
            # Skip entries that were not requested, but keep `total` in sync so
            # the loop still terminates if a requested name is missing.
            if filename not in self.to_extract:
                fp.seek(centdir[_CD_EXTRA_FIELD_LENGTH] + centdir[_CD_COMMENT_LENGTH], 1)
                total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
                         + centdir[_CD_EXTRA_FIELD_LENGTH]
                         + centdir[_CD_COMMENT_LENGTH])
                continue
            # =================
            # Create ZipInfo instance to store file information
            x = ZipInfo(filename)
            x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
            x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
            x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
            (x.create_version, x.create_system, x.extract_version, x.reserved,
             x.flag_bits, x.compress_type, t, d,
             x.CRC, x.compress_size, x.file_size) = centdir[1:12]
            if x.extract_version > MAX_EXTRACT_VERSION:
                raise NotImplementedError("zip file version %.1f" %
                                          (x.extract_version / 10))
            x.volume, x.internal_attr, x.external_attr = centdir[15:18]
            # Convert date/time code to (year, month, day, hour, min, sec)
            x._raw_time = t
            x.date_time = ((d>>9)+1980, (d>>5)&0xF, d&0x1F,
                           t>>11, (t>>5)&0x3F, (t&0x1F) * 2)
            x._decodeExtra()
            x.header_offset = x.header_offset + concat
            self.filelist.append(x)
            self.NameToInfo[x.filename] = x

            # update total bytes read from central directory
            total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
                     + centdir[_CD_EXTRA_FIELD_LENGTH]
                     + centdir[_CD_COMMENT_LENGTH])

            if self.debug > 2:
                print("total", total)
            # ===== PATCH =====
            # Stop scanning as soon as every requested entry has been found.
            self.to_extract.remove(filename)
            if not self.to_extract:
                break
            # =================
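
# Note on the patch (informal, based on the ZIP application note): every central
# directory record is a fixed-size header of `sizeCentralDir` bytes followed by
# three variable-length fields whose lengths are stored in that header:
#
#     [fixed header][file name][extra field][comment]
#
# After the file name has been read, seeking forward by
# extra_field_length + comment_length lands exactly at the start of the next
# record, so unwanted entries can be skipped without decoding their metadata or
# constructing ZipInfo objects for them.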

# benchmark: compare the built-in ZipFile with PatchedZipFile when reading a
# single member from an archive with many entries
if __name__ == "__main__":
    test_zip = "/mnt/storage8t/datasets/KITTI-360/2013_05_28_drive_0000_sync_velodyne.zip"
    test_file = "2013_05_28_drive_0000_sync/velodyne_points/data/0000000001.bin"

    import hashlib
    import time

    tstart = time.time()
    with ZipFile(test_zip) as z:
        data1 = z.read(test_file)
    print("Built-in read time:", time.time() - tstart)

    tstart = time.time()
    with PatchedZipFile(test_zip, to_extract=[test_file]) as z:
        data2 = z.read(test_file)
    print("Patched read time:", time.time() - tstart)

    # both readers must return exactly the same bytes
    assert hashlib.md5(data1).hexdigest() == hashlib.md5(data2).hexdigest()
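
    # Multi-member usage (sketch): pass every name you plan to read via
    # `to_extract`; names that were not listed there cannot be read later.
    wanted = [test_file]  # extend with additional member names as needed
    with PatchedZipFile(test_zip, to_extract=wanted) as z:
        blobs = {name: z.read(name) for name in wanted}
    assert blobs[test_file] == data2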