Skip to content

Instantly share code, notes, and snippets.

@paulwinex
Created June 17, 2021 08:34
Show Gist options
  • Save paulwinex/86e44b9411fbf8aee583be8d5b6a43bb to your computer and use it in GitHub Desktop.
Save paulwinex/86e44b9411fbf8aee583be8d5b6a43bb to your computer and use it in GitHub Desktop.
import tarfile
import hashlib
import timeit
import os
from pathlib import Path
# Create five test files, each filled with 10**7 random bytes.
files_to_archive = []
for i in range(5):
    name = f'example_file{i}.txt'
    # Path.write_bytes opens, writes and closes the file, so no handle is
    # left for the garbage collector to clean up.
    Path(name).write_bytes(os.urandom(10**7))
    files_to_archive.append(name)
# TAR ##########################################################
def create_tar(archive_path, files):
    """Write a gzip-compressed tar archive containing the given files.

    Args:
        archive_path: Destination path of the archive to create.
        files: Iterable of paths; each one is passed to ``TarFile.add``
            with its default options.
    """
    with tarfile.open(archive_path, mode='w:gz') as archive:
        for source_path in files:
            archive.add(source_path)
# Build two archives from the same set of input files.
create_tar('archive1.tar.gz', files_to_archive)
create_tar('archive2.tar.gz', files_to_archive)
# MD5 of the raw archive bytes. Path.read_bytes() closes the file handle,
# which a bare open(...).read() does not do explicitly.
# NOTE(review): the two recorded digests differ although the inputs are the
# same - presumably because the gzip header embeds a timestamp; confirm.
hashlib.md5(Path("archive1.tar.gz").read_bytes()).hexdigest()
# '3c1b723560fb747028528bd7c2b4f92e'
hashlib.md5(Path("archive2.tar.gz").read_bytes()).hexdigest()
# '965a723cf0ed3b30e8bb4c47acde9999'
def get_hash_tar(path):
    """Return an MD5 hex digest of the file data stored in a tar archive.

    Only the extracted bytes of the members are hashed, in archive order;
    member names and other header metadata do not contribute.

    Args:
        path: Path to a tar archive readable by ``tarfile.open``.

    Returns:
        The hexadecimal MD5 digest as a string.
    """
    hsum = hashlib.md5()
    with tarfile.open(path) as tar:
        for member in tar.getmembers():
            stream = tar.extractfile(member)
            if stream is None:
                # extractfile() returns None for members that carry no
                # data (e.g. directories); calling .read() on it would
                # raise AttributeError, so skip them.
                continue
            with stream:
                # Feed the digest in 1 MiB chunks so a large member is
                # never held in memory as a whole. The resulting digest is
                # the same as hashing the member in one call.
                for chunk in iter(lambda: stream.read(1 << 20), b''):
                    hsum.update(chunk)
    return hsum.hexdigest()
# Compare the archives by the MD5 of their extracted member bytes rather
# than by the MD5 of the raw .tar.gz files.
get_hash_tar('archive1.tar.gz') == get_hash_tar('archive2.tar.gz')
# True
def get_hash_tar2(path):
    """Return an MD5 hex digest built from each member's ``chksum`` field.

    No member data is extracted; the ``chksum`` attribute of every
    ``TarInfo`` is serialised as 8 big-endian bytes and hashed in archive
    order.

    NOTE(review): tarfile documents ``TarInfo.chksum`` as the *header*
    checksum, so this presumably tracks header fields rather than file
    contents - confirm before relying on it to detect content changes.

    Args:
        path: Path to a tar archive readable by ``tarfile.open``.

    Returns:
        The hexadecimal MD5 digest as a string.
    """
    digest = hashlib.md5()
    with tarfile.open(path) as archive:
        header_sums = [member.chksum for member in archive.getmembers()]
    digest.update(
        b''.join(value.to_bytes(8, byteorder='big') for value in header_sums)
    )
    return digest.hexdigest()
# Compare the archives using the hash built from the members' chksum fields.
get_hash_tar2('archive1.tar.gz') == get_hash_tar2('archive2.tar.gz')
# True
# Benchmark each approach on the same archive. The number recorded under each
# call is the value timeit.timeit returned for number=100 executions.
# 1) MD5 of the raw archive bytes.
timeit.timeit("hashlib.md5(open('archive1.tar.gz', 'rb').read()).hexdigest()", number=100, globals=globals())
# 7.2
# 2) MD5 of the extracted member data. The timed statement must call
#    get_hash_tar, the name the function is defined under; 'get_hash_tar1'
#    does not exist and raises NameError.
timeit.timeit("get_hash_tar('archive1.tar.gz')", number=100, globals=globals())
# 29.8
# 3) MD5 of the members' chksum fields.
timeit.timeit("get_hash_tar2('archive1.tar.gz')", number=100, globals=globals())
# 10.0
# ZIP ##########################################################
import zipfile
def create_zip(archive_path, files):
    """Write a ZIP archive containing the given files.

    Args:
        archive_path: Destination path of the archive to create.
        files: Iterable of paths; each one is passed to ``ZipFile.write``
            with its default options.
    """
    with zipfile.ZipFile(archive_path, mode="w") as archive:
        for source_path in files:
            archive.write(source_path)
# Build the first archive, then touch every input file before building the
# second one, so the two archives are created from files with the same
# contents but refreshed timestamps.
create_zip('archive1.zip', files_to_archive)
for f in files_to_archive:
    Path(f).touch()
create_zip('archive2.zip', files_to_archive)
# MD5 of the raw archive bytes. Path.read_bytes() closes the file handle,
# which a bare open(...).read() does not do explicitly.
hashlib.md5(Path("archive1.zip").read_bytes()).hexdigest()
# 'aa508dbba4e223abe45e16dba4ad6e1f'
hashlib.md5(Path("archive2.zip").read_bytes()).hexdigest()
# '4891787b159228626da7064ce216ff76'
def get_hash_zip(path):
    """Return an MD5 hex digest of the decompressed data in a ZIP archive.

    Every name reported by ``namelist()`` is read in order and its bytes
    are fed into a single MD5; names and other metadata do not contribute.

    Args:
        path: Path to the ZIP archive.

    Returns:
        The hexadecimal MD5 digest as a string.
    """
    digest = hashlib.md5()
    with zipfile.ZipFile(path, "r") as archive:
        for member_name in archive.namelist():
            # ZipFile.read opens the member, reads it fully and closes it.
            digest.update(archive.read(member_name))
    return digest.hexdigest()
# Compare the archives by the MD5 of their decompressed member bytes rather
# than by the MD5 of the raw .zip files.
get_hash_zip('archive1.zip') == get_hash_zip('archive2.zip')
# True
def get_hash_zip2(path):
    """Return an MD5 hex digest built from the CRC of every ZIP entry.

    No member data is decompressed; the ``CRC`` recorded for each entry in
    ``infolist()`` is serialised as 8 big-endian bytes and hashed in
    archive order.

    Args:
        path: Path to the ZIP archive.

    Returns:
        The hexadecimal MD5 digest as a string.
    """
    h = hashlib.md5()
    # Open the archive in a context manager so the underlying file handle
    # is closed when the function returns instead of being left open.
    with zipfile.ZipFile(path) as archive:
        for info in archive.infolist():
            h.update(info.CRC.to_bytes(8, byteorder='big'))
    return h.hexdigest()
# Compare the archives using the hash built from the entries' CRC values.
get_hash_zip2('archive1.zip') == get_hash_zip2('archive2.zip')
# time it
# Each call below runs its statement 100 times; the number recorded under it
# is the value timeit.timeit returned.
# 1) MD5 of the raw archive bytes.
timeit.timeit("hashlib.md5(open('archive1.zip', 'rb').read()).hexdigest()", number=100, globals=globals())
# 10.6
# 2) MD5 of the decompressed member data.
timeit.timeit("get_hash_zip('archive1.zip')", number=100, globals=globals())
# 12.7
# 3) MD5 of the entries' CRC values (no member data is read).
timeit.timeit("get_hash_zip2('archive1.zip')", number=100, globals=globals())
# 0.01
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment