# Source: GitHub gist paulwinex/86e44b9411fbf8aee583be8d5b6a43bb
# Created: June 17, 2021 08:34
#
# Compares ways of checking whether two archives (tar.gz and zip) hold the
# same data: hashing the raw archive file versus hashing what is stored
# inside the archive.
import hashlib
import os
import tarfile
import timeit
from pathlib import Path
# Create random test files in the current working directory.
# Each of the 5 files holds 10**7 random bytes; their names are collected in
# files_to_archive for the archive-building steps of this script.
files_to_archive = []
for i in range(5):
    name = f'example_file{i}.txt'
    # A context manager guarantees the handle is flushed and closed before
    # the file is later read back by the archivers.
    with open(name, 'wb') as fh:
        fh.write(os.urandom(10**7))
    files_to_archive.append(name)
# TAR ##########################################################
def create_tar(archive_path, files):
    """Create a gzip-compressed tar archive.

    Args:
        archive_path: Destination path of the archive; opened in 'w:gz'
            mode, so an existing file at that path is overwritten.
        files: Iterable of paths; each one is added with ``TarFile.add``
            under the name it was given as.

    Returns:
        None. The archive is written to ``archive_path``.
    """
    with tarfile.open(archive_path, 'w:gz') as tar:
        for file in files:
            tar.add(file)
# create test archives (same input files, archived twice)
create_tar('archive1.tar.gz', files_to_archive)
create_tar('archive2.tar.gz', files_to_archive)
# check sum for tar: hashing the raw archive bytes.
# The recorded digests differ although both archives were built from the
# same files. Path.read_bytes() closes the file after reading.
hashlib.md5(Path("archive1.tar.gz").read_bytes()).hexdigest()
# '3c1b723560fb747028528bd7c2b4f92e'
hashlib.md5(Path("archive2.tar.gz").read_bytes()).hexdigest()
# '965a723cf0ed3b30e8bb4c47acde9999'
def get_hash_tar(path):
    """Return the MD5 hex digest of the file data stored in a tar archive.

    The data of every member that has a data stream is fed, in archive
    order, into a single MD5 hash. Member names and other header fields
    are not part of the digest.

    Args:
        path: Path of the tar archive; compression is auto-detected by
            ``tarfile.open``.

    Returns:
        The hexadecimal MD5 digest as a string.
    """
    hsum = hashlib.md5()
    chunk_size = 1024 * 1024
    with tarfile.open(path) as tar:
        # Iterating the TarFile yields members in archive order and lets
        # each member's data be read as soon as its header has been seen.
        for member in tar:
            extracted = tar.extractfile(member)
            if extracted is None:
                # Directories and other non-file members carry no data;
                # calling .read() on None would raise AttributeError.
                continue
            with extracted:
                # Hash in bounded chunks rather than loading a whole member.
                while True:
                    chunk = extracted.read(chunk_size)
                    if not chunk:
                        break
                    hsum.update(chunk)
    return hsum.hexdigest()
# Content-based digests of the two tar archives compare equal.
get_hash_tar('archive1.tar.gz') == get_hash_tar('archive2.tar.gz')
# True
def get_hash_tar2(path):
    """Return an MD5 hex digest built from the members' header checksums.

    For each member, ``TarInfo.chksum`` is encoded as 8 big-endian bytes
    and fed into one MD5 hash, in ``getmembers()`` order. Member data is
    never extracted.

    NOTE(review): ``chksum`` is the tar *header* checksum. It appears to
    cover header fields (name, size, mtime, ...) rather than the file data,
    so this is a metadata fingerprint, not a content hash - confirm this is
    acceptable for the intended comparison.

    Args:
        path: Path of the tar archive; compression is auto-detected by
            ``tarfile.open``.

    Returns:
        The hexadecimal MD5 digest as a string.
    """
    hsum = hashlib.md5()
    with tarfile.open(path) as tar:
        for member in tar.getmembers():
            hsum.update(member.chksum.to_bytes(8, byteorder='big'))
    return hsum.hexdigest()
get_hash_tar2('archive1.tar.gz') == get_hash_tar2('archive2.tar.gz')
# True
# check time (seconds for 100 runs each, as recorded by the author)
# 1) MD5 of the raw archive file
timeit.timeit("hashlib.md5(open('archive1.tar.gz', 'rb').read()).hexdigest()", number=100, globals=globals())
# 7.2
# 2) MD5 of the extracted member data (the function is get_hash_tar;
#    there is no get_hash_tar1 in this script)
timeit.timeit("get_hash_tar('archive1.tar.gz')", number=100, globals=globals())
# 29.8
# 3) MD5 of the member header checksums
timeit.timeit("get_hash_tar2('archive1.tar.gz')", number=100, globals=globals())
# 10.0
# ZIP ##########################################################
import zipfile


def create_zip(archive_path, files):
    """Create a ZIP archive.

    Args:
        archive_path: Destination path of the archive; opened in "w" mode,
            so an existing file at that path is overwritten.
        files: Iterable of paths; each one is added with ``ZipFile.write``
            under the name it was given as. No compression argument is
            passed, so the ``zipfile`` default applies.

    Returns:
        None. The archive is written to ``archive_path``.
    """
    with zipfile.ZipFile(archive_path, "w") as zf:
        for file in files:
            zf.write(file)
create_zip('archive1.zip', files_to_archive)
# Touch the source files between the two runs: their contents stay the
# same, only the modification times change.
for f in files_to_archive:
    Path(f).touch()
create_zip('archive2.zip', files_to_archive)
# check sum: hashing the raw archive bytes gives different digests.
# Path.read_bytes() closes the file after reading.
hashlib.md5(Path("archive1.zip").read_bytes()).hexdigest()
# 'aa508dbba4e223abe45e16dba4ad6e1f'
hashlib.md5(Path("archive2.zip").read_bytes()).hexdigest()
# '4891787b159228626da7064ce216ff76'
def get_hash_zip(path):
    """Return the MD5 hex digest of the file data stored in a ZIP archive.

    Every name from ``namelist()`` is opened and its decompressed data is
    fed, in that order, into a single MD5 hash. Member names and
    timestamps are not part of the digest.

    Args:
        path: Path of the ZIP archive.

    Returns:
        The hexadecimal MD5 digest as a string.
    """
    hash_md5 = hashlib.md5()
    chunk_size = 1024 * 1024
    with zipfile.ZipFile(path, "r") as z:
        for f_name in z.namelist():
            with z.open(f_name) as f:
                # Hash in bounded chunks rather than loading a whole member;
                # the resulting digest is the same.
                while True:
                    chunk = f.read(chunk_size)
                    if not chunk:
                        break
                    hash_md5.update(chunk)
    return hash_md5.hexdigest()
# Content-based digests of the two ZIP archives compare equal.
get_hash_zip('archive1.zip') == get_hash_zip('archive2.zip')
# True
def get_hash_zip2(path):
    """Return an MD5 hex digest built from the members' stored CRC values.

    For each entry of ``infolist()``, ``ZipInfo.CRC`` is encoded as 8
    big-endian bytes and fed into one MD5 hash. Member data is never
    opened, so only the archive's metadata is read.

    NOTE(review): the digest is only as discriminating as the stored CRC
    values; member names are not included - confirm this is sufficient for
    the intended comparison.

    Args:
        path: Path of the ZIP archive.

    Returns:
        The hexadecimal MD5 digest as a string.
    """
    h = hashlib.md5()
    # The context manager closes the archive handle, which the bare
    # ZipFile(path).infolist() call left open.
    with zipfile.ZipFile(path) as zf:
        for info in zf.infolist():
            h.update(info.CRC.to_bytes(8, byteorder='big'))
    return h.hexdigest()
get_hash_zip2('archive1.zip') == get_hash_zip2('archive2.zip')
# time it (seconds for 100 runs each, as recorded by the author)
# 1) MD5 of the raw archive file
timeit.timeit("hashlib.md5(open('archive1.zip', 'rb').read()).hexdigest()", number=100, globals=globals())
# 10.6
# 2) MD5 of the member data
timeit.timeit("get_hash_zip('archive1.zip')", number=100, globals=globals())
# 12.7
# 3) MD5 of the stored CRC values
timeit.timeit("get_hash_zip2('archive1.zip')", number=100, globals=globals())
# 0.01
# (End of gist. The GitHub page footer with its sign-up / sign-in prompts is
# not part of the script.)