Created
June 13, 2020 13:21
-
-
Save tebeka/9affc689e9e8f40ed1fa4b2c9f9f60fb to your computer and use it in GitHub Desktop.
Check sha256 of bzipped files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Checking sha256 signatures

Write a function that gets an index file with names of files and sha256
signatures in the following format:

    0c4ccc63a912bbd6d45174251415c089522e5c0e75286794ab1f86cb8e2561fd taxi-01.csv
    f427b5880e9164ec1e6cda53aa4b2d1f1e470da973e5b51748c806ea5c57cbdf taxi-02.csv
    4e251e9e98c5cb7be8b34adfcb46cc806a4ef5ec8c95ba9aac5ff81449fc630c taxi-03.csv
    ...

You should concurrently compute the sha256 signatures of these files and see
if they match the ones in the index file.

- Print the number of processed files
- If there's a mismatch, print the offending file(s) and exit the program with
  a non-zero value

Get taxi.zip from the web site and open it. The index file is sha256sum.txt
"""
import bz2 | |
from contextlib import contextmanager | |
from functools import partial | |
from hashlib import sha256 | |
from time import monotonic | |
# Number of bytes requested from the decompressed stream per read; the file is
# hashed piece by piece so memory use does not grow with file size.
chunk_size = 10 * 1024


def file_sha256(path):
    """Return the hex SHA-256 digest of the decompressed content of *path*.

    *path* is opened with ``bz2.open`` in its default binary read mode and
    consumed in ``chunk_size``-byte reads, so the whole file is never held in
    memory at once.
    """
    h = sha256()
    with bz2.open(path) as fp:
        # iter(callable, sentinel) keeps calling fp.read(chunk_size) until it
        # returns b'', i.e. the end of the decompressed stream.
        for chunk in iter(partial(fp.read, chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()
def parse_sig_file(path):
    """Yield ``[signature, file_name]`` lists read from the index file *path*.

    Every non-blank line is expected to look like ``<signature> <file name>``.
    Blank (or whitespace-only) lines are skipped. A line is split only on the
    first run of whitespace, so a file name that contains spaces stays in one
    piece.

    This is a generator: the file is opened, and errors are raised, only once
    iteration starts.

    Raises:
        ValueError: a non-blank line holds a single field (no file name).
    """
    with open(path) as fp:
        for lineno, line in enumerate(fp, 1):
            # strip() first: with a maxsplit, split() would otherwise keep the
            # trailing newline attached to the file name.
            fields = line.strip().split(None, 1)
            if not fields:
                continue
            if len(fields) != 2:
                raise ValueError(
                    f'{path}:{lineno}: expected "<sha256> <file name>", '
                    f'got {line!r}'
                )
            yield fields
def serial(sigs):
    """Verify the files in *sigs* one after the other.

    Args:
        sigs: iterable of ``(expected_hex_digest, path)`` pairs.

    Returns:
        True when every file's digest (as computed by ``file_sha256``) equals
        its expected signature, False otherwise. An empty *sigs* returns True.

    The path of every mismatching file is printed to stdout. All files are
    checked even after a mismatch so that every offender is reported.
    """
    ok = True
    for sig, path in sigs:
        # hexdigest() is always lower-case; normalise the expected value so an
        # upper-case signature in the index is not reported as a mismatch.
        if file_sha256(path) != sig.lower():
            print(path)
            ok = False
    return ok
@contextmanager
def timed(name):
    """Context manager that prints how long its ``with`` body took.

    On exit it prints ``<name>: <seconds>sec`` with three decimals, measured
    with ``time.monotonic``. The line is printed even when the body raises;
    the exception then continues to propagate.
    """
    start = monotonic()
    try:
        yield
    finally:
        # Without try/finally an exception thrown into the generator at the
        # yield would skip the report entirely.
        duration = monotonic() - start
        print(f'{name}: {duration:.3f}sec')
if __name__ == '__main__':
    # Script entry point: verify every file listed in the index under `root`.
    root = '/tmp/taxi'
    # The index lists plain file names; the files hashed are their `.bz2`
    # counterparts in the same directory.
    sigs = [
        (sig, f'{root}/{path}.bz2')
        for sig, path in parse_sig_file(f'{root}/sha256sum.txt')
    ]
    with timed('serial'):
        ok = serial(sigs)
    print(f'{len(sigs)} files processed')
    if not ok:
        # Offending files were already printed by serial(); signal the
        # failure to the caller with a non-zero exit status.
        raise SystemExit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment