Skip to content

Instantly share code, notes, and snippets.

@tebeka
Created June 13, 2020 13:21
Show Gist options
  • Save tebeka/9affc689e9e8f40ed1fa4b2c9f9f60fb to your computer and use it in GitHub Desktop.
Save tebeka/9affc689e9e8f40ed1fa4b2c9f9f60fb to your computer and use it in GitHub Desktop.
Check sha256 of bzipped files
"""Checking sha256 signatures
Write a function that gets an index file with names of files and sha256
signatures in the following format
0c4ccc63a912bbd6d45174251415c089522e5c0e75286794ab1f86cb8e2561fd taxi-01.csv
f427b5880e9164ec1e6cda53aa4b2d1f1e470da973e5b51748c806ea5c57cbdf taxi-02.csv
4e251e9e98c5cb7be8b34adfcb46cc806a4ef5ec8c95ba9aac5ff81449fc630c taxi-03.csv
...
You should concurrently compute the sha256 signatures of these files and check
whether they match the ones in the index file.
- Print the number of processed files
- If there's a mismatch, print the offending file(s) and exit the program with
non-zero value
Get taxi.zip from the web site and open it. The index file is sha256sum.txt
"""
import bz2
from contextlib import contextmanager
from functools import partial
from hashlib import sha256
from time import monotonic
# Number of decompressed bytes requested from the stream per read call.
chunk_size = 10 * 1024


def file_sha256(path):
    """Return the hex sha256 digest of the decompressed content of *path*.

    The file is opened with ``bz2.open`` and consumed ``chunk_size`` bytes
    at a time, so the whole content is never held in memory at once.
    """
    digest = sha256()
    with bz2.open(path) as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                # An empty read marks the end of the stream.
                break
            digest.update(block)
    return digest.hexdigest()
def parse_sig_file(path):
    """Yield ``[signature, file name]`` pairs read from the index file *path*.

    Every non-blank line must hold a signature, whitespace, and a file name.
    Blank lines are skipped. The line is split only once, so a file name
    that itself contains spaces is kept intact.

    Raises:
        ValueError: if a non-blank line does not contain both fields.
    """
    with open(path) as fp:
        for lineno, line in enumerate(fp, start=1):
            line = line.strip()
            if not line:
                # Tolerate empty or whitespace-only lines (e.g. at file end).
                continue
            fields = line.split(None, 1)
            if len(fields) != 2:
                raise ValueError(
                    f'{path}:{lineno}: expected "<sha256> <file name>", '
                    f'got {line!r}'
                )
            yield fields
def serial(sigs):
    """Check every ``(signature, path)`` pair in *sigs* one after another.

    The path of each file whose computed sha256 differs from the expected
    signature is printed. Returns True when every file matched, otherwise
    False.
    """
    mismatches = 0
    for expected, filename in sigs:
        if file_sha256(filename) == expected:
            continue
        print(filename)
        mismatches += 1
    return mismatches == 0
@contextmanager
def timed(name):
    """Context manager that prints how long its ``with`` body took.

    The elapsed time is measured with ``monotonic`` and printed as
    ``<name>: <seconds>sec`` when the body finishes. The report is emitted
    from a ``finally`` clause, so it also appears when the body raises; the
    exception then propagates unchanged.
    """
    start = monotonic()
    try:
        yield
    finally:
        duration = monotonic() - start
        print(f'{name}: {duration:.3f}sec')
if __name__ == '__main__':
    # Presumably the directory where taxi.zip was extracted; adjust as needed.
    root = '/tmp/taxi'
    sigs = list(parse_sig_file(f'{root}/sha256sum.txt'))
    # The index lists bare file names; the files checked here carry a
    # ``.bz2`` suffix and live under ``root``.
    sigs = [(sig, f'{root}/{path}.bz2') for sig, path in sigs]
    with timed('serial'):
        ok = serial(sigs)
    print(f'{len(sigs)} files processed')
    if not ok:
        # Mismatching files were already printed by serial(); signal the
        # failure to the calling shell with a non-zero exit status.
        raise SystemExit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment