Created
November 16, 2015 10:29
-
-
Save sirex/494e0afc00969edd05c9 to your computer and use it in GitHub Desktop.
Script to check SHA1 collisions.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Script to check SHA1 collisions. | |
Test 100,000,000 hashes:: | |
> python sha1test.py sha1test.data 100000000 | |
Number of hashes to check : 100,000,000 | |
Estimated disk space needed: 14,700,000,000 bytes. | |
generate: 100%|##################| 100000000/100000000 [06:05<00:00, 273917.86it/s] | |
sort... done in 723.8038 seconds. | |
search... done in 73.4733 seconds. | |
Number of collisions found: 0 | |
> du -sh sha1test.data* | |
4,7G sha1test.data | |
0 sha1test.data.collisions | |
4,7G sha1test.data.sort | |
""" | |
import argparse | |
import hashlib | |
import math | |
import os | |
import pathlib | |
import subprocess | |
import sys | |
import time | |
import tqdm | |
def main():
    """Generate SHA1 hashes of consecutive integers and search them for collisions.

    Command line arguments:
        output: Path of the file that receives one ``<sha1 hex>:<number>`` line
            per generated hash. Two more files are written next to it:
            ``<output>.sort`` (the sorted copy) and ``<output>.collisions``
            (lines whose first 40 characters occur more than once).
        count: Number of hashes to generate; the integers ``0 .. count - 1``
            are hashed.

    The heavy lifting is delegated to the external ``sort``, ``uniq``, ``wc``
    and ``head`` programs, invoked with GNU-style long options.

    Exits with status 2 when ``count`` is not a positive integer and with
    status 1 when the estimated disk usage exceeds the free space of the
    directory that holds ``output``.

    Raises:
        subprocess.CalledProcessError: If one of the external programs fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('output', help='Path to a file where all generated hashes will be stored.')
    parser.add_argument('count', type=int, help='Number of hashes to generate.')
    args = parser.parse_args()
    if args.count < 1:
        # Nothing can be generated (and the size estimate is meaningless)
        # for zero or negative counts, so reject them up front.
        parser.error('count must be a positive integer.')

    input_ = pathlib.Path(args.output)
    output = pathlib.Path(args.output + '.sort')
    result = pathlib.Path(args.output + '.collisions')

    # Upper bound for one line: 40 hex digits + ':' + the longest number
    # written (count - 1) + '\n'.
    line_size = 40 + 1 + len(str(args.count - 1)) + 1
    expected_size = line_size * args.count * 3  # 3 for input, output and sort

    # f_frsize is the unit f_bavail is expressed in, and f_bavail excludes the
    # blocks reserved for the superuser. Keep 10 MiB of headroom instead of
    # counting it as usable space.
    vfs = os.statvfs(str(input_.parent))
    available_disk_size = max(vfs.f_frsize * vfs.f_bavail - 10 * 1024**2, 0)

    if expected_size > available_disk_size:
        print("Not enough disk space. Required: {:,}, available {:,}.".format(expected_size, available_disk_size))
        sys.exit(1)
    else:
        print('Number of hashes to check : {:,}'.format(args.count))
        print('Estimated disk space needed: {:,} bytes.'.format(expected_size))

    print()

    with input_.open('w') as f:
        for i in tqdm.tqdm(range(args.count), 'generate', total=args.count, leave=True):
            h = hashlib.sha1(str(i).encode()).hexdigest()
            f.write('%s:%s\n' % (h, i))

    # Compare plain bytes in both tools: sort and uniq then agree on what
    # "equal" means regardless of the user's locale.
    tool_env = dict(os.environ, LC_ALL='C')

    print('sort... ', end='', flush=True)
    start = time.time()
    subprocess.check_call(
        [
            'sort',
            '--temporary-directory=%s' % input_.parent,
            '--output=%s' % output,
            '--',  # paths starting with '-' must not be parsed as options
            str(input_),
        ],
        env=tool_env,
    )
    print('done in %.4f seconds.' % (time.time() - start))

    print('search... ', end='', flush=True)
    start = time.time()
    # Only the first 40 characters (the hash) take part in the comparison, so
    # a repeated line means two different numbers share one hash.
    subprocess.check_call(
        ['uniq', '--repeated', '--check-chars=40', '--', str(output), str(result)],
        env=tool_env,
    )
    print('done in %.4f seconds.' % (time.time() - start))

    print()

    collisions = subprocess.check_output(['wc', '-l', '--', str(result)]).decode().split()[0]
    print('Number of collisions found: %s' % collisions)

    print()

    # One constant drives both the announcement and the head call so the two
    # cannot drift apart.
    preview_lines = 100
    print('First %d lines from result file:' % preview_lines)
    subprocess.check_call(['head', '--lines=%d' % preview_lines, '--', str(result)])
# Run the collision check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment