Skip to content

Instantly share code, notes, and snippets.

@sirex
Created November 16, 2015 10:29
Show Gist options
  • Save sirex/494e0afc00969edd05c9 to your computer and use it in GitHub Desktop.
Save sirex/494e0afc00969edd05c9 to your computer and use it in GitHub Desktop.
Script to check SHA1 collisions.
"""
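Script to check for SHA-1 collisions.

Hashes the decimal strings of 0 .. count-1, sorts the hashes on disk with
``sort`` and looks for repeated hashes with ``uniq``.

Test 100,000,000 hashes::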
> python sha1test.py sha1test.data 100000000
Number of hashes to check : 100,000,000
Estimated disk space needed: 14,700,000,000 bytes.
generate: 100%|##################| 100000000/100000000 [06:05<00:00, 273917.86it/s]
sort... done in 723.8038 seconds.
search... done in 73.4733 seconds.
Number of collisions found: 0
> du -sh sha1test.data*
4,7G sha1test.data
0 sha1test.data.collisions
4,7G sha1test.data.sort
"""
import argparse
import hashlib
import math
import os
import pathlib
import subprocess
import sys
import time
import tqdm
def main():
    """Generate SHA-1 hashes of 0 .. count-1 and report repeated hashes.

    Command-line arguments:
        output: path of the file that receives one ``<sha1-hex>:<number>``
            line per generated hash.  ``<output>.sort`` (sorted copy) and
            ``<output>.collisions`` (repeated hashes) are written next to it.
        count: number of hashes to generate; must be at least 1.

    The hashes are written to ``output``, sorted with the external ``sort``
    command and scanned for adjacent lines sharing the same 40-character
    hash with ``uniq --repeated``.

    Exits with status 1 when the file system holding ``output`` does not
    have enough free space, and with status 2 (via ``parser.error``) when
    ``count`` is not positive.

    Raises:
        subprocess.CalledProcessError: if ``sort``, ``uniq``, ``wc`` or
            ``head`` exits with a non-zero status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('output', help='Path to a file where all generated hashes will be stored.')
    parser.add_argument('count', type=int, help='Number of hashes to generate.')
    args = parser.parse_args()

    if args.count < 1:
        # Zero or negative counts used to crash in math.log(); reject them
        # with a regular usage error instead.
        parser.error('count must be a positive integer, got {}'.format(args.count))

    input_ = pathlib.Path(args.output)
    output = pathlib.Path(args.output + '.sort')
    result = pathlib.Path(args.output + '.collisions')

    # Longest possible line: 40 hex digits + ':' + the digits of the largest
    # number written (count - 1) + '\n'.  Counting digits on the integer
    # avoids floating point rounding in log10.
    hash_length = 40
    max_line_length = hash_length + 1 + len(str(args.count - 1)) + 1
    expected_size = max_line_length * args.count * 3  # 3 for input, output and sort
    reserve = 10 * 1024**2  # keep 10 MiB free on top of the estimate

    vfs = os.statvfs(str(input_.parent))
    # f_bavail counts blocks available to unprivileged processes and is
    # expressed in units of f_frsize (not f_bsize).
    available_disk_size = vfs.f_frsize * vfs.f_bavail
    if expected_size + reserve > available_disk_size:
        print(
            "Not enough disk space. Required: {:,}, available {:,}.".format(expected_size, available_disk_size),
            file=sys.stderr,
        )
        sys.exit(1)

    print('Number of hashes to check : {:,}'.format(args.count))
    print('Estimated disk space needed: {:,} bytes.'.format(expected_size))
    print()

    with input_.open('w') as f:
        for i in tqdm.tqdm(range(args.count), 'generate', total=args.count, leave=True):
            h = hashlib.sha1(str(i).encode()).hexdigest()
            f.write('%s:%s\n' % (h, i))

    # Force byte-wise collation so that the result does not depend on the
    # user's locale and sort and uniq compare lines the same way.
    env = dict(os.environ, LC_ALL='C')

    print('sort... ', end='', flush=True)
    start = time.monotonic()
    subprocess.check_call(
        ['sort', str(input_), '--temporary-directory=%s' % input_.parent, '--output=%s' % output],
        env=env,
    )
    print('done in %.4f seconds.' % (time.monotonic() - start))

    print('search... ', end='', flush=True)
    start = time.monotonic()
    # After sorting, equal hashes are adjacent; uniq keeps one line for each
    # group of lines whose first 40 characters (the hash) are identical.
    subprocess.check_call(
        ['uniq', '--repeated', '--check-chars=%d' % hash_length, str(output), str(result)],
        env=env,
    )
    print('done in %.4f seconds.' % (time.monotonic() - start))

    print()
    collisions = subprocess.check_output(['wc', '-l', str(result)]).decode().split()[0]
    print('Number of collisions found: %s' % collisions)
    print()

    preview_lines = 100
    # Flush before head writes to the same stdout, otherwise the header can
    # appear after the preview when stdout is not a terminal.
    print('First %d lines from result file:' % preview_lines, flush=True)
    subprocess.check_call(['head', '--lines=%d' % preview_lines, str(result)])
# Run the collision check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment