sirex · November 16, 2015 10:29
diff --git a/sha1check.py b/sha1check.py
 """
 Script to check SHA1 collisions.

 Test 100,000,000 hashes::

    > python sha1check.py sha1test.data 100000000
    Number of hashes to check  : 100,000,000
    Estimated disk space needed: 14,700,000,000 bytes.

    generate: 100%|##################| 100000000/100000000 [06:05<00:00, 273917.86it/s]
    sort...   done in 723.8038 seconds.
    search... done in 73.4733 seconds.

    Number of collisions found: 0

    > du -sh sha1test.data*
    4,7G sha1test.data
    0    sha1test.data.collisions
    4,7G sha1test.data.sort

 """

 import argparse
 import hashlib
 import math
 import os
 import pathlib
 import subprocess
 import sys
 import time
 import tqdm


 _HASH_HEX_LENGTH = 40  # number of characters in a SHA-1 hex digest
 _RESERVED_BYTES = 10 * 1024**2  # free space deliberately left untouched as a safety margin
 _PREVIEW_LINES = 100  # number of collision lines echoed at the end of a run


 def _estimate_disk_usage(count):
    """Return an upper bound, in bytes, on the disk space needed for ``count`` hashes.

    Every generated line is ``<sha1 hex>:<index>`` followed by a newline, and
    the widest index written is ``count - 1``.  The total is tripled because
    the unsorted file, the sorted file and the temporary files created by
    ``sort`` may all exist on disk at the same time.
    """
    widest_line = _HASH_HEX_LENGTH + len(':') + len(str(count - 1)) + len('\n')
    return widest_line * count * 3


 def main(argv=None):
    """Hash the integers ``0 .. count - 1`` with SHA-1 and report colliding digests.

    The hashes are written to the ``output`` file as ``<hash>:<index>`` lines,
    sorted with the external ``sort`` command into ``<output>.sort``, and
    duplicated digests are extracted with ``uniq`` into ``<output>.collisions``.

    Args:
        argv: Command line arguments without the program name.  ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: If the arguments are invalid (exit status 2) or there is
            not enough free disk space for the run (exit status 1).
        subprocess.CalledProcessError: If one of the external commands fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('output', help='Path to a file where all generated hashes will be stored.')
    parser.add_argument('count', type=int, help='Number of hashes to generate.')
    args = parser.parse_args(argv)

    # Reject counts that would produce no data before touching the disk.
    if args.count < 1:
        parser.error('count must be a positive integer, got %d' % args.count)

    input_ = pathlib.Path(args.output)
    output = pathlib.Path(args.output + '.sort')
    result = pathlib.Path(args.output + '.collisions')

    expected_size = _estimate_disk_usage(args.count)

    # f_bavail counts only blocks usable by unprivileged processes, and
    # f_frsize is the unit those block counts are expressed in.
    vfs = os.statvfs(str(input_.parent))
    available_disk_size = max(vfs.f_frsize * vfs.f_bavail - _RESERVED_BYTES, 0)

    if expected_size > available_disk_size:
        sys.exit('Not enough disk space. Required: {:,}, available {:,}.'.format(expected_size, available_disk_size))

    print('Number of hashes to check  : {:,}'.format(args.count))
    print('Estimated disk space needed: {:,} bytes.'.format(expected_size))
    print()

    with input_.open('w') as f:
        for i in tqdm.tqdm(range(args.count), 'generate', total=args.count, leave=True):
            h = hashlib.sha1(str(i).encode()).hexdigest()
            f.write('%s:%s\n' % (h, i))

    # The data is plain ASCII, so force byte-wise collation: the result does
    # not depend on the user's locale and sort avoids locale-aware comparisons.
    env = dict(os.environ, LC_ALL='C')

    print('sort...   ', end='', flush=True)
    start = time.time()
    subprocess.check_call([
        'sort',
        '--temporary-directory=%s' % input_.parent,
        '--output=%s' % output,
        str(input_),
    ], env=env)
    print('done in %.4f seconds.' % (time.time() - start))

    print('search... ', end='', flush=True)
    start = time.time()
    # Only the digest prefix is compared, so lines that share a hash but
    # differ in their index are reported as duplicates.
    subprocess.check_call([
        'uniq',
        '--repeated',
        '--check-chars=%d' % _HASH_HEX_LENGTH,
        str(output),
        str(result),
    ], env=env)
    print('done in %.4f seconds.' % (time.time() - start))

    print()
    print('Number of collisions found: %s' % subprocess.check_output(['wc', '-l', str(result)]).decode().split()[0])

    print()
    # Flush so the header is emitted before head writes to the same stdout.
    print('First %d lines from result file:' % _PREVIEW_LINES, flush=True)
    subprocess.check_call(['head', '--lines=%d' % _PREVIEW_LINES, str(result)])


 # Run the collision check only when executed as a script, not when imported.
 if __name__ == '__main__':
    main()
	"""
	Script to check SHA1 collisions.

	Test 100,000,000 hashes::

	> python sha1check.py sha1test.data 100000000
	Number of hashes to check : 100,000,000
	Estimated disk space needed: 14,700,000,000 bytes.

	generate: 100%|##################| 100000000/100000000 [06:05<00:00, 273917.86it/s]
	sort... done in 723.8038 seconds.
	search... done in 73.4733 seconds.

	Number of collisions found: 0

	> du -sh sha1test.data*
	4,7G sha1test.data
	0 sha1test.data.collisions
	4,7G sha1test.data.sort

	"""

	import argparse
	import hashlib
	import math
	import os
	import pathlib
	import subprocess
	import sys
	import time
	import tqdm


	_HASH_HEX_LENGTH = 40  # number of characters in a SHA-1 hex digest
	_RESERVED_BYTES = 10 * 1024**2  # free space deliberately left untouched as a safety margin
	_PREVIEW_LINES = 100  # number of collision lines echoed at the end of a run


	def _estimate_disk_usage(count):
	    """Return an upper bound, in bytes, on the disk space needed for ``count`` hashes.

	    Every generated line is ``<sha1 hex>:<index>`` followed by a newline, and
	    the widest index written is ``count - 1``.  The total is tripled because
	    the unsorted file, the sorted file and the temporary files created by
	    ``sort`` may all exist on disk at the same time.
	    """
	    widest_line = _HASH_HEX_LENGTH + len(':') + len(str(count - 1)) + len('\n')
	    return widest_line * count * 3


	def main(argv=None):
	    """Hash the integers ``0 .. count - 1`` with SHA-1 and report colliding digests.

	    The hashes are written to the ``output`` file as ``<hash>:<index>`` lines,
	    sorted with the external ``sort`` command into ``<output>.sort``, and
	    duplicated digests are extracted with ``uniq`` into ``<output>.collisions``.

	    Args:
	        argv: Command line arguments without the program name.  ``None``
	            (the default) makes argparse read ``sys.argv[1:]``.

	    Raises:
	        SystemExit: If the arguments are invalid (exit status 2) or there is
	            not enough free disk space for the run (exit status 1).
	        subprocess.CalledProcessError: If one of the external commands fails.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument('output', help='Path to a file where all generated hashes will be stored.')
	    parser.add_argument('count', type=int, help='Number of hashes to generate.')
	    args = parser.parse_args(argv)

	    # Reject counts that would produce no data before touching the disk.
	    if args.count < 1:
	        parser.error('count must be a positive integer, got %d' % args.count)

	    input_ = pathlib.Path(args.output)
	    output = pathlib.Path(args.output + '.sort')
	    result = pathlib.Path(args.output + '.collisions')

	    expected_size = _estimate_disk_usage(args.count)

	    # f_bavail counts only blocks usable by unprivileged processes, and
	    # f_frsize is the unit those block counts are expressed in.
	    vfs = os.statvfs(str(input_.parent))
	    available_disk_size = max(vfs.f_frsize * vfs.f_bavail - _RESERVED_BYTES, 0)

	    if expected_size > available_disk_size:
	        sys.exit('Not enough disk space. Required: {:,}, available {:,}.'.format(expected_size, available_disk_size))

	    print('Number of hashes to check  : {:,}'.format(args.count))
	    print('Estimated disk space needed: {:,} bytes.'.format(expected_size))
	    print()

	    with input_.open('w') as f:
	        for i in tqdm.tqdm(range(args.count), 'generate', total=args.count, leave=True):
	            h = hashlib.sha1(str(i).encode()).hexdigest()
	            f.write('%s:%s\n' % (h, i))

	    # The data is plain ASCII, so force byte-wise collation: the result does
	    # not depend on the user's locale and sort avoids locale-aware comparisons.
	    env = dict(os.environ, LC_ALL='C')

	    print('sort...   ', end='', flush=True)
	    start = time.time()
	    subprocess.check_call([
	        'sort',
	        '--temporary-directory=%s' % input_.parent,
	        '--output=%s' % output,
	        str(input_),
	    ], env=env)
	    print('done in %.4f seconds.' % (time.time() - start))

	    print('search... ', end='', flush=True)
	    start = time.time()
	    # Only the digest prefix is compared, so lines that share a hash but
	    # differ in their index are reported as duplicates.
	    subprocess.check_call([
	        'uniq',
	        '--repeated',
	        '--check-chars=%d' % _HASH_HEX_LENGTH,
	        str(output),
	        str(result),
	    ], env=env)
	    print('done in %.4f seconds.' % (time.time() - start))

	    print()
	    print('Number of collisions found: %s' % subprocess.check_output(['wc', '-l', str(result)]).decode().split()[0])

	    print()
	    # Flush so the header is emitted before head writes to the same stdout.
	    print('First %d lines from result file:' % _PREVIEW_LINES, flush=True)
	    subprocess.check_call(['head', '--lines=%d' % _PREVIEW_LINES, str(result)])


	# Run the collision check only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()