Skip to content

Instantly share code, notes, and snippets.

@mjordan
Last active August 29, 2015 14:07
Show Gist options
  • Save mjordan/4513687555ac54ee4b10 to your computer and use it in GitHub Desktop.
Save mjordan/4513687555ac54ee4b10 to your computer and use it in GitHub Desktop.
Walks a directory and generates checksums
import os
# The algorithm name is passed to hashlib.new(), e.g. md5, sha1, sha256, or sha512.
import hashlib
# Root directory whose files are checksummed when this script runs.
start_dir = '/home/mark/Pictures'
def generate_checksum_for_file(source_file_path, algorithm='md5'):
    """Generate and return a checksum for the file at source_file_path
    using the specified algorithm.

    source_file_path -- path of the file to hash; it is opened in
        binary mode.
    algorithm -- algorithm name handed to hashlib.new(); defaults
        to 'md5'.

    Returns the digest as a hexadecimal string. Errors raised while
    opening or reading the file, or by hashlib.new() for an unknown
    algorithm name, propagate to the caller.
    """
    # Read the file in 10 mb chunks to keep memory usage low.
    chunk_size = 10 * 1024 * 1024
    # The with-block closes the file even if hashing or reading raises.
    with open(source_file_path, 'rb') as source_file:
        checksum = hashlib.new(algorithm)
        # read() returns an empty bytes object at end of file, which
        # is the sentinel that stops the iteration.
        for buf in iter(lambda: source_file.read(chunk_size), b''):
            checksum.update(buf)
    return checksum.hexdigest()
def generate_source_checksums(source_dir_path, algorithm='md5'):
    """Walk the directory at source_dir_path and generate a checksum for
    each file found. Return a dict containing checksum: path_to_file entries.

    source_dir_path -- directory to walk with os.walk().
    algorithm -- algorithm name forwarded to
        generate_checksum_for_file(); defaults to 'md5'.

    Because the dict is keyed by checksum, files that produce the same
    checksum share one entry; the path visited last is the one kept.
    """
    source_dir_checksums = {}
    # Walk the directory the caller asked for, not a module-level path.
    for root, _dirnames, filenames in os.walk(source_dir_path):
        for filename in filenames:
            path_to_file = os.path.join(root, filename)
            # Honour the requested algorithm instead of a fixed 'md5'.
            checksum_value = generate_checksum_for_file(path_to_file, algorithm)
            source_dir_checksums[checksum_value] = path_to_file
    return source_dir_checksums
# Checksum every file under start_dir with md5 and print the resulting
# checksum -> path dict.
file_list = generate_source_checksums(start_dir, 'md5')
# The parenthesised single-argument form prints the same text on
# Python 2 and is valid syntax on Python 3.
print(file_list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment