Last active
August 29, 2015 14:07
-
-
Save mjordan/4513687555ac54ee4b10 to your computer and use it in GitHub Desktop.
Walks a directory and generates checksums
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
# Supports md5, sha1, sha256, and sha512 | |
import hashlib | |
# Directory tree to checksum; passed to generate_source_checksums() by the
# script statements at the end of this file.
start_dir = '/home/mark/Pictures'
def generate_checksum_for_file(source_file_path, algorithm='md5'):
    """Generate and return a checksum for the file at source_file_path
    using the specified algorithm.

    source_file_path -- path of the file to hash; it is opened in binary
        mode and read in fixed-size chunks.
    algorithm -- name of a hash algorithm accepted by hashlib.new(),
        e.g. 'md5', 'sha1', 'sha256' or 'sha512'.

    Returns the digest as a string of hexadecimal digits.

    Raises whatever open()/read() raise when the file cannot be accessed,
    and ValueError when hashlib does not recognise the algorithm name.
    """
    # The with-block guarantees the file is closed even if creating the
    # hash object or reading a chunk raises part-way through.
    with open(source_file_path, 'rb') as source_file:
        checksum = hashlib.new(algorithm)
        while True:
            # Read the file in 10 MB chunks to keep memory usage low.
            buf = source_file.read(10 * 1024 * 1024)
            if not buf:
                # An empty read means end of file.
                break
            checksum.update(buf)
    return checksum.hexdigest()
def generate_source_checksums(source_dir_path, algorithm='md5'):
    """Walk the directory at source_dir_path and generate a checksum for
    each file found. Return a dict containing checksum: path_to_file entries.

    source_dir_path -- root of the directory tree to walk with os.walk().
    algorithm -- hash algorithm name, forwarded unchanged to
        generate_checksum_for_file().

    Because the dict is keyed by checksum, files with identical content
    share a key and only the path visited last is kept for that checksum.
    """
    source_dir_checksums = {}
    # Walk the directory the caller asked for (not the module-level
    # start_dir) and hash with the algorithm the caller asked for.
    for root, _dirnames, filenames in os.walk(source_dir_path):
        for filename in filenames:
            path_to_file = os.path.join(root, filename)
            checksum_value = generate_checksum_for_file(path_to_file, algorithm)
            source_dir_checksums[checksum_value] = path_to_file
    return source_dir_checksums
# Checksum every file under start_dir with MD5 and print the resulting
# {checksum: path} dict.
file_list = generate_source_checksums(start_dir, 'md5')
# print() with a single argument gives the same output on Python 2 and
# Python 3; the bare print statement is a SyntaxError on Python 3.
print(file_list)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment