Created
March 2, 2012 02:16
-
-
Save jorendorff/1954974 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This program is valid Python 2.7 and it's also valid Python 3.2.
# To run it, type: python hashfiles.py
#
# It walks the directory tree rooted at the current directory, groups files
# whose contents have the same SHA-256 digest, and prints every group.
import os
import hashlib
import collections

# How many bytes to read from a file per read() call while hashing (1 MiB).
CHUNK_SIZE = 1024 * 1024


def hash_file(path, chunk_size=CHUNK_SIZE):
    """Return the SHA-256 hex digest of the contents of the file at `path`.

    The file is opened in binary mode and fed to the hash in pieces of at
    most `chunk_size` bytes, so memory use stays bounded however large the
    file is.  The digest is the same as hashing the whole content at once.
    Any error raised by open() or read() propagates to the caller.
    """
    digest = hashlib.sha256()
    with open(path, 'rb') as stream:
        # iter(callable, sentinel) keeps calling read() until it returns the
        # empty bytes string, which is how read() signals end of file.
        for chunk in iter(lambda: stream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


# Maps a SHA-256 hex digest to the list of paths whose contents have that
# digest.  A defaultdict(list) supplies a fresh empty list the first time a
# key is looked up, so we can append without checking for the key first.
file_groups = collections.defaultdict(list)

# This loop executes once for each directory in the tree.
for dirpath, dirnames, filenames in os.walk('.'):
    # The inner loop runs once for each file in the tree.
    for filename in filenames:
        path = os.path.join(dirpath, filename)
        try:
            filehash = hash_file(path)
        except Exception as exc:
            # Deliberately best-effort: a file we cannot read should not
            # abort the whole scan.  Make a note of it and move on.
            print("skipping: " + path + " because: " + str(exc))
            continue

        # file_groups[filehash] is a list. Add path to that list.
        file_groups[filehash].append(path)

# At this point, file_groups contains lots of lists. Each list contains one or
# more files. Sort the paths inside each group, then sort the groups
# themselves, so the output is in a stable, readable order.
groups = list(file_groups.values())
for filelist in groups:
    filelist.sort()
groups.sort()

# Print out every group, even groups with just one entry. The first path of a
# group is printed bare; each further path with identical content is printed
# after an "=" marker.
for filelist in groups:
    print(filelist[0])
    for duplicate in filelist[1:]:
        print(" = " + duplicate)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment