Created
November 8, 2013 06:02
-
-
Save larsyencken/7366865 to your computer and use it in GitHub Desktop.
Walk a local directory and dump JSON records of all the duplicate files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# | |
# find_dups.py | |
# | |
import optparse | |
import json | |
import hashlib | |
import os | |
import sys | |
from collections import defaultdict | |
def find_dups(dirname):
    """Report groups of byte-identical files found under ``dirname``.

    Walks ``dirname`` recursively, computes the MD5 digest of each file's
    contents, and writes one JSON object per line to stdout for every digest
    that is shared by more than one path.  Each record has the form
    ``{"md5": <hex digest>, "files": [<sorted paths>]}``.  Progress messages
    are written to stderr so that stdout carries only the JSON records.

    Args:
        dirname: Root directory to scan.

    Returns:
        None.  Results are emitted on stdout.
    """
    # Hash in 1 MiB pieces so large files are never read into memory whole.
    chunk_size = 2 ** 20

    sys.stderr.write('Scanning...\n')
    dups = defaultdict(set)
    for path, _dirnames, filenames in os.walk(dirname):
        for name in filenames:
            filename = os.path.join(path, name)
            digest = hashlib.md5()
            # Binary mode is required: text mode may translate newlines or
            # attempt to decode the bytes, so the digest would not reflect
            # the file's real contents (or reading would fail outright).
            with open(filename, 'rb') as istream:
                # iter() keeps calling read() until it returns b'' at EOF.
                for chunk in iter(lambda: istream.read(chunk_size), b''):
                    digest.update(chunk)
            dups[digest.hexdigest()].add(filename)

    sys.stderr.write('Listing duplicates...\n')
    for md5, files in dups.items():
        if len(files) > 1:
            record = json.dumps({'md5': md5, 'files': sorted(files)})
            sys.stdout.write(record + '\n')
    sys.stderr.write('Done\n')
def _create_option_parser():
    """Build the command-line parser for this script.

    Returns:
        An ``optparse.OptionParser`` whose usage text describes the single
        positional ``directory`` argument.  No extra options are registered.
    """
    usage_text = (
        "%prog [options] directory\n"
        "List all duplicate files in the given directory and its subdirs."
    )
    return optparse.OptionParser(usage_text)
def main(argv):
    """Parse ``argv`` and run the duplicate scan on the requested directory.

    Args:
        argv: Command-line arguments, excluding the program name.

    Exactly one positional argument (the directory) is required; otherwise
    the help text is printed and the process exits with status 1.
    """
    option_parser = _create_option_parser()
    _options, positional = option_parser.parse_args(argv)

    if len(positional) == 1:
        find_dups(positional[0])
        return

    option_parser.print_help()
    sys.exit(1)
# Script entry point: hand everything after the program name to main().
if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment