Skip to content

Instantly share code, notes, and snippets.

@larsyencken
Created November 8, 2013 06:02
Show Gist options
  • Save larsyencken/7366865 to your computer and use it in GitHub Desktop.
Save larsyencken/7366865 to your computer and use it in GitHub Desktop.
Walk a local directory and dump JSON records of all the duplicate files.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# find_dups.py
#
import optparse
import json
import hashlib
import os
import sys
from collections import defaultdict
def _file_md5(filename, chunk_size=2 ** 20):
    """Return the hexadecimal MD5 digest of the file at *filename*.

    The file is opened in binary mode so the digest covers the exact bytes
    on disk (no newline translation or text decoding), and it is read in
    pieces of *chunk_size* bytes so large files are never held in memory
    all at once.
    """
    digest = hashlib.md5()
    with open(filename, 'rb') as istream:
        # iter() with a sentinel keeps calling read() until it returns the
        # empty bytes object that marks end of file.
        for chunk in iter(lambda: istream.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def find_dups(dirname):
    """Find files under *dirname* that share identical content.

    Walks *dirname* recursively with ``os.walk``, groups every file by the
    MD5 digest of its bytes, and writes one JSON object per group of two or
    more files to standard output, one per line, in the form
    ``{"md5": <hex digest>, "files": [<sorted paths>]}``.

    Progress messages go to standard error so standard output carries only
    the JSON records. Files that cannot be opened or read are reported on
    standard error and skipped rather than aborting the scan.
    """
    sys.stderr.write('Scanning...\n')
    dups = defaultdict(set)
    for path, dirnames, filenames in os.walk(dirname):
        for f in filenames:
            filename = os.path.join(path, f)
            try:
                md5 = _file_md5(filename)
            except (IOError, OSError) as e:
                # One unreadable entry should not lose the results
                # collected for the rest of the tree.
                sys.stderr.write('Skipping %s: %s\n' % (filename, e))
                continue
            dups[md5].add(filename)

    sys.stderr.write('Listing duplicates...\n')
    for md5, files in dups.items():
        if len(files) > 1:
            record = json.dumps({'md5': md5, 'files': sorted(files)})
            sys.stdout.write(record + '\n')
    sys.stderr.write('Done\n')
def _create_option_parser():
    """Build the command-line parser for the script.

    The parser defines no options of its own beyond optparse's defaults; it
    exists to supply the usage/help text and to separate positional
    arguments from option flags.
    """
    usage_text = (
        '%prog [options] directory\n'
        'List all duplicate files in the given directory and its subdirs.'
    )
    return optparse.OptionParser(usage=usage_text)
def main(argv):
    """Run the duplicate finder using the command-line arguments in *argv*.

    *argv* is the argument list without the program name. Exactly one
    positional argument (the directory to scan) is required; for any other
    count the help text is printed and the process exits with status 1.
    """
    parser = _create_option_parser()
    _, positional = parser.parse_args(argv)
    if len(positional) == 1:
        find_dups(positional[0])
        return
    parser.print_help()
    sys.exit(1)
# Run as a script only; importing the module has no side effects.
if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment