Skip to content

Instantly share code, notes, and snippets.

@Lysander
Created April 20, 2013 08:31
Show Gist options
  • Select an option

  • Save Lysander/5425251 to your computer and use it in GitHub Desktop.

Select an option

Save Lysander/5425251 to your computer and use it in GitHub Desktop.
23498239482938492384078789789789 bla/blub/datei1.dat
d41d8cd98f00b204e9800998ecf8427e ga/bla2345.txt
b9dd393b05195aa2288bd1611d7fc361 bluber/bla4325.txt
d41d8cd98f00b204e9800998ecf8427e data/ghghg/mfdfdf.dat
57dd7f249a99923eea234e62895dc03f test.txt
d41d8cd98f00b204e9800998ecf8427e data/anderer_name_als_mfdfdf.dat
#!/usr/bin/env python
# coding: utf-8
import argparse
from contextlib import closing
from collections import defaultdict
from pprint import pprint
def find_doubles(rows):
    """Return the MD5 sums that occur more than once in a checksum listing.

    Each non-blank row must have the form ``<md5sum> <path>``. Only the
    first run of whitespace separates the two fields, so paths that
    themselves contain spaces are kept intact.

    Args:
        rows: Iterable of text lines (e.g. an open file).

    Returns:
        A ``defaultdict(list)`` mapping each duplicated MD5 sum to a list
        of ``(path, index)`` tuples, where ``index`` is the zero-based
        position of the row in ``rows``.

    Raises:
        ValueError: If a non-blank row does not contain both fields.
    """
    entries = defaultdict(list)
    for index, row in enumerate(rows):
        # maxsplit=1: everything after the checksum belongs to the path.
        fields = row.strip().split(None, 1)
        if not fields:
            # Blank line (e.g. trailing newline at end of file): ignore it,
            # but keep counting so indices still match the input rows.
            continue
        if len(fields) != 2:
            raise ValueError(
                "row {}: expected '<md5sum> <path>', got {!r}".format(
                    index, row))
        md5sum, path = fields
        entries[md5sum].append((path, index))

    doubles = defaultdict(list)
    # items() instead of iteritems(): works on Python 2 and Python 3.
    for md5sum, occurrences in entries.items():
        if len(occurrences) > 1:
            doubles[md5sum] = occurrences
    return doubles
def main():
    """Read a checksum listing and print every group of duplicate MD5 sums.

    The listing is read from the file given with ``--input`` (default
    ``-``, i.e. standard input). For each MD5 sum that occurs more than
    once, a header with the number of entries is printed, followed by one
    ``<index>: <md5sum> <path>`` line per occurrence and a blank line.
    The index is the zero-based row position reported by ``find_doubles``.
    """
    parser = argparse.ArgumentParser(description='Find MD5 doubles.')
    parser.add_argument('--input', type=argparse.FileType('r'), default='-')
    args = parser.parse_args()
    with closing(args.input) as infile:
        doubles = find_doubles(infile)
    # items() and single-argument print(...) behave the same on Python 2
    # and also run on Python 3.
    for key, values in doubles.items():
        print(u"{} Einträge:".format(len(values)))
        # Print each line on its own instead of joining with a unicode
        # separator: on Python 2 that join implicitly ASCII-decodes the
        # byte-string paths and fails for non-ASCII file names.
        for path, index in values:
            print("{}: {} {}".format(index, key, path))
        print("")


if __name__ == "__main__":
    main()
> python doubles.py --input data.txt
3 Einträge:
1: d41d8cd98f00b204e9800998ecf8427e ga/bla2345.txt
3: d41d8cd98f00b204e9800998ecf8427e data/ghghg/mfdfdf.dat
5: d41d8cd98f00b204e9800998ecf8427e data/anderer_name_als_mfdfdf.dat
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment