yig · May 14, 2015 01:01
diff --git a/remove_duplicate_media.py b/remove_duplicate_media.py
 ## Find (and, with --really, delete) duplicate media files among the files
 ## matched by the glob patterns given on the command line.
 ##
 ## Files are grouped by md5 digest.  For .jpg/.jpeg files the digest is taken
 ## over the decoded image data produced by the external `djpeg` tool, so that
 ## JPEGs differing only in metadata still compare equal; every other file is
 ## hashed byte-for-byte.  Within each group the file with the shortest name
 ## (ties broken alphabetically) is kept and an `rm` line is printed for each
 ## of the others.  Nothing is deleted unless --really is passed.
 import hashlib
 import os
 import subprocess
 import sys
 from glob import glob

 ## False means dry run: only print the `rm` lines.  Set by --really.
 kHarmful = False

 ## Number of bytes read per call while hashing, so large media files are
 ## never loaded into memory whole.
 kChunkSize = 1 << 20


 def _md5_of_stream( stream ):
     """Return the hex md5 digest of everything readable from the binary file object `stream`."""
     md5 = hashlib.md5()
     for chunk in iter( lambda: stream.read( kChunkSize ), b'' ):
         md5.update( chunk )
     return md5.hexdigest()


 def _media_digest( fname ):
     """Return the hex md5 digest used to compare `fname`, or None if it cannot be hashed.

     None is returned (after a warning on stderr) when `djpeg` exits with a
     nonzero status or when the file cannot be read.  Callers must skip such
     files: giving every failed file the same digest would make unrelated
     files look like duplicates of each other and get them deleted.
     An OSError from launching `djpeg` itself (e.g. tool not installed) is
     not caught here.
     """
     ## TODO: Replace this md5 hashing with the 'identify' tool from ImageMagick, e.g.
     ##       identify -quiet -format "%#\n" path/to/image
     if os.path.splitext( fname.lower() )[-1] in ( '.jpg', '.jpeg' ):
         print( 'Computing md5 of jpeg data of "%s"...' % ( fname, ) )
         ## md5 of the decoded jpeg data itself (ignoring metadata):
         djpeg = subprocess.Popen( [ 'djpeg', fname ], stdout = subprocess.PIPE )
         try:
             digest = _md5_of_stream( djpeg.stdout )
         finally:
             ## Always close the pipe and reap the child so no zombie is left behind.
             djpeg.stdout.close()
             djpeg.wait()
         if djpeg.returncode != 0:
             sys.stderr.write( '# Skipping "%s": djpeg exited with status %d\n' % ( fname, djpeg.returncode ) )
             return None
         return digest

     print( 'Computing md5 of "%s"...' % ( fname, ) )
     ## Direct md5 of the file contents
     try:
         with open( fname, 'rb' ) as f:
             return _md5_of_stream( f )
     except ( IOError, OSError ) as e:
         sys.stderr.write( '# Skipping "%s": %s\n' % ( fname, e ) )
         return None


 argv = list( sys.argv )

 ## Strip the first "--really" (if any) so that only glob patterns remain.
 try:
     argv.remove( "--really" )
     kHarmful = True
 except ValueError: pass

 glob_strings = argv[1:]

 if not glob_strings:
     sys.stderr.write( 'Usage: %s [--really] glob_string1 ... glob_stringN\n' % ( sys.argv[0], ) )
     sys.stderr.write( "Example: %s '*.MOV' '*.PNG' '*.JPG'\n" % ( sys.argv[0], ) )
     sys.stderr.write( "To find suffixes: ls -1 | cut -d '.' -f 2 | sort | uniq\n" )
     sys.exit(-1)

 all_files = []
 for s in glob_strings:
     all_files.extend( glob( s ) )

 ## Maps digest -> set of file names having that digest.
 hash2fnames = {}
 ## Resolved paths already scanned.  The same file can be matched more than
 ## once (overlapping patterns such as '*.jpg' and './*.jpg', or a symlink);
 ## it would hash equal to itself and its only copy could then be removed.
 seen_realpaths = set()
 print( 'Scanning %d files...' % ( len( all_files ), ) )
 for fname in all_files:
     ## Patterns can match directories or dangling symlinks; neither can be hashed.
     if not os.path.isfile( fname ):
         sys.stderr.write( '# Skipping "%s": not a regular file\n' % ( fname, ) )
         continue

     realpath = os.path.realpath( fname )
     if realpath in seen_realpaths: continue
     seen_realpaths.add( realpath )

     digest = _media_digest( fname )
     if digest is None: continue
     hash2fnames.setdefault( digest, set() ).add( fname )
 print( 'Finished scanning.' )

 ## Number of digests shared by two or more files (groups, not files).
 count = 0
 for digest, fnames in hash2fnames.items():
     if len( fnames ) < 2: continue
     ## Keep the shortest name; ties are broken alphabetically.
     dups = sorted( fnames, key = lambda name: ( len( name ), name ) )
     print( '# For the following rms, keeping: "%s"' % ( dups[0], ) )
     for dupname in dups[1:]:
         print( 'rm "%s"' % ( dupname, ) )
         if kHarmful: os.remove( dupname )
     count += 1

 print( '%d duplicates found.' % ( count, ) )
	## Find (and, with --really, delete) duplicate media files among the files
	## matched by the glob patterns given on the command line.
	##
	## Files are grouped by md5 digest.  For .jpg/.jpeg files the digest is taken
	## over the decoded image data produced by the external `djpeg` tool, so that
	## JPEGs differing only in metadata still compare equal; every other file is
	## hashed byte-for-byte.  Within each group the file with the shortest name
	## (ties broken alphabetically) is kept and an `rm` line is printed for each
	## of the others.  Nothing is deleted unless --really is passed.
	import hashlib
	import os
	import subprocess
	import sys
	from glob import glob

	## False means dry run: only print the `rm` lines.  Set by --really.
	kHarmful = False

	## Number of bytes read per call while hashing, so large media files are
	## never loaded into memory whole.
	kChunkSize = 1 << 20


	def _md5_of_stream( stream ):
	    """Return the hex md5 digest of everything readable from the binary file object `stream`."""
	    md5 = hashlib.md5()
	    for chunk in iter( lambda: stream.read( kChunkSize ), b'' ):
	        md5.update( chunk )
	    return md5.hexdigest()


	def _media_digest( fname ):
	    """Return the hex md5 digest used to compare `fname`, or None if it cannot be hashed.

	    None is returned (after a warning on stderr) when `djpeg` exits with a
	    nonzero status or when the file cannot be read.  Callers must skip such
	    files: giving every failed file the same digest would make unrelated
	    files look like duplicates of each other and get them deleted.
	    An OSError from launching `djpeg` itself (e.g. tool not installed) is
	    not caught here.
	    """
	    ## TODO: Replace this md5 hashing with the 'identify' tool from ImageMagick, e.g.
	    ##       identify -quiet -format "%#\n" path/to/image
	    if os.path.splitext( fname.lower() )[-1] in ( '.jpg', '.jpeg' ):
	        print( 'Computing md5 of jpeg data of "%s"...' % ( fname, ) )
	        ## md5 of the decoded jpeg data itself (ignoring metadata):
	        djpeg = subprocess.Popen( [ 'djpeg', fname ], stdout = subprocess.PIPE )
	        try:
	            digest = _md5_of_stream( djpeg.stdout )
	        finally:
	            ## Always close the pipe and reap the child so no zombie is left behind.
	            djpeg.stdout.close()
	            djpeg.wait()
	        if djpeg.returncode != 0:
	            sys.stderr.write( '# Skipping "%s": djpeg exited with status %d\n' % ( fname, djpeg.returncode ) )
	            return None
	        return digest

	    print( 'Computing md5 of "%s"...' % ( fname, ) )
	    ## Direct md5 of the file contents
	    try:
	        with open( fname, 'rb' ) as f:
	            return _md5_of_stream( f )
	    except ( IOError, OSError ) as e:
	        sys.stderr.write( '# Skipping "%s": %s\n' % ( fname, e ) )
	        return None


	argv = list( sys.argv )

	## Strip the first "--really" (if any) so that only glob patterns remain.
	try:
	    argv.remove( "--really" )
	    kHarmful = True
	except ValueError: pass

	glob_strings = argv[1:]

	if not glob_strings:
	    sys.stderr.write( 'Usage: %s [--really] glob_string1 ... glob_stringN\n' % ( sys.argv[0], ) )
	    ## The patterns need their leading '*' to match anything, and the
	    ## shell pipeline needs plain '|' characters to be runnable as printed.
	    sys.stderr.write( "Example: %s '*.MOV' '*.PNG' '*.JPG'\n" % ( sys.argv[0], ) )
	    sys.stderr.write( "To find suffixes: ls -1 | cut -d '.' -f 2 | sort | uniq\n" )
	    sys.exit(-1)

	all_files = []
	for s in glob_strings:
	    all_files.extend( glob( s ) )

	## Maps digest -> set of file names having that digest.
	hash2fnames = {}
	## Resolved paths already scanned.  The same file can be matched more than
	## once (overlapping patterns such as '*.jpg' and './*.jpg', or a symlink);
	## it would hash equal to itself and its only copy could then be removed.
	seen_realpaths = set()
	print( 'Scanning %d files...' % ( len( all_files ), ) )
	for fname in all_files:
	    ## Patterns can match directories or dangling symlinks; neither can be hashed.
	    if not os.path.isfile( fname ):
	        sys.stderr.write( '# Skipping "%s": not a regular file\n' % ( fname, ) )
	        continue

	    realpath = os.path.realpath( fname )
	    if realpath in seen_realpaths: continue
	    seen_realpaths.add( realpath )

	    digest = _media_digest( fname )
	    if digest is None: continue
	    hash2fnames.setdefault( digest, set() ).add( fname )
	print( 'Finished scanning.' )

	## Number of digests shared by two or more files (groups, not files).
	count = 0
	for digest, fnames in hash2fnames.items():
	    if len( fnames ) < 2: continue
	    ## Keep the shortest name; ties are broken alphabetically.
	    dups = sorted( fnames, key = lambda name: ( len( name ), name ) )
	    print( '# For the following rms, keeping: "%s"' % ( dups[0], ) )
	    for dupname in dups[1:]:
	        print( 'rm "%s"' % ( dupname, ) )
	        if kHarmful: os.remove( dupname )
	    count += 1

	print( '%d duplicates found.' % ( count, ) )