@yig
Created May 14, 2015 00:49
Replaces duplicate files (same filename and same md5 contents) with hard links. Use only if you understand the ramifications of this. Runs in dry-run mode by default.
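A typical invocation (the filename "deduplicate.py" is just a placeholder; the gist doesn't name the script) is a dry run first, then a second pass with --really once the output looks right:

    python deduplicate.py /path/to/root
    python deduplicate.py --really /path/to/root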
#!/usr/bin/env python

import os, hashlib

kDryRun = True

## From: http://stackoverflow.com/questions/1131220/get-md5-hash-of-a-files-without-open-it-in-python
def md5_for_file(f, block_size=2**20):
    ## Hash the file object in fixed-size blocks so that large files
    ## never have to be read into memory all at once.
    md5 = hashlib.md5()
    while True:
        data = f.read(block_size)
        if not data:
            break
        md5.update(data)
    return md5.digest()
def deduplicate( root ):
    '''
    Given a directory, search for files contained within
    that have the same name and contents and
    turn them into hard links of each other.
    '''

    ## These cached functions should live inside a single call to
    ## deduplicate() so that they don't cache the values longer than needed,
    ## which is a kind of memory leak.
    _filepath2hash = {}
    def filepath2hash( filepath ):
        if filepath not in _filepath2hash:
            ## Open in binary mode so we hash the raw bytes.
            with open( filepath, 'rb' ) as f:
                _filepath2hash[ filepath ] = md5_for_file( f )
        return _filepath2hash[ filepath ]

    _filepath2stat = {}
    def filepath2stat( filepath ):
        if filepath not in _filepath2stat:
            _filepath2stat[ filepath ] = os.lstat( filepath )
        return _filepath2stat[ filepath ]

    def filepath2inode( filepath ):
        return filepath2stat( filepath ).st_ino

    def filepath2device( filepath ):
        ## http://stackoverflow.com/questions/970742/is-a-file-on-the-same-filesystem-as-another-file-in-python
        return filepath2stat( filepath ).st_dev

    def filepath2bytes( filepath ):
        return filepath2stat( filepath ).st_size

    def samecontents( filepath1, filepath2 ):
        ## Compare the (cheap) sizes first, then the md5 hashes.
        return ( filepath2bytes( filepath1 ) == filepath2bytes( filepath2 )
                 and
                 filepath2hash( filepath1 ) == filepath2hash( filepath2 )
                 )
    print 'Deduplicating%s: "%s"' % ( ' (dry run)' if kDryRun else '', root )

    filename2dirpaths = {}
    for dirpath, dirnames, filenames in os.walk( root ):
        for fname in filenames:
            ## If we've already seen a file with the same name,
            ## check if they are the same inode.
            ## TODO Q: Should I first go through and collect all same-named
            ##         files, then do a second pass to find the
            ##         same-content files, then calculate which
            ##         inodes are the most frequent, and only then
            ##         do the hard linking, so that I do as little
            ##         hard linking as possible?
            if fname in filename2dirpaths:
                filepath = os.path.join( dirpath, fname )
                ## Now we need md5s and inodes for the files.
                for dp in filename2dirpaths[ fname ]:
                    fp = os.path.join( dp, fname )
                    ## If the inodes don't match but the contents do,
                    ## and the files are on the same device,
                    ## turn them into hard links.
                    if (
                        filepath2inode( filepath ) != filepath2inode( fp )
                        and
                        filepath2device( filepath ) == filepath2device( fp )
                        and
                        samecontents( filepath, fp )
                        ):
                        print 'Turning "%s" into a hard link of "%s"' % ( filepath, fp )
                        if not kDryRun:
                            ## Put the os.link() inside a finally
                            ## to try to be atomic in case we get
                            ## a KeyboardInterrupt or some other external
                            ## signal.
                            try:
                                os.unlink( filepath )
                            finally:
                                os.link( fp, filepath )
                            ## Update the cached stat information.
                            _filepath2stat[ filepath ] = _filepath2stat[ fp ]
                        ## We succeeded; we don't need to keep looking.
                        break
            ## Otherwise, remember where we first saw this filename.
            else:
                filename2dirpaths.setdefault( fname, [] ).append( dirpath )
def main():
    global kDryRun

    import sys
    def usage():
        print >> sys.stderr, 'Usage:', sys.argv[0], '[--really] /path/to/root [/path/to/another/root ...]'
        sys.exit(-1)

    argv = list( sys.argv )
    del argv[0]

    if len( argv ) == 0:
        usage()

    if argv[0] == '--really':
        kDryRun = False
        del argv[0]

    if len( argv ) == 0:
        usage()

    for root in argv:
        deduplicate( root )

if __name__ == '__main__': main()
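Not part of the gist, but a quick sanity check after a --really run: two paths are hard links of the same file exactly when they share a device number and an inode number, which is the same test the script applies before linking. A minimal sketch (the function name and example paths are mine):

import os

def is_same_hard_link( path_a, path_b ):
    ## Hard links to the same file share both a device number and an inode number.
    a = os.lstat( path_a )
    b = os.lstat( path_b )
    return ( a.st_dev, a.st_ino ) == ( b.st_dev, b.st_ino )

## Hypothetical paths; substitute two files the script reported linking.
## print is_same_hard_link( '/path/to/root/a/file.txt', '/path/to/root/b/file.txt' )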