@yig
Created May 14, 2015 00:49
Replaces duplicate files (same filename and same md5 contents) with hard links. Use only if you understand the ramifications of this. Runs in dry-run mode by default.
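A typical invocation (the filename "deduplicate.py" is just a placeholder; the gist doesn't name the script) is a dry run first, then a second pass with --really once the output looks right:

    python deduplicate.py /path/to/root
    python deduplicate.py --really /path/to/root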
#!/usr/bin/env python

import os, hashlib

kDryRun = True

## From: http://stackoverflow.com/questions/1131220/get-md5-hash-of-a-files-without-open-it-in-python
def md5_for_file(f, block_size=2**20):
    ## Hash the file object in fixed-size blocks so that large files
    ## never have to be read into memory all at once.
    md5 = hashlib.md5()
    while True:
        data = f.read(block_size)
        if not data:
            break
        md5.update(data)
    return md5.digest()
def deduplicate( root ):
    '''
    Given a directory, search for files contained within
    that have the same name and contents and
    turn them into hard links of each other.
    '''

    ## These cached functions should live inside a single call to
    ## deduplicate() so that they don't cache the values longer than needed,
    ## which is a kind of memory leak.
    _filepath2hash = {}
    def filepath2hash( filepath ):
        if filepath not in _filepath2hash:
            ## Open in binary mode so we hash the raw bytes.
            with open( filepath, 'rb' ) as f:
                _filepath2hash[ filepath ] = md5_for_file( f )
        return _filepath2hash[ filepath ]

    _filepath2stat = {}
    def filepath2stat( filepath ):
        if filepath not in _filepath2stat:
            _filepath2stat[ filepath ] = os.lstat( filepath )
        return _filepath2stat[ filepath ]

    def filepath2inode( filepath ):
        return filepath2stat( filepath ).st_ino

    def filepath2device( filepath ):
        ## http://stackoverflow.com/questions/970742/is-a-file-on-the-same-filesystem-as-another-file-in-python
        return filepath2stat( filepath ).st_dev

    def filepath2bytes( filepath ):
        return filepath2stat( filepath ).st_size

    def samecontents( filepath1, filepath2 ):
        ## Compare the (cheap) sizes first, then the md5 hashes.
        return ( filepath2bytes( filepath1 ) == filepath2bytes( filepath2 )
                 and
                 filepath2hash( filepath1 ) == filepath2hash( filepath2 )
                 )
    print 'Deduplicating%s: "%s"' % ( ' (dry run)' if kDryRun else '', root )

    filename2dirpaths = {}
    for dirpath, dirnames, filenames in os.walk( root ):
        for fname in filenames:
            ## If we've already seen a file with the same name,
            ## check if they are the same inode.
            ## TODO Q: Should I first go through and collect all same-named
            ##         files, then do a second pass to find the
            ##         same-content files, then calculate which
            ##         inodes are the most frequent, and only then
            ##         do the hard linking, so that I do as little
            ##         hard linking as possible?
            if fname in filename2dirpaths:
                filepath = os.path.join( dirpath, fname )
                ## Now we need md5s and inodes for the files.
                for dp in filename2dirpaths[ fname ]:
                    fp = os.path.join( dp, fname )
                    ## If the inodes don't match but the contents do,
                    ## and the files are on the same device,
                    ## turn them into hard links.
                    if (
                        filepath2inode( filepath ) != filepath2inode( fp )
                        and
                        filepath2device( filepath ) == filepath2device( fp )
                        and
                        samecontents( filepath, fp )
                        ):
                        print 'Turning "%s" into a hard link of "%s"' % ( filepath, fp )
                        if not kDryRun:
                            ## Put the os.link() inside a finally
                            ## to try to be atomic in case we get
                            ## a KeyboardInterrupt or some other external
                            ## signal.
                            try:
                                os.unlink( filepath )
                            finally:
                                os.link( fp, filepath )
                            ## Update the cached stat information.
                            _filepath2stat[ filepath ] = _filepath2stat[ fp ]
                        ## We succeeded; we don't need to keep looking.
                        break
            ## Otherwise, remember where we first saw this filename.
            else:
                filename2dirpaths.setdefault( fname, [] ).append( dirpath )
def main():
    global kDryRun

    import sys
    def usage():
        print >> sys.stderr, 'Usage:', sys.argv[0], '[--really] /path/to/root [/path/to/another/root ...]'
        sys.exit(-1)

    argv = list( sys.argv )
    del argv[0]

    if len( argv ) == 0:
        usage()

    if argv[0] == '--really':
        kDryRun = False
        del argv[0]

    if len( argv ) == 0:
        usage()

    for root in argv:
        deduplicate( root )

if __name__ == '__main__': main()
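Not part of the gist, but a quick sanity check after a --really run: two paths are hard links of the same file exactly when they share a device number and an inode number, which is the same test the script applies before linking. A minimal sketch (the function name and example paths are mine):

import os

def is_same_hard_link( path_a, path_b ):
    ## Hard links to the same file share both a device number and an inode number.
    a = os.lstat( path_a )
    b = os.lstat( path_b )
    return ( a.st_dev, a.st_ino ) == ( b.st_dev, b.st_ino )

## Hypothetical paths; substitute two files the script reported linking.
## print is_same_hard_link( '/path/to/root/a/file.txt', '/path/to/root/b/file.txt' )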