Created
May 14, 2015 00:49
-
-
Save yig/1f6dfa09fd1928d078c6 to your computer and use it in GitHub Desktop.
Replaces duplicate files (same file name and same md5) found under the given directories with hard links. Use only if you understand the ramifications of this. Runs in dry-run mode by default; pass --really to actually modify files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os, hashlib | |
## Global safety switch: while True, deduplicate() only prints the hard
## links it would create; main() sets it to False when '--really' is given.
kDryRun = True
## From: http://stackoverflow.com/questions/1131220/get-md5-hash-of-a-files-without-open-it-in-python | |
def md5_for_file(f, block_size=2**20):
    '''
    Return the raw (binary) MD5 digest of everything that can still be
    read from the file-like object `f`.

    The data is consumed `block_size` bytes at a time, so the whole
    file never has to be held in memory at once. `f` is read until
    `read()` returns an empty result; it is not closed or rewound here.
    '''
    hasher = hashlib.md5()
    chunk = f.read(block_size)
    while chunk:
        hasher.update(chunk)
        chunk = f.read(block_size)
    return hasher.digest()
def _replace_with_hard_link( source, target ):
    '''
    Replace the file at `target` with a hard link to `source` without
    ever leaving `target` missing.

    The link is first created under a temporary name next to `target`
    and then renamed over it. If creating the link fails, `target` has
    not been touched; on POSIX systems the rename replaces `target` in a
    single step.

    Raises OSError if the link or the rename fails.
    '''
    temp = '%s.dedup-tmp-%d' % ( target, os.getpid() )
    os.link( source, temp )
    try:
        os.rename( temp, target )
    finally:
        ## After a successful rename `temp` is gone; if the rename failed,
        ## don't leave the stray extra link behind.
        if os.path.lexists( temp ):
            os.unlink( temp )

def deduplicate( root ):
    '''
    Given a directory, search for files contained within
    that have the same name and contents and
    turn them into hard links of each other.

    Only regular files are considered (symbolic links and special files
    are skipped), and two files are only linked when they live on the
    same device, because hard links cannot cross devices. Contents are
    compared by size first and by MD5 digest second.

    While the module-level `kDryRun` flag is true, the links that would
    be made are printed but nothing on disk is changed.

    Raises OSError/IOError if a file cannot be stat'ed, read or linked.
    '''
    import stat

    ## These cached functions should live inside a single call to
    ## deduplicate() so that they don't cache the values longer than needed,
    ## which is a kind of memory leak.
    _filepath2hash = {}
    def filepath2hash( filepath ):
        if filepath not in _filepath2hash:
            ## Binary mode so the digest covers the exact bytes on disk;
            ## `with` closes the handle as soon as hashing is done.
            with open( filepath, 'rb' ) as f:
                _filepath2hash[ filepath ] = md5_for_file( f )
        return _filepath2hash[ filepath ]

    _filepath2stat = {}
    def filepath2stat( filepath ):
        if filepath not in _filepath2stat:
            _filepath2stat[ filepath ] = os.lstat( filepath )
        return _filepath2stat[ filepath ]

    def filepath2inode( filepath ):
        return filepath2stat( filepath ).st_ino

    def filepath2device( filepath ):
        ## http://stackoverflow.com/questions/970742/is-a-file-on-the-same-filesystem-as-another-file-in-python
        return filepath2stat( filepath ).st_dev

    def filepath2bytes( filepath ):
        return filepath2stat( filepath ).st_size

    def isregular( filepath ):
        return stat.S_ISREG( filepath2stat( filepath ).st_mode )

    def samecontents( filepath1, filepath2 ):
        ## The size comparison is cheap (already in the cached stat) and
        ## lets us skip hashing files that cannot possibly be equal.
        return (
            filepath2bytes( filepath1 ) == filepath2bytes( filepath2 )
            and
            filepath2hash( filepath1 ) == filepath2hash( filepath2 )
            )

    print( 'Deduplicating%s: "%s"' % ( '' if not kDryRun else ' (dry run)', root ) )

    ## Maps a file name to the directories holding the distinct
    ## (not yet linked together) files seen so far under that name.
    filename2dirpaths = {}
    for dirpath, dirnames, filenames in os.walk( root ):
        for fname in filenames:
            filepath = os.path.join( dirpath, fname )

            ## lstat() describes a symlink itself, not its target, and
            ## opening a FIFO or device could block; leave those alone.
            if not isregular( filepath ):
                continue

            ## TODO Q: Should I first go through and collect all same-named
            ##         files; and then do a second pass and find the
            ##         same-content files, and then calculate which
            ##         inodes are the most frequent, and only then
            ##         do the hard linking so that I can do as little
            ##         hard linking as possible?
            for dp in filename2dirpaths.get( fname, [] ):
                fp = os.path.join( dp, fname )

                ## Hard links cannot span devices.
                if filepath2device( filepath ) != filepath2device( fp ):
                    continue

                ## Same device and same inode: already the same file.
                if filepath2inode( filepath ) == filepath2inode( fp ):
                    break

                if samecontents( filepath, fp ):
                    print( 'Turning "%s" into a hard link of "%s"' % ( filepath, fp ) )

                    if not kDryRun:
                        _replace_with_hard_link( fp, filepath )
                        ## Update the stat information.
                        _filepath2stat[ filepath ] = _filepath2stat[ fp ]

                    ## We succeeded; we don't need to keep looking anymore.
                    break

            ## No earlier same-named file matched (or this is the first
            ## one): remember it so later duplicates can be linked to it.
            else:
                filename2dirpaths.setdefault( fname, [] ).append( dirpath )
def main():
    '''
    Command-line entry point.

    Usage: [--really] /path/to/root [/path/to/another/root ...]

    Each root is passed to deduplicate(). Unless the first argument is
    '--really', the module-level `kDryRun` flag is left as it is, so
    nothing is modified on disk. With no root given, a usage message is
    written to stderr and the process exits with status -1.
    '''
    global kDryRun

    import sys

    def usage():
        ## sys.stderr.write() behaves the same on Python 2 and 3, unlike
        ## the Python-2-only `print >> sys.stderr` statement.
        sys.stderr.write(
            'Usage: %s [--really] /path/to/root [/path/to/another/root ...]\n'
            % ( sys.argv[0], )
            )
        sys.exit(-1)

    ## Work on a copy so sys.argv itself is left untouched.
    argv = sys.argv[1:]

    if not argv:
        usage()

    if argv[0] == '--really':
        kDryRun = False
        del argv[0]

    if not argv:
        usage()

    for root in argv:
        deduplicate( root )

if __name__ == '__main__': main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment