Created May 1, 2017 04:49
Python script that removes duplicate files and replaces them with soft links to a single stored copy. Good for video courses that put the same files into multiple directories.
#!/usr/bin/env python
from __future__ import print_function

from hashlib import sha1
from collections import defaultdict
import os.path
import os


def calc_sha(filename, size=256 * 1000 * 1000):
    """
    Calculate the sha1 hash of a file. Read in blocks.
    """
    sh = sha1()
    with open(filename, "rb") as f:
        while True:
            s = f.read(size)
            if not s:
                break
            sh.update(s)
    return sh.hexdigest()


def fullpathiter(startdir):
    return [
        os.path.normpath(os.path.join(root, f))
        for root, _, files in os.walk(startdir)
        for f in files
    ]


def fixer(startdir='.'):
    def fn(filename):
        # File names and paths are embedded in single-quoted arguments of
        # the generated shell commands, so single quotes inside them must
        # be escaped with the standard '\'' idiom; a bare \' does not work
        # inside single quotes in sh/bash.
        return filename.replace("'", "'\\''")

    d = defaultdict(list)
    for fullpath in fullpathiter(startdir):
        if not os.path.islink(fullpath):
            file_hash = calc_sha(fullpath)
            d[file_hash].append({
                'name': fullpath,
            })

    for key, values in d.items():
        if len(values) > 1:
            print('# ', '-' * 30, key)
            # Split into original and duplicates
            original, duplicates = (
                values[0]['name'],
                [x['name'] for x in values[1:]]
            )
            old_filename = os.path.split(original)[1]
            # Prefix the stored file's name with its hash so that different
            # files sharing a base name in different directories can coexist
            # in the storage directory.
            new_filename = "{0}-{1}".format(key, old_filename)
            new_filepath = os.path.join("data", new_filename)
            # Delete the duplicates first. This frees up space, which helps
            # when creating the storage directory and moving the remaining
            # copy in the next steps.
            for duplicate in duplicates:
                print("rm '{0}'".format(fn(duplicate)))
            # Make the storage directory (mkdir -p is idempotent, so printing
            # it once per duplicate group is harmless).
            print("mkdir -p data")
            # Move the single remaining copy to the storage directory
            print("mv '{0}' '{1}'".format(fn(original), fn(new_filepath)))
            # Link every location that used to hold a copy (including the
            # original's) to the renamed file in the storage directory.
            # The link target is made relative to each link's own directory
            # so the symlink resolves correctly from where it is created.
            for duplicate in [original] + duplicates:
                linked_dir = os.path.split(duplicate)[0]
                linked_filepath = os.path.join(linked_dir, old_filename)
                target = os.path.relpath(new_filepath, linked_dir or ".")
                print("ln -s '{0}' '{1}'".format(fn(target), fn(linked_filepath)))
            print('# ', '-' * 30)


if __name__ == '__main__':
    fixer('.')
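A minimal usage sketch (not part of the gist): the demo/ directories and file names below are made up for illustration, and it assumes fixer() from the script above is in scope, e.g. pasted into the same interpreter session.

# Hypothetical layout: two directories holding byte-identical copies of a file.
import os

for d in ("demo/course-a", "demo/course-b"):
    if not os.path.isdir(d):
        os.makedirs(d)
    with open(os.path.join(d, "intro.mp4"), "wb") as f:
        f.write(b"identical bytes in both copies")

# Prints the rm / mkdir / mv / ln -s commands for the duplicate pair;
# nothing on disk changes until the printed commands are actually run.
fixer("demo")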
Since the file operations are drastic, the code does not touch the file system itself; it generates a bash script that can be reviewed before running.
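If shell redirection is inconvenient, the sketch below shows one way to capture the generated commands into a file from Python. It assumes Python 3 (for contextlib.redirect_stdout), that fixer() from the gist is in scope, and the output name dedupe.sh is an arbitrary choice.

import contextlib

def write_script(startdir=".", outfile="dedupe.sh"):
    # Collect everything fixer() prints into a reviewable shell script.
    with open(outfile, "w") as out:
        with contextlib.redirect_stdout(out):
            fixer(startdir)

# After reviewing the commands in the generated file, apply them with bash.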