Last active
October 7, 2024 18:20
-
-
Save se7enack/abdbec1bc501339d68c20cb22fc9bd17 to your computer and use it in GitHub Desktop.
deletes duplicate files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib
import os
import shutil
import subprocess
from collections import defaultdict
# Directory tree that is scanned for duplicate files when run as a script.
basedir = "/Users/user/Desktop/files"


def file_md5(path, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is read in ``chunk_size``-byte pieces so large files are never
    loaded into memory at once. MD5 is used only to spot identical content,
    not for anything security-sensitive.
    """
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def find_duplicates(root):
    """Return groups of files under *root* that share the same content.

    Each group is a list of two or more paths with identical MD5 checksums.
    Only regular files are considered; symbolic links and special files are
    skipped and symlinked directories are not followed (the same selection
    as ``find -type f``). Directories and files are visited in sorted order
    so the result is deterministic.
    """
    by_checksum = defaultdict(list)
    for dirpath, dirnames, filenames in os.walk(root):
        dirnames.sort()  # in-place sort steers os.walk's traversal order
        for name in sorted(filenames):
            path = os.path.join(dirpath, name)
            if os.path.islink(path) or not os.path.isfile(path):
                continue
            by_checksum[file_md5(path)].append(path)
    return [paths for paths in by_checksum.values() if len(paths) > 1]


def _unique_destination(directory, filename):
    """Return a path inside *directory* for *filename* that does not exist yet.

    When the plain name is taken, ``_1``, ``_2``, ... is inserted before the
    extension until a free name is found.
    """
    stem, extension = os.path.splitext(filename)
    candidate = os.path.join(directory, filename)
    suffix = 1
    while os.path.lexists(candidate):
        candidate = os.path.join(directory, f"{stem}_{suffix}{extension}")
        suffix += 1
    return candidate


def remove_duplicates(root):
    """Delete duplicate files under *root*, keeping one copy in ``originals``.

    For every group of identical files, every copy except the last one found
    is deleted and the remaining copy is moved into ``<root>/originals``.
    The ``originals`` folder is created even when no duplicates exist.

    Returns:
        The list of paths that were deleted.

    Raises:
        NotADirectoryError: if *root* is not an existing directory.
    """
    if not os.path.isdir(root):
        raise NotADirectoryError(f"{root} is not a directory")

    # Scan before creating the folder; on a re-run the existing originals are
    # part of the scan, so new files are also compared against them.
    groups = find_duplicates(root)
    originals_dir = os.path.join(root, "originals")
    os.makedirs(originals_dir, exist_ok=True)

    deleted = []
    for paths in groups:
        *extras, keeper = paths
        for extra in extras:
            try:
                os.remove(extra)
            except FileNotFoundError:
                # Already gone; the goal (file absent) is met, like `rm -f`.
                pass
            deleted.append(extra)
            print(f"deleting duplicate file {extra}")

        keeper_dir = os.path.abspath(os.path.dirname(keeper))
        if keeper_dir == os.path.abspath(originals_dir):
            # The surviving copy already lives in originals (re-run case);
            # moving a file onto itself would fail, so leave it in place.
            continue

        print(f'Original copy moved from {keeper} to {root + "/originals/"}')
        # Never overwrite: two unrelated originals may share a file name, and
        # replacing one would destroy the only remaining copy of its content.
        destination = _unique_destination(originals_dir, os.path.basename(keeper))
        shutil.move(keeper, destination)
    return deleted


if __name__ == "__main__":
    remove_duplicates(basedir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment