Skip to content

Instantly share code, notes, and snippets.

@se7enack
Last active October 7, 2024 18:20
Show Gist options
  • Select an option

  • Save se7enack/abdbec1bc501339d68c20cb22fc9bd17 to your computer and use it in GitHub Desktop.

Select an option

Save se7enack/abdbec1bc501339d68c20cb22fc9bd17 to your computer and use it in GitHub Desktop.
deletes duplicate files
import hashlib
import os
import shutil
import subprocess
from collections import defaultdict
# Directory tree that is scanned for duplicate files when run as a script.
basedir = "/Users/user/Desktop/files"


def _md5_of_file(path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is read in chunks so large files are never loaded into memory
    in one piece.  MD5 is used purely as a content fingerprint here, not for
    anything security related.
    """
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def find_duplicates(root):
    """Return groups of files below *root* that share the same MD5 checksum.

    Only regular files are considered (symbolic links are skipped), which
    mirrors ``find <root> -type f``.  Directories and file names are visited
    in sorted order so the result is deterministic.

    Returns:
        A list of lists of paths; every inner list holds two or more files
        with identical content.
    """
    by_checksum = defaultdict(list)
    for dirpath, dirnames, filenames in os.walk(root):
        dirnames.sort()  # in-place sort steers os.walk's traversal order
        for filename in sorted(filenames):
            path = os.path.join(dirpath, filename)
            if os.path.islink(path) or not os.path.isfile(path):
                continue
            by_checksum[_md5_of_file(path)].append(path)
    return [paths for paths in by_checksum.values() if len(paths) > 1]


def _unique_destination(directory, name):
    """Return a path inside *directory* for *name* that does not exist yet.

    If ``name`` is already taken, ``_1``, ``_2``, ... is inserted before the
    extension until a free name is found, so an earlier file is never
    overwritten.
    """
    candidate = os.path.join(directory, name)
    stem, extension = os.path.splitext(name)
    suffix = 1
    while os.path.lexists(candidate):
        candidate = os.path.join(directory, f"{stem}_{suffix}{extension}")
        suffix += 1
    return candidate


def remove_duplicates(root):
    """Delete duplicate files below *root*, keeping one copy per group.

    For every group of identical files, all copies except the last one found
    are deleted and the remaining copy is moved into ``<root>/originals``.
    The ``originals`` folder is always created, even if nothing is moved.

    Args:
        root: Directory tree to scan.

    Returns:
        The list of duplicate groups that were processed (see
        ``find_duplicates``).

    Raises:
        NotADirectoryError: If *root* is not an existing directory.
    """
    if not os.path.isdir(root):
        raise NotADirectoryError(root)

    originals = os.path.join(root, "originals")
    duplicate_groups = find_duplicates(root)
    os.makedirs(originals, exist_ok=True)

    for paths in duplicate_groups:
        *extras, keeper = paths
        for extra in extras:
            print(f"deleting duplicate file {extra}")
            try:
                os.remove(extra)
            except FileNotFoundError:
                # Same tolerance as ``rm -f``: a file that is already gone
                # is not an error.
                pass

        # On a re-run the kept copy may already live in the originals
        # folder; moving a file onto itself would fail, so leave it alone.
        keeper_dir = os.path.normpath(os.path.dirname(keeper))
        if keeper_dir == os.path.normpath(originals):
            continue

        # Different groups may keep files with the same base name; choose a
        # free name instead of silently overwriting an earlier original.
        destination = _unique_destination(originals, os.path.basename(keeper))
        print(f"Original copy moved from {keeper} to {destination}")
        shutil.move(keeper, destination)

    return duplicate_groups


if __name__ == "__main__":
    remove_duplicates(basedir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment