Skip to content

Instantly share code, notes, and snippets.

@ILPlais
Last active June 10, 2024 11:07
Show Gist options
  • Select an option

  • Save ILPlais/33223c7f27d5cc03fbd32ca620de9e6c to your computer and use it in GitHub Desktop.

Select an option

Save ILPlais/33223c7f27d5cc03fbd32ca620de9e6c to your computer and use it in GitHub Desktop.
Recursively scans a directory and calculates the MD5 sums of files to find duplicates and delete the oldest ones.
#!/bin/python3
import hashlib
import os
import datetime
from tqdm import tqdm # To install tqdm: pip install tqdm
def find_doubles(folder):
    """
    Group every file found under a directory tree by the MD5 sum of its content.

    The tree is walked twice: a first pass counts the files so that the
    progress bar has a total, a second pass hashes each file.

    Args:
        folder: The path of the directory to be analyzed.

    Returns:
        A dictionary where the key is the MD5 sum and the value is the list of
        paths of the files having that sum. Files for which calculate_md5
        returned None are left out.
    """
    by_digest = {}

    print("[*] Counts the total number of files...")

    # Count the total number of files
    nb_files = sum(len(filenames) for _, _, filenames in os.walk(folder))
    print(f"[*] {nb_files} files to test.")

    with tqdm(total = nb_files, unit = "file") as pbar:
        for dirpath, _, filenames in os.walk(folder):
            for name in filenames:
                path = os.path.join(dirpath, name)
                digest = calculate_md5(path)

                # Unreadable files come back as None and are not recorded
                if digest is not None:
                    by_digest.setdefault(digest, []).append(path)

                # One tick per visited file, whether hashed or not
                pbar.update(1)

    return by_digest
def calculate_md5(file_path, chunk_size = 65536):
    """
    Calculates the MD5 sum of a file, reading it in fixed-size chunks so that
    the whole file never has to be held in memory.

    Args:
        file_path: The path of the file for which to calculate the MD5 sum.
        chunk_size: Number of bytes requested per read. It does not affect the
            resulting sum.

    Returns:
        The MD5 sum of the file as a hexadecimal string, or None if the file
        could not be opened or read (the error is printed).
    """
    hash_md5 = hashlib.md5()

    try:
        with open(file_path, "rb") as file:
            # read() returns b"" at end of file, which stops the iterator
            for bloc in iter(lambda: file.read(chunk_size), b""):
                hash_md5.update(bloc)
    except OSError as e:
        # Only I/O failures (missing file, permissions, ...) are expected
        # here; anything else is a programming error and should surface.
        print(f"[!] Error reading file {file_path}: {e}.")
        return None

    return hash_md5.hexdigest()
# Script entry point: scan the current working directory, and for every group
# of files sharing the same MD5 sum keep the most recently modified one and
# delete the others.
if __name__ == "__main__":
    root_folder = os.getcwd()
    print(f"[*] 📂 Search for duplicates files in the folder: \"{root_folder}\"…")
    files_md5 = find_doubles(root_folder)

    for md5, files_paths in files_md5.items():
        # A single path for this sum means there is no duplicate
        if len(files_paths) < 2:
            continue

        # Collect modification dates; a file that can no longer be stat'ed
        # is reported and left out instead of aborting the run
        files_modif_dates = []
        for file_path in files_paths:
            try:
                files_modif_dates.append((file_path, os.path.getmtime(file_path)))
            except OSError as e:
                print(f"[!] Error reading modification date of {file_path}: {e}.")

        if len(files_modif_dates) < 2:
            continue

        # Sort files by modification date, from newest to oldest
        files_modif_dates.sort(key = lambda x: x[1], reverse = True)

        # Keep the most recent file, delete the others
        file_to_keep = files_modif_dates[0][0]
        files_to_delete = [f[0] for f in files_modif_dates[1:]]

        print(f"[*] Duplicate files with MD5 sum \"{md5}\":")
        print(f"[+] 📄 {file_to_keep} - Kept.")

        for file_path in files_to_delete:
            # A failed deletion must not stop the remaining ones
            try:
                os.remove(file_path)
            except OSError as e:
                print(f"[!] Error deleting file {file_path}: {e}.")
            else:
                print(f"[-] 📄 {file_path} - Deleted.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment