Last active
June 10, 2024 11:07
-
-
Save ILPlais/33223c7f27d5cc03fbd32ca620de9e6c to your computer and use it in GitHub Desktop.
Recursively scans a directory, calculates the MD5 sum of each file to find duplicates, and deletes every copy except the most recently modified one.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/python3 | |
| import hashlib | |
| import os | |
| import datetime | |
| from tqdm import tqdm # To install tqdm: pip install tqdm | |
def find_doubles(folder):
    """
    Walk a directory tree and group its files by MD5 sum, displaying a progress bar.

    Args:
        folder: The path of the directory to be analyzed.

    Returns:
        A dictionary where the key is the MD5 sum and the value is the list of
        file paths having that sum. Files for which calculate_md5 returned
        None are left out.
    """
    print("[*] Counts the total number of files...")

    # First pass: count every file so the progress bar knows its total
    total = 0
    for _, _, names in os.walk(folder):
        total += len(names)

    print(f"[*] {total} files to test.")

    paths_by_digest = {}

    with tqdm(total = total, unit = "file") as progress:
        for current_dir, _, names in os.walk(folder):
            for name in names:
                path = os.path.join(current_dir, name)
                digest = calculate_md5(path)

                if digest is not None:
                    paths_by_digest.setdefault(digest, []).append(path)

                # One tick per file, whether or not it could be hashed
                progress.update(1)

    return paths_by_digest
def calculate_md5(file_path, chunk_size = 65536):
    """
    Calculates the MD5 sum of a file.

    The file is read chunk by chunk, so a large file is never loaded into
    memory in one piece.

    Args:
        file_path: The path of the file for which to calculate the MD5 sum.
        chunk_size: Number of bytes read per iteration. Must be at least 1.
            It has no influence on the resulting sum.

    Returns:
        The MD5 sum of the file as a hexadecimal string, or None when the
        file cannot be opened or read (the error is printed).

    Raises:
        ValueError: If chunk_size is lower than 1.
    """
    if chunk_size < 1:
        # read(0) returns b"" at once, which would silently hash nothing
        raise ValueError("chunk_size must be at least 1")

    hash_md5 = hashlib.md5()

    try:
        with open(file_path, "rb") as file:
            for bloc in iter(lambda: file.read(chunk_size), b""):
                hash_md5.update(bloc)
    except OSError as e:
        # Only I/O failures are expected here (missing file, permissions...);
        # anything else is a programming error and must not be hidden
        print(f"[!] Error reading file {file_path}: {e}.")
        return None

    return hash_md5.hexdigest()
# Script entry point: looks for duplicate files under the current working
# directory and, for each group of identical files, keeps the most recently
# modified one and deletes the others.
if __name__ == "__main__":
    root_folder = os.getcwd()

    # NOTE(review): the "π" in the messages below looks like a mis-decoded
    # emoji; the original glyph is unknown, so it is left untouched.
    print(f"[*] π Search for duplicates files in the folder: \"{root_folder}\"…")
    files_md5 = find_doubles(root_folder)

    for md5, files_paths in files_md5.items():
        # Symbolic links are never candidates: a link and its own target
        # always have the same MD5 sum, and deleting the target would leave
        # the link dangling
        candidates = [file_path for file_path in files_paths if not os.path.islink(file_path)]

        # Collect the modification dates, skipping files that vanished or
        # became unreadable since they were hashed
        files_modif_dates = []
        for file_path in candidates:
            try:
                files_modif_dates.append((file_path, os.path.getmtime(file_path)))
            except OSError as e:
                print(f"[!] Error reading file {file_path}: {e}.")

        if len(files_modif_dates) < 2:
            # Nothing left to compare against: not a duplicate group
            continue

        # Sort files by modification date, from newest to oldest
        files_modif_dates.sort(key = lambda x: x[1], reverse = True)

        # Keep the most recent file, delete the others
        file_to_keep = files_modif_dates[0][0]
        files_to_delete = [f[0] for f in files_modif_dates[1:]]

        print(f"[*] Duplicate files with MD5 sum \"{md5}\":")
        print(f"[+] π {file_to_keep} - Kept.")

        for file_path in files_to_delete:
            try:
                os.remove(file_path)
            except OSError as e:
                # One failed deletion must not abort the rest of the run
                print(f"[!] Error deleting file {file_path}: {e}.")
            else:
                print(f"[-] π {file_path} - Deleted.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment