Last active
May 27, 2024 21:10
-
-
Save diego021/f487214de18d235dd45d83150b50bb6c to your computer and use it in GitHub Desktop.
Deduplicate files recursively in a given directory.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Scan recursively a given directory and find duplicate files within it.
You can use this module as is by invoking it from the command shell:
~$ python3 deduplicate.py /path/to/scan
"""
import dataclasses
import hashlib
import os
@dataclasses.dataclass(frozen=True)
class FileChecksum:
    """Immutable record pairing a file path with its MD5 digest.

    Two instances compare equal when their digests match, regardless of
    path, so a set of FileChecksum objects keeps exactly one entry per
    distinct file content.
    """

    md5checksum: str
    path: str

    def __eq__(self, other) -> bool:
        """Equality is decided by digest alone; other types never match."""
        return isinstance(other, FileChecksum) and self.md5checksum == other.md5checksum

    def __hash__(self):
        """Hash consistently with __eq__: the digest determines identity."""
        return hash(self.md5checksum)
@dataclasses.dataclass
class ScanResult:
    """Accumulates the outcome of walking a directory tree.

    # visited:    every file path examined, in walk order
    # unique:     one FileChecksum per distinct content (first seen wins)
    # duplicated: checksums whose content was already present in `unique`
    """
    visited: list[str] = dataclasses.field(default_factory=list)
    unique: set[FileChecksum] = dataclasses.field(default_factory=set)
    duplicated: list[FileChecksum] = dataclasses.field(default_factory=list)

    def __str__(self):
        """Render a one-line summary of the scan counters."""
        counts = (len(self.visited), len(self.unique), len(self.duplicated))
        return 'Total visited: {}, Unique: {}, Duplicated: {}'.format(*counts)
def _compute_hash(path: str) -> str: | |
bdata = open(path, 'rb').read() | |
hash_ = hashlib.md5(bdata) | |
return hash_.hexdigest() | |
def scan_path(path: str) -> ScanResult:
    """Walk *path* recursively, hashing every file found.

    The first occurrence of each MD5 digest is kept as the unique
    reference; any later file with the same digest is recorded as a
    duplicate.

    Args:
        path: root directory of the scan.

    Returns:
        A ScanResult holding visited paths, unique checksums and
        duplicated checksums.
    """
    result = ScanResult()
    # `dirs` is not used, so name it `_dirs` to make that explicit.
    for root, _dirs, files in os.walk(path):
        for filename in files:
            abspath = os.path.join(root, filename)
            checksum = FileChecksum(
                md5checksum=_compute_hash(path=abspath),
                path=abspath,
            )
            if checksum in result.unique:
                # Same content already seen under another path.
                result.duplicated.append(checksum)
            # Adding an equal element to a set is a no-op, so the first
            # path seen for a given digest stays the unique reference.
            result.unique.add(checksum)
            result.visited.append(checksum.path)
    # Internal sanity invariant (skipped under -O): every visited file
    # is either the first of its content or a duplicate.
    assert len(result.visited) == len(result.duplicated) + len(result.unique)
    return result
def remove_duplicates(result: ScanResult) -> None:
    """Interactively delete the duplicated files recorded in *result*.

    Each duplicate is shown in red next to its unique reference in
    green, and the user decides whether to remove it. Answering ``all``
    deletes every remaining duplicate without further prompting.
    """
    response = None
    for dup in result.duplicated:
        # Once the user has answered "all", stop asking and just delete.
        if response and response.upper() == 'ALL':
            os.remove(dup.path)
            continue
        reference = next(u for u in result.unique if u == dup)
        dpath = f'\033[1;31m{dup.path}\033[0m'
        upath = f'\033[1;32m{reference.path}\033[0m'
        response = input(f'[y/N/all] Remove {dpath} with unique reference on {upath}? ')
        if response.upper() in ('Y', 'YES', 'ALL'):
            os.remove(dup.path)
if __name__ == '__main__':
    import sys

    # Fail with a usage message instead of an IndexError traceback
    # when the directory argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} /path/to/scan')
    # Normalize away a trailing slash so reported paths are consistent.
    path = sys.argv[1].rstrip('/')
    print(f'Scanning duplicates in path \033[1m{path}\033[0m ...')
    print()
    scan_result = scan_path(path)
    print(scan_result)
    print()
    remove_duplicates(scan_result)
Sign up for free to join this conversation on GitHub.
Already have an account?
Sign in to comment