Skip to content

Instantly share code, notes, and snippets.

@MartinThoma
Last active January 16, 2021 12:58
Show Gist options
  • Save MartinThoma/2f7b4d7080f2a618d566bc9d87be06b5 to your computer and use it in GitHub Desktop.
Save MartinThoma/2f7b4d7080f2a618d566bc9d87be06b5 to your computer and use it in GitHub Desktop.
import hashlib
from collections import defaultdict
from pathlib import Path
from typing import List
def find_duplicates(directory: Path) -> List[List[Path]]:
    """Group the files found beneath *directory* by identical fingerprint.

    Every entry matched by the recursive pattern ``**/*`` for which
    ``Path.is_file()`` is true is fingerprinted with ``get_fingerprint``.

    Args:
        directory: Root of the tree to scan.

    Returns:
        One list of paths per fingerprint shared by more than one file.
        Fingerprints that occur only once are left out, so the result is
        empty when no duplicates were found.
    """
    groups = defaultdict(list)
    # Lazy filter: directories and other non-file entries are never hashed.
    regular_files = (
        entry for entry in directory.glob("**/*") if entry.is_file()
    )
    for entry in regular_files:
        groups[get_fingerprint(entry)].append(entry)
    return [members for members in groups.values() if len(members) > 1]
def get_fingerprint(path: Path) -> str:
hash_md5 = hashlib.md5()
with open(path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
# Scan the current working directory (as the relative path ".") and print
# each group of identical files on its own line.
for duplicate_set in find_duplicates(directory=Path(".")):
    print(duplicate_set)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment