Last active
October 22, 2018 11:17
-
-
Save verhovsky/ad4d401fed0d088bd342a6a3324ae219 to your computer and use it in GitHub Desktop.
A script that uses perceptual hashing (pHash) to find all the duplicated images in a directory and delete them.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Find all the duplicated images in a directory using perceptual hashing and
delete them.

The directory is taken from the first command-line argument and defaults to
the current directory. For every group of images that share the same
perceptual hash, the first one encountered is kept and the others are deleted.
Entries that cannot be opened as images are skipped.

To install dependencies:
$ pip install imagehash Pillow

Usage: ./delete_duplicate_images.py [directory containing the images]
"""
import sys
from pathlib import Path

from imagehash import phash
from PIL import Image

# Directory to scan: first CLI argument, or the current directory if omitted.
directory = Path(sys.argv[1] if len(sys.argv) > 1 else ".")

# Maps each perceptual hash seen so far to the first file that produced it.
seen_hashes = {}

for filename in directory.iterdir():
    # Keep the try body limited to opening/hashing, so that only "this is not
    # a readable image" errors are treated as a reason to skip the entry.
    try:
        # The context manager closes the underlying file handle before the
        # file is possibly deleted below.
        with Image.open(filename) as image:
            hash_ = phash(image)
    except OSError:
        continue  # skip files that can't be opened as images

    if hash_ not in seen_hashes:
        seen_hashes[hash_] = filename
        continue

    print('deleting {}, it\'s the same as {}'.format(filename, seen_hashes[hash_]))
    try:
        filename.unlink()
    except OSError as error:
        # Report the failure instead of hiding it, but keep processing the
        # remaining files.
        print('could not delete {}: {}'.format(filename, error), file=sys.stderr)

# An `else` clause on the loop would run after every scan (the loop never
# breaks), so test explicitly whether any image was hashed.
if not seen_hashes:
    print('no images found in {}'.format(directory.absolute()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment