Last active
February 24, 2019 19:48
-
-
Save u8sand/bcdea43f1d8b89183cdd5dfade58914c to your computer and use it in GitHub Desktop.
A Python script for dealing with duplicate images
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import glob
import os
import shlex
import subprocess
import sys
from collections import defaultdict
from functools import reduce
from itertools import chain

import imagehash
from PIL import Image
def hash_image(path):
    """Return the difference hash (dHash) of the image at *path* as a string.

    Raises:
        OSError: if the file cannot be opened (presumably also when Pillow
            cannot identify it as an image -- recent Pillow versions raise
            an OSError subclass for that; confirm for the installed version).
    """
    # The context manager releases the file handle as soon as the hash is
    # computed instead of leaving one open handle per scanned image.
    with Image.open(path) as img:
        return str(imagehash.dhash(img))


def group_by_hash(image_to_hash):
    """Invert a ``{path: hash}`` mapping into ``{hash: [paths, ...]}``.

    Paths keep the iteration order of *image_to_hash* within each group.
    """
    groups = defaultdict(list)
    for path, digest in image_to_hash.items():
        groups[digest].append(path)
    return dict(groups)


def split_keeper(duplicate_images, getsize=os.path.getsize):
    """Split a group of duplicates into ``(keeper, extras)``.

    The keeper is the largest file; ties are broken by the path that sorts
    last, because ``(size, path)`` tuples are sorted in descending order.
    *extras* holds the remaining paths in that same descending order.

    Args:
        duplicate_images: non-empty iterable of file paths.
        getsize: callable mapping a path to its size in bytes.
    """
    ranked = sorted(
        ((getsize(path), path) for path in duplicate_images),
        reverse=True,
    )
    keeper = ranked[0][1]
    extras = [path for _, path in ranked[1:]]
    return keeper, extras


def build_command(func, images):
    """Return the argument list that runs *func* on *images*.

    *func* is split with shell-like rules so that a value such as
    ``"rm -f"`` still works, but the image paths are appended as literal
    arguments and are never interpreted by a shell.

    Raises:
        ValueError: if *func* contains no command (or has unbalanced quotes).
    """
    prefix = shlex.split(func)
    if not prefix:
        raise ValueError('func must name a command to run')
    return prefix + list(images)


def main(argv=None):
    """Find duplicate images matched by globs and run a command on the extras.

    Args:
        argv: argument vector ``[prog, func, glob, ...]``; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 2 on a usage error.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) <= 2:
        print('Usage: {cmd} [func] [globs ...]'.format(cmd=argv[0]))
        print('Example: {cmd} trash-rm "**/*.jpg"'.format(cmd=argv[0]))
        # A usage error must not report success to the calling shell.
        return 2

    func = argv[1]
    globs = argv[2:]

    # Validate the command before the expensive hashing pass.
    try:
        build_command(func, [])
    except ValueError as err:
        print('Invalid func {func!r}: {err}'.format(func=func, err=err), file=sys.stderr)
        return 2

    print('Processing globs from command line {globs}'.format(globs=str(globs)), file=sys.stderr)

    # recursive=True is required for "**" (as in the usage example) to
    # descend into subdirectories; without it "**" behaves like "*".
    all_images = chain.from_iterable(
        glob.iglob(pattern, recursive=True) for pattern in globs
    )
    image_to_hash = {}
    for path in all_images:
        if path in image_to_hash:
            # Overlapping globs may yield the same path more than once.
            continue
        try:
            image_to_hash[path] = hash_image(path)
        except OSError as err:
            # One unreadable or non-image match should not abort the run.
            print('Skipping {path}: {err}'.format(path=path, err=err), file=sys.stderr)

    for duplicate_images in group_by_hash(image_to_hash).values():
        if len(duplicate_images) < 2:
            continue
        keeper, extras = split_keeper(duplicate_images)
        cmd = build_command(func, extras)
        print('duplicates for {image}'.format(image=keeper), file=sys.stderr)
        print(' {cmd}'.format(cmd=' '.join(shlex.quote(arg) for arg in cmd)), file=sys.stderr)
        try:
            # Argument list, no shell: odd characters in file names cannot
            # break or inject into the command.
            subprocess.run(cmd)
        except OSError as err:
            print('Failed to run {func}: {err}'.format(func=func, err=err), file=sys.stderr)
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment