Skip to content

Instantly share code, notes, and snippets.

@u8sand
Last active February 24, 2019 19:48
Show Gist options
  • Save u8sand/bcdea43f1d8b89183cdd5dfade58914c to your computer and use it in GitHub Desktop.
Save u8sand/bcdea43f1d8b89183cdd5dfade58914c to your computer and use it in GitHub Desktop.
A Python script for dealing with duplicate images
#!/usr/bin/env python3
import glob
import os
import shlex
import sys
from collections import defaultdict
from functools import reduce
from itertools import chain

import imagehash
from PIL import Image
def find_images(patterns):
    """Expand glob patterns into a list of distinct files.

    ``**`` is expanded recursively, as the usage example advertises. Paths
    that resolve to the same real file (repeated or overlapping patterns,
    symlinks) are reported once, so a file can never be considered a
    duplicate of itself and handed to the removal command.
    """
    seen = set()
    paths = []
    for pattern in patterns:
        for path in glob.glob(pattern, recursive=True):
            real = os.path.realpath(path)
            if real in seen:
                continue
            seen.add(real)
            paths.append(path)
    return paths


def hash_image(path):
    """Return the difference hash of the image at *path* as a string."""
    # The context manager closes the file handle as soon as hashing is done.
    with Image.open(path) as image:
        return str(imagehash.dhash(image))


def group_duplicates(paths, hasher=hash_image):
    """Map each hash shared by two or more of *paths* to those paths.

    Paths for which *hasher* raises ``OSError`` (directories, unreadable or
    non-image files) are reported on stderr and skipped instead of aborting
    the whole run.
    """
    groups = defaultdict(list)
    for path in paths:
        try:
            digest = hasher(path)
        except OSError as err:
            print('Skipping {path}: {err}'.format(path=path, err=err), file=sys.stderr)
            continue
        groups[digest].append(path)
    return {digest: members for digest, members in groups.items() if len(members) > 1}


def split_keeper(duplicate_images, size_of=os.path.getsize):
    """Return ``(keeper, redundant)`` for one group of duplicates.

    The largest file is kept; ties are broken by path, descending, which is
    the order the original tuple sort produced.
    """
    ranked = sorted(duplicate_images, key=lambda path: (size_of(path), path), reverse=True)
    return ranked[0], ranked[1:]


def build_command(func, images):
    """Build the shell command line that applies *func* to *images*.

    *func* is passed through verbatim so it may carry its own arguments;
    every image path is shell-quoted so spaces, quotes and ``$`` in file
    names cannot break or inject into the command.
    """
    imgs = ' '.join(shlex.quote(img) for img in images)
    return '{func} {imgs}'.format(func=func, imgs=imgs)


def main(argv=None):
    """Run *func* on every redundant copy of each duplicated image.

    *argv* defaults to ``sys.argv``. Returns the process exit status:
    1 when the arguments are missing, 0 otherwise.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) <= 2:
        print('Usage: {cmd} [func] [globs ...]'.format(cmd=argv[0]))
        print('Example: {cmd} trash-rm "**/*.jpg"'.format(cmd=argv[0]))
        return 1
    func = argv[1]
    globs = argv[2:]
    print('Processing globs from command line {globs}'.format(globs=str(globs)), file=sys.stderr)
    hashes_with_duplicates = group_duplicates(find_images(globs))
    for duplicate_images in hashes_with_duplicates.values():
        keeper, redundant = split_keeper(duplicate_images)
        cmd = build_command(func, redundant)
        print('duplicates for {image}'.format(image=keeper), file=sys.stderr)
        print(' {cmd}'.format(cmd=cmd), file=sys.stderr)
        os.system(cmd)
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment