Last active
July 27, 2023 08:16
-
-
Save cjdd3b/ca9b0ebef0395a5f7793 to your computer and use it in GitHub Desktop.
Example of perceptual hashing for near-duplicate image detection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
cluster.py

Uses the Hamming distance between perceptual hashes to surface near-duplicate
images.

To install and run:

    1. pip install imagehash
    2. Put some .dat files in a folder someplace (script assumes ./data/imgs/*.dat)
    3. python cluster.py

You can adjust the HAMMING_THRESH global to be larger if you want near, but not
identical, dupes.

More information:

    Hamming distance: https://en.wikipedia.org/wiki/Hamming_distance
    dhash: http://blog.iconfinder.com/detecting-duplicate-images-using-python/
'''
import base64, glob, cStringIO, imagehash, itertools, time | |
from PIL import Image | |
########## GLOBALS ##########

# Glob pattern for the image files to search for dupes (where all your ads
# are). It is handed to get_image_hashes(), which expands it with glob.iglob().
IMAGE_DIR = './data/imgs/*.dat'

# The image you want to test for dupes
TEST_IMAGE = './data/test/noname-espn.go.com-2016-02-09T17.05.13-05.00-965774372.dat'

# Maximum normalized Hamming distance (0.0 - 1.0) at which two hashes still
# count as a match. 0.0 accepts only identical hashes; raise it to surface
# near, but not identical, dupes.
HAMMING_THRESH = 0.0
########## DISTANCE METRICS ########## | |
def hamming(s1, s2):
    '''
    Calculate the normalized Hamming distance between two strings.

    The result is the fraction of positions at which the two equal-length
    sequences differ: 0.0 means they are identical, 1.0 means they differ at
    every position. Two empty sequences are treated as identical (0.0).

    Raises ValueError if the inputs do not have the same length.
    '''
    length = len(s1)
    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, after which zip() would silently truncate to the shorter
    # input and return a misleading distance.
    if length != len(s2):
        raise ValueError(
            'hamming() requires equal-length inputs (got %d and %d)'
            % (length, len(s2)))
    # Nothing to compare, and dividing by the length would fail.
    if length == 0:
        return 0.0
    mismatches = sum(c1 != c2 for c1, c2 in zip(s1, s2))
    return float(mismatches) / float(length)
########## IMAGE PROCESSING ########## | |
def convert_image(imgpath):
    '''
    Convert image from base64-encoded string to PIL image object.

    Reads the file at `imgpath`, drops a data-URI header
    ("data:<mediatype>;base64,") if one is present, base64-decodes the rest
    and hands the bytes to PIL.

    Returns the PIL image, or None (after printing a message) when the
    payload cannot be decoded or opened as an image. Errors raised while
    opening or reading the file itself are not caught here.
    '''
    # Context manager so the handle is closed right away; this function is
    # called once per file in the image directory.
    with open(imgpath, 'r') as f:
        imgstring = f.read()

    # Strip the data-URI header for any media type, not only image/jpeg.
    # A comma is not part of the base64 alphabet, so ';base64,' can only
    # appear in the header, never in the payload.
    _, marker, payload = imgstring.partition(';base64,')
    if marker:
        imgstring = payload

    try:
        decoded = cStringIO.StringIO(base64.b64decode(imgstring))
        return Image.open(decoded)
    except (IOError, TypeError, ValueError):
        # IOError: the decoded bytes could not be opened as an image.
        # TypeError / ValueError: malformed base64 (which one is raised
        # depends on the Python version).
        print('Could not decode %s' % imgpath)
        return None
def get_image_hashes(path):
    '''
    Generator over the files matching the glob pattern `path`.

    Each match is passed through convert_image(); files for which it returns
    a falsy value are skipped. For every other file an (image, hash) tuple is
    yielded, where hash is the string form of imagehash.dhash(image).
    '''
    for filename in glob.iglob(path):
        img = convert_image(filename)
        if not img:
            # Conversion failed; move on to the next file.
            continue
        digest = str(imagehash.dhash(img))
        yield (img, digest)
########## MAIN ########### | |
if __name__ == '__main__':
    # Load up a test image to look for dupes against
    test_image = convert_image(TEST_IMAGE)

    # convert_image() returns None when the file cannot be decoded. Stop with
    # a clear message rather than failing inside imagehash.dhash(None).
    if test_image is None:
        raise SystemExit('Could not load test image %s' % TEST_IMAGE)

    test_hash = str(imagehash.dhash(test_image))

    # Loop over all the ads to look for dupes, based on HAMMING_THRESH
    for im, h in get_image_hashes(IMAGE_DIR):
        # This will show the other Prius ads because a Hamming distance of 0
        # means two things are identical.
        if hamming(h, test_hash) <= HAMMING_THRESH:
            im.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment