# USAGE
# python detect_and_remove.py --dataset dataset
# python detect_and_remove.py --dataset dataset --remove 1

# import the necessary packages
from imutils import paths
import numpy as np
import argparse
import cv2
import os

def dhash(image, hashSize=8):
    # convert the image to grayscale and resize the grayscale image,
    # adding a single column (width) so we can compute the horizontal
    # gradient
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    resized = cv2.resize(gray, (hashSize + 1, hashSize))

    # compute the (relative) horizontal gradient between adjacent
    # column pixels
    diff = resized[:, 1:] > resized[:, :-1]

    # convert the difference image to a hash and return it
    return sum([2 ** i for (i, v) in enumerate(diff.flatten()) if v])
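
# a quick illustration of the hashing step (not executed; values made up
# purely for this sketch): with hashSize=2, resized has 2 rows x 3 columns,
# e.g.
#   resized = [[10, 20,  5],
#              [ 7,  3, 30]]
# so diff = [[True, False], [False, True]], diff.flatten() sets bits 0 and 3,
# and the hash is 2**0 + 2**3 = 9; with the default hashSize=8 the same
# scheme packs 64 comparisons into a 64-bit integer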

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-d", "--dataset", required=True,
    help="path to input dataset")
ap.add_argument("-r", "--remove", type=int, default=-1,
    help="whether or not duplicates should be removed (i.e., dry run)")
args = vars(ap.parse_args())
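
# for reference: `python detect_and_remove.py --dataset dataset --remove 1`
# leaves args == {"dataset": "dataset", "remove": 1}; omitting --remove keeps
# the default of -1, so the pass below runs as a dry run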

# grab the paths to all images in our input dataset directory and
# then initialize our hashes dictionary
print("[INFO] computing image hashes...")
imagePaths = list(paths.list_images(args["dataset"]))
hashes = {}

# loop over our image paths
for (i, imagePath) in enumerate(imagePaths, start=1):
    # load the input image, skipping any file OpenCV cannot decode,
    # then compute the hash
    image = cv2.imread(imagePath)
    if image is None:
        print("[WARN] unable to read {}, skipping".format(imagePath))
        continue
    h = dhash(image)

    # grab all image paths with that hash, add the current image
    # path to it, and store the list back in the hashes dictionary
    p = hashes.get(h, [])
    p.append(imagePath)
    hashes[h] = p

    # report progress periodically on large datasets
    if i % 1000 == 0:
        print("[INFO] processed {} images".format(i))

# loop over the image hashes
for (h, hashedPaths) in hashes.items():
    # check to see if there is more than one image with the same hash
    if len(hashedPaths) > 1:
        # check to see if this is a dry run
        if args["remove"] <= 0:
            # initialize a montage to store all images with the same
            # hash
            montage = None

            # loop over all image paths with the same hash
            for p in hashedPaths:
                # load the input image and resize it to a fixed width
                # and height
                image = cv2.imread(p)
                image = cv2.resize(image, (150, 150))

                # if our montage is None, initialize it
                if montage is None:
                    montage = image

                # otherwise, horizontally stack the images
                else:
                    montage = np.hstack([montage, image])

            # show the montage for the hash
            print("[INFO] hash: {}".format(h))
            cv2.imshow("Montage", montage)
            cv2.waitKey(0)

        # otherwise, we'll be removing the duplicate images
        else:
            # loop over all image paths with the same hash *except*
            # for the first image in the list (since we want to keep
            # one, and only one, of the duplicate images)
            for p in hashedPaths[1:]:
                os.remove(p)
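
# after a removal pass (--remove 1), re-running without --remove should show
# no montages, since each surviving hash should map to exactly one file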