Last active
November 25, 2020 18:06
-
-
Save badjano/0723e742bd94a68cc2479717f79653a0 to your computer and use it in GitHub Desktop.
Search for duplicate images in folders and subfolders, keeping the copy with the largest file size.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib
import os
import subprocess
import sys
from glob import glob

import cv2
def md5_for_file(path, chunk_size=65536):
    """Return the raw MD5 digest (bytes) of the file at *path*.

    The file is hashed in blocks of *chunk_size* bytes so that large images
    are never loaded into memory in one piece. The result is identical to
    hashing the whole content at once.

    Raises OSError if the file cannot be opened or read.
    """
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        # iter() with a sentinel keeps calling f.read() until it returns b"",
        # i.e. until end of file.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5.update(chunk)
    return md5.digest()
# Pass 1: collect, for every image matched by the pattern, the data the
# comparison pass needs: file size, a normalised colour histogram and the MD5
# digest of the file content.

# PNG files exactly two directory levels below "datasets". Built with
# os.path.join so the separator fits the current OS; on Windows this yields
# the same string as the literal "datasets\\*\\*\\*.png".
folder_pattern = os.path.join("datasets", "*", "*", "*.png")
size_map = {}  # path -> (size in bytes, flattened histogram, md5 digest)
files = glob(folder_pattern)
last_index = len(files) - 1
# Refresh the progress line about 300 times. The step must never be 0,
# otherwise the modulo below raises ZeroDivisionError for small file sets.
progress_step = max(1, last_index // 300)
unreadable = []
for index, path in enumerate(files):
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals an unreadable/corrupt file by returning None;
        # calcHist would fail on it, so remember the file and move on.
        unreadable.append(path)
    else:
        size = os.path.getsize(path)
        # 8x8x8-bin histogram over the three colour channels, normalised and
        # flattened so two images can be compared with cv2.compareHist.
        hist = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
        hist = cv2.normalize(hist, hist).flatten()
        h = md5_for_file(path)
        size_map[path] = (size, hist, h)
    if index % progress_step == 0 or index == last_index:
        # With a single file last_index is 0; report 100% instead of dividing.
        percent = index * 100 / last_index if last_index else 100.0
        msg = "\rReading files and caching data for comparison: %.02f%%" % percent
        print(msg, flush=True, end='' if index < last_index else "\n")
for skipped_path in unreadable:
    print("Skipped unreadable image: %s" % skipped_path)
# The comparison pass further down reuses last_index for its own progress
# output, so keep it in step with the number of images actually cached.
last_index = len(size_map) - 1
# Re-create the cache ordered from the largest file to the smallest, so the
# biggest copy of an image is always visited first. Entries with equal size
# keep their previous relative order (the sort is stable).
size_map = dict(sorted(size_map.items(), key=lambda entry: entry[1][0], reverse=True))

# State shared with the comparison pass:
last = tuple()      # (key, size, hist, md5) of the previously visited file
duplicate = list()  # paths judged to be duplicates, in detection order
all_sims = dict()   # "<path> <path>" pair key -> histogram similarity
def show_image(path):
    """Open the image at *path* with the system's default application.

    The path is never pasted into a shell command line, so file names with
    spaces or characters such as ``&`` are handled safely.

    Raises OSError if the file or the opener program cannot be found.
    """
    if hasattr(os, "startfile"):
        # Windows only: behaves like passing the file to the `start` command.
        os.startfile(path)
        return
    # Elsewhere, hand the path to the desktop's opener as a single argument.
    opener = "open" if sys.platform == "darwin" else "xdg-open"
    subprocess.call([opener, path])
# Pass 2: visit the cached images in size_map order (sorted largest-first
# above) and append every file judged to be a copy of another one to
# `duplicate`.
final_index = len(size_map) - 1
# Refresh the progress line about 300 times. The step must never be 0,
# otherwise the modulo below raises ZeroDivisionError for small file sets.
progress_step = max(1, final_index // 300)
# Set mirror of `duplicate` for O(1) membership tests; the list is still
# filled because its order decides the names of the moved files.
duplicate_lookup = set(duplicate)

for i, key in enumerate(size_map):
    size, hist, md5 = size_map[key]
    if key not in duplicate_lookup:
        if last and size == last[1] and md5 == last[3]:
            # Same size and same digest as the previously visited file:
            # a byte-identical copy, no histogram comparison needed.
            duplicate.append(key)
            duplicate_lookup.add(key)
        else:
            for j, k in enumerate(size_map):
                if i == j or k in duplicate_lookup:
                    continue
                # Order-independent key so each pair is compared only once.
                dif_key = "%s %s" % tuple(sorted([key, k]))
                if dif_key in all_sims:
                    continue
                similarity = all_sims[dif_key] = cv2.compareHist(hist, size_map[k][1], cv2.HISTCMP_CORREL)
                if similarity > .998:  # threshold
                    duplicate.append(k)
                    duplicate_lookup.add(k)
    last = (key, size, hist, md5)
    if i % progress_step == 0 or i == final_index:
        # With a single file final_index is 0; report 100% instead of dividing.
        percent = i * 100 / final_index if final_index else 100.0
        msg = "\rComparing files: %.02f%%" % percent
        print(msg, flush=True, end='' if i < final_index else "\n")
# Pass 3: move every detected duplicate into `remove_folder`, renamed to
# img_<n><original extension>.
remove_folder = "duplicates"
os.makedirs(remove_folder, exist_ok=True)
next_index = 0
for file in duplicate:
    # splitext looks only at the last dot of the final path component, so
    # dots in directory or file names cannot corrupt the extension.
    extension = os.path.splitext(file)[1]
    target = os.path.join(remove_folder, f"img_{next_index}{extension}")
    # Never overwrite files left over from an earlier run: advance to the
    # next free number instead.
    while os.path.exists(target):
        next_index += 1
        target = os.path.join(remove_folder, f"img_{next_index}{extension}")
    os.rename(file, target)
    next_index += 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment