badjano · November 25, 2020 18:06
diff --git a/img_duplicates.py b/img_duplicates.py
 import hashlib
 from glob import glob
 import os
 import cv2
 import subprocess


 def md5_for_file(path, chunk_size=1 << 20):
    """Return the raw MD5 digest of the file at ``path``.

    The file is read in ``chunk_size``-byte pieces so that a large file is
    never held in memory in full.

    Args:
        path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration.

    Returns:
        The digest as ``bytes`` (the result of ``hashlib.md5().digest()``).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        # iter() with a sentinel stops at the empty read that marks EOF.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.digest()


 # Cache, for every matching image, the data the comparison pass needs:
 # path -> (file size in bytes, flattened normalised histogram, MD5 digest).
 folder_pattern = "datasets\\*\\*\\*.png"
 size_map = {}
 files = glob(folder_pattern)
 last_index = len(files) - 1
 # The progress line is refreshed about 300 times.  With fewer than ~300 files
 # the raw step is 0 and `index % 0` raises ZeroDivisionError, so clamp it to 1.
 progress_step = max(1, last_index // 300)
 for index, path in enumerate(files):
    size = os.path.getsize(path)
    image = cv2.imread(path)
    # 8 bins per channel over channels 0, 1 and 2, each covering [0, 256)
    hist = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    hist = cv2.normalize(hist, hist).flatten()
    h = md5_for_file(path)
    size_map[path] = (size, hist, h)
    if index % progress_step == 0 or index == last_index:
        # A single file gives last_index == 0; report 100% instead of dividing by zero.
        percent = index * 100 / last_index if last_index else 100.0
        msg = "\rReading files and caching data for comparison: %.02f%%" % percent
        print(msg, flush=True, end='' if index < last_index else "\n")

 # Rebuild the cache so that iteration runs from the largest file size down.
 size_map = dict(sorted(size_map.items(), key=lambda entry: -entry[1][0]))

 # Start values for the comparison state: no previous entry, no duplicates
 # found yet, and no similarity scores computed yet.
 last, duplicate, all_sims = (), [], {}


 def show_image(path):
    """Run the shell ``start`` command on the image at ``path``.

    Args:
        path: Path of the image file to hand to ``start``.
    """
    # Quote the path so that spaces do not split it into several arguments.
    # `start` treats its first quoted argument as a window title, hence the
    # empty "" placeholder in front of the path.
    subprocess.call(f'start "" "{path}"', shell=True)


 # Comparison pass over the cached entries, in size_map's iteration order.
 # An entry is an exact duplicate when its size and MD5 equal those of `last`
 # (the most recent entry that went through the histogram branch).  Otherwise
 # its histogram is correlated against every other entry not yet flagged, and
 # any entry scoring above the threshold is flagged as a duplicate.
 duplicate_set = set(duplicate)  # O(1) membership; `duplicate` keeps the order
 compare_last = len(size_map) - 1
 # Clamp the refresh step to 1: with fewer than ~300 entries the raw step is 0
 # and `i % 0` raises ZeroDivisionError.
 compare_step = max(1, compare_last // 300)
 for i, key in enumerate(size_map):
    size, hist, md5 = size_map[key]
    if key not in duplicate_set:
        if last and size == last[1] and md5 == last[3]:
            duplicate.append(key)
            duplicate_set.add(key)
        else:
            for j, k in enumerate(size_map):
                if i != j and k not in duplicate_set:
                    # Order-independent pair key so each pair is scored once;
                    # a pair that was already scored is not re-checked.
                    dif_key = "%s %s" % tuple(sorted([key, k]))
                    if dif_key not in all_sims:
                        similarity = all_sims[dif_key] = cv2.compareHist(hist, size_map[k][1], cv2.HISTCMP_CORREL)
                        if similarity > .998:  # threshold
                            duplicate.append(k)
                            duplicate_set.add(k)

            last = (key, size, hist, md5)
    if i % compare_step == 0 or i == compare_last:
        # A single entry gives compare_last == 0; report 100% instead of dividing by zero.
        percent = i * 100 / compare_last if compare_last else 100.0
        msg = "\rComparing files: %.02f%%" % percent
        print(msg, flush=True, end='' if i < compare_last else "\n")

 # Move every flagged file into the duplicates folder as img_<n><extension>.
 remove_folder = "duplicates"
 # exist_ok replaces the separate os.path.exists() check, which could race
 # with another process creating the folder in between.
 os.makedirs(remove_folder, exist_ok=True)

 for i, file in enumerate(duplicate):
    # splitext takes the extension from the end of the path; splitting the
    # whole path on '.' picks the wrong piece as soon as a directory name or
    # the file stem contains a dot.
    extension = os.path.splitext(file)[1]
    os.rename(file, os.path.join(remove_folder, f"img_{i}{extension}"))
	import hashlib
	from glob import glob
	import os
	import cv2
	import subprocess


	def md5_for_file(path, chunk_size=1 << 20):
	    """Return the raw MD5 digest of the file at ``path``.

	    The file is read in ``chunk_size``-byte pieces so that a large file is
	    never held in memory in full.

	    Args:
	        path: Path of the file to hash.
	        chunk_size: Number of bytes read per iteration.

	    Returns:
	        The digest as ``bytes`` (the result of ``hashlib.md5().digest()``).

	    Raises:
	        OSError: If the file cannot be opened or read.
	    """
	    md5 = hashlib.md5()
	    with open(path, 'rb') as f:
	        # iter() with a sentinel stops at the empty read that marks EOF.
	        for chunk in iter(lambda: f.read(chunk_size), b''):
	            md5.update(chunk)
	    return md5.digest()


	# Cache, for every matching image, the data the comparison pass needs:
	# path -> (file size in bytes, flattened normalised histogram, MD5 digest).
	folder_pattern = "datasets\\*\\*\\*.png"
	size_map = {}
	files = glob(folder_pattern)
	last_index = len(files) - 1
	# The progress line is refreshed about 300 times.  With fewer than ~300 files
	# the raw step is 0 and `index % 0` raises ZeroDivisionError, so clamp it to 1.
	progress_step = max(1, last_index // 300)
	for index, path in enumerate(files):
	    size = os.path.getsize(path)
	    image = cv2.imread(path)
	    # 8 bins per channel over channels 0, 1 and 2, each covering [0, 256)
	    hist = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
	    hist = cv2.normalize(hist, hist).flatten()
	    h = md5_for_file(path)
	    size_map[path] = (size, hist, h)
	    if index % progress_step == 0 or index == last_index:
	        # A single file gives last_index == 0; report 100% instead of dividing by zero.
	        percent = index * 100 / last_index if last_index else 100.0
	        msg = "\rReading files and caching data for comparison: %.02f%%" % percent
	        print(msg, flush=True, end='' if index < last_index else "\n")

	# Rebuild the cache so that iteration runs from the largest file size down.
	size_map = dict(sorted(size_map.items(), key=lambda entry: -entry[1][0]))

	# Start values for the comparison state: no previous entry, no duplicates
	# found yet, and no similarity scores computed yet.
	last, duplicate, all_sims = (), [], {}


	def show_image(path):
	    """Run the shell ``start`` command on the image at ``path``.

	    Args:
	        path: Path of the image file to hand to ``start``.
	    """
	    # Quote the path so that spaces do not split it into several arguments.
	    # `start` treats its first quoted argument as a window title, hence the
	    # empty "" placeholder in front of the path.
	    subprocess.call(f'start "" "{path}"', shell=True)


	# Comparison pass over the cached entries, in size_map's iteration order.
	# An entry is an exact duplicate when its size and MD5 equal those of `last`
	# (the most recent entry that went through the histogram branch).  Otherwise
	# its histogram is correlated against every other entry not yet flagged, and
	# any entry scoring above the threshold is flagged as a duplicate.
	duplicate_set = set(duplicate)  # O(1) membership; `duplicate` keeps the order
	compare_last = len(size_map) - 1
	# Clamp the refresh step to 1: with fewer than ~300 entries the raw step is 0
	# and `i % 0` raises ZeroDivisionError.
	compare_step = max(1, compare_last // 300)
	for i, key in enumerate(size_map):
	    size, hist, md5 = size_map[key]
	    if key not in duplicate_set:
	        if last and size == last[1] and md5 == last[3]:
	            duplicate.append(key)
	            duplicate_set.add(key)
	        else:
	            for j, k in enumerate(size_map):
	                if i != j and k not in duplicate_set:
	                    # Order-independent pair key so each pair is scored once;
	                    # a pair that was already scored is not re-checked.
	                    dif_key = "%s %s" % tuple(sorted([key, k]))
	                    if dif_key not in all_sims:
	                        similarity = all_sims[dif_key] = cv2.compareHist(hist, size_map[k][1], cv2.HISTCMP_CORREL)
	                        if similarity > .998:  # threshold
	                            duplicate.append(k)
	                            duplicate_set.add(k)

	            last = (key, size, hist, md5)
	    if i % compare_step == 0 or i == compare_last:
	        # A single entry gives compare_last == 0; report 100% instead of dividing by zero.
	        percent = i * 100 / compare_last if compare_last else 100.0
	        msg = "\rComparing files: %.02f%%" % percent
	        print(msg, flush=True, end='' if i < compare_last else "\n")

	# Move every flagged file into the duplicates folder as img_<n><extension>.
	remove_folder = "duplicates"
	# exist_ok replaces the separate os.path.exists() check, which could race
	# with another process creating the folder in between.
	os.makedirs(remove_folder, exist_ok=True)

	for i, file in enumerate(duplicate):
	    # splitext takes the extension from the end of the path; splitting the
	    # whole path on '.' picks the wrong piece as soon as a directory name or
	    # the file stem contains a dot.
	    extension = os.path.splitext(file)[1]
	    os.rename(file, os.path.join(remove_folder, f"img_{i}{extension}"))