Last active
September 26, 2024 18:23
-
-
Save ezirmusitua/1aa47567ad4ebd5679f9e3df09585e17 to your computer and use it in GitHub Desktop.
[Group similar images] Group similar images using phash algorithm #python #image #algorithm #tools
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script is used to group similar images from a source folder
and save them to the same output folder, based on the perceptual hash algorithm.
Works on Python 3.6 and win10-build-17074
Prerequisite: Pillow
Author: [email protected] | |
""" | |
import hashlib | |
import mimetypes | |
import os | |
import shutil | |
from PIL import Image | |
# Default size (in pixels) every image is resized to before it is hashed;
# used as the default arguments of get_gray_scale_image_data and hash_image.
Default_Resize_Width = Default_Resize_Height = 80
# Two images belong to the same group when the number of differing hash bits
# is strictly below this value (see ImageToGroup.is_same_group).
Same_Image_Value = 600
def get_images_name(dir):
    """
    Get the filename and file path of every image in the given directory

    Only regular files whose name maps to an ``image/*`` MIME type are
    returned. Names whose type cannot be guessed (no extension or an unknown
    one) are skipped instead of being reported as images. Entries are visited
    in sorted order, so the result does not depend on the arbitrary order of
    ``os.listdir``.

    :param dir: input directory to get images
    :return: filenames
    :rtype: a list of tuple(filename, file_path)
    """
    filenames = []
    for f in sorted(os.listdir(dir)):
        content_type = mimetypes.guess_type(f)[0]
        # guess_type yields None for unknown types: treat that as "not an image"
        if not content_type or not content_type.startswith('image/'):
            continue
        file_path = os.path.join(dir, f)
        # a directory may carry an image-like name, keep regular files only
        if os.path.isfile(file_path):
            filenames.append((f, file_path))
    return filenames
def get_gray_scale_image_data(path, resize_width=Default_Resize_Width,
                              resize_heith=Default_Resize_Height):
    """
    Get gray scale image data in given path

    The image is first resized to ``(resize_width, resize_heith)`` and then
    converted to mode ``'L'``, in that order.

    :param path: The image source path
    :param resize_width: After read image resize width
    :param resize_heith: After read image resize height
    :return: image data
    :rtype: http://pillow.readthedocs.io/en/latest/reference/Image.html#PIL.Image.Image.getdata
    """
    # Image.open holds the underlying file open; the context manager makes
    # sure it is released once the resized copy has been produced.
    with Image.open(path) as im:
        smaller_image = im.resize((resize_width, resize_heith))
    # resize() returned a new image, so it stays usable after the source closes
    grayscale_image = smaller_image.convert('L')
    return grayscale_image.getdata()
def hash_image(path, resize_width=Default_Resize_Width,
               resize_heith=Default_Resize_Height):
    """
    Hash given image in path

    Every pixel of the resized gray scale image is compared with its right
    neighbour in the same row: the bit is 1 when the left pixel is brighter,
    otherwise 0. The bits are read as one binary number.

    :param path: The image source path
    :param resize_width: After read image resize width
    :param resize_heith: After read image resize height
    :return: integer hash value (0 when there is no pixel pair to compare,
             e.g. resize_width == 1)
    """
    pixels = list(get_gray_scale_image_data(path, resize_width, resize_heith))
    bits = [
        '1' if pixels[index - 1] > pixels[index] else '0'
        for index in range(1, len(pixels))
        # a multiple of the width is the first pixel of the next row: it has
        # no left neighbour in its own row, so it is not compared
        if index % resize_width
    ]
    if not bits:
        # int('', 2) would raise ValueError
        return 0
    return int(''.join(bits), 2)
def difference_hash(dhash1, dhash2):
    """
    Count the bits in which two image hashes differ
    :param: dhash1: image hash value 1
    :param: dhash2: image hash value 2
    :return: number of '1' digits in the binary form of dhash1 XOR dhash2
    """
    differing_bits = dhash1 ^ dhash2
    return format(differing_bits, 'b').count('1')
class ImageToGroup(object):
    """
    Image to group class

    Holds the hash of one source image, where the image lives, and a
    reference to the root image of the group it was assigned to (if any).
    """

    def __init__(self, filename, path):
        """
        :param filename: image filename, reused as the name of the copy
        :param path: image original path
        """
        # image hash value
        self.hash = hash_image(path)
        # image original path
        self.path = path
        # image filename
        self.filename = filename
        # image path(for saving grouped image): md5 hex digest of the hash
        self.hash_path = hashlib.md5(str(self.hash).encode()).hexdigest()
        # parent image ref, None while the image is the root of its own group
        self.root = None

    def is_same_group(self, i2):
        """
        Tell whether another image is similar enough to share a group
        :param i2: the other ImageToGroup
        :return: True when the hashes differ in fewer than Same_Image_Value bits
        """
        return difference_hash(self.hash, i2.hash) < Same_Image_Value

    @property
    def root_rpath(self):
        """
        Directory name of the group this image belongs to
        :return: the root's hash_path, or own hash_path if self is the root
        """
        if self.root is None:
            return self.hash_path
        return self.root.hash_path

    def create_root_dir(self, base):
        """
        Create the group directory under base if it does not exist yet
        :param base: output base directory
        """
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(os.path.join(base, self.root_rpath), exist_ok=True)

    def copy_to_root_dir(self, base):
        """
        Copy image from source to its group directory and print the target
        :param base: output base directory
        """
        target = os.path.join(base, self.root_rpath, self.filename)
        print(target)
        shutil.copyfile(self.path, target)

    def copy(self, base):
        """
        Do copy work: make sure the group directory exists, then copy
        :param base: output base directory
        """
        self.create_root_dir(base)
        self.copy_to_root_dir(base)
class Groupter(object):
    """
    Do the group work

    Works on a list of image objects exposing ``root``, ``is_same_group``,
    ``copy``, ``path`` and ``filename``.
    """

    def __init__(self, images_to_group):
        """
        :param images_to_group: list of images to group
        """
        self.images_to_group = images_to_group

    def group(self):
        """
        Group all images by setting the ``root`` of every similar image
        :return: self, so calls can be chained
        """
        images = self.images_to_group
        for i, i1 in enumerate(images):
            # an image that already belongs to a group does not start its own
            if i1.root:
                continue
            for i2 in images[i + 1:]:
                if i1.is_same_group(i2):
                    i2.root = i1
        return self

    def dump_group(self, path):
        """
        Save all grouped images, one directory per group
        :param path: output base directory
        """
        for image in self.images_to_group:
            image.copy(path)

    def remove(self, path):
        """
        Copy only the root image of every group into path
        :param path: output directory, created when missing
        """
        # copyfile does not create directories; dump_group creates them
        # through image.copy, so do the same here
        if path:
            os.makedirs(path, exist_ok=True)
        for image in self.images_to_group:
            if not image.root:
                shutil.copyfile(image.path, os.path.join(path, image.filename))
def main(indir, odir, action='group'):
    """
    Main entrypoint: collect the images of indir, group them and write to odir
    :param indir: directory holding the source images
    :param odir: output directory
    :param action: 'group' copies every image into the directory of its
                   group; any other value copies only the root images
    """
    # build one ImageToGroup per image found, then group them
    images = [ImageToGroup(name, path) for name, path in get_images_name(indir)]
    groupter = Groupter(images).group()
    # save according to the requested action
    if action != 'group':
        groupter.remove(odir)
        return
    groupter.dump_group(odir)
if __name__ == '__main__':
    import sys
    # both directories are required; explain the call instead of failing
    # with an IndexError
    if len(sys.argv) < 3:
        sys.exit('usage: python {} "/images/dir" "/grouped/to/save" [action]'
                 .format(sys.argv[0]))
    images_dir = sys.argv[1]
    output_dir = sys.argv[2]
    # the action is optional and defaults to 'group', like main() itself
    action = sys.argv[3] if len(sys.argv) > 3 else 'group'
    main(images_dir, output_dir, action)
How to use:
python group-same-image.py "/images/dir" "/grouped/to/save"
python group-same-image.py "/images/dir" "/grouped/to/save" group
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How to use: