isagalaev · December 11, 2015 05:48 · reclosedev · Jan 17, 2013 · isagalaev · Jan 17, 2013
diff --git a/img.py b/img.py
 #!/usr/bin/env python
 # coding: utf-8

 import os
 import random
 import glob

 from PIL import Image
 import numpy


 # Side length, in pixels, of the square thumbnail every image is shrunk to
 # before comparison (used by image_data).
 BLOCK_SIZE = 20
 # A pixel counts as "different" in distance() when the absolute difference
 # of the two per-pixel values is strictly greater than this.
 THRESHOLD = 60
 # Value written into the width attribute of each <img> tag by html_group().
 WIDTH = 200
 # duplicates() keeps an image in a group only when its distance to the
 # reference image is strictly below this.
 MAX_DISTANCE = 220


 def image_data(filename):
    """
    Get data from image ready for comparison

    The image is converted to RGB, shrunk to BLOCK_SIZE x BLOCK_SIZE pixels
    with bilinear resampling, and every pixel is reduced to the sum of its
    three channel values.

    filename -- path of the image file to load.

    Returns a 1-D numpy integer array with BLOCK_SIZE * BLOCK_SIZE entries
    in row-major pixel order.
    """
    # The context manager releases the file handle as soon as the pixels
    # have been read instead of leaving it to the garbage collector.
    with Image.open(filename) as img:
        # Normalise to RGB first: greyscale and palette images would yield
        # bare ints per pixel (nothing to sum), and images with extra
        # channels would produce sums on a different scale.
        small = img.convert('RGB').resize(
            (BLOCK_SIZE, BLOCK_SIZE), Image.BILINEAR
        )
    # (height, width, 3) uint8 -> per-pixel channel sum -> flat vector.
    # dtype=int keeps the result signed so later subtraction cannot wrap.
    return numpy.asarray(small).sum(axis=2, dtype=int).ravel()

 def distance(data1, data2, threshold=None):
    """
    Logical distance between two images.

    Counts the positions at which the two data vectors differ by strictly
    more than the threshold.

    data1, data2 -- equal-length numeric arrays or sequences, such as the
                    vectors returned by image_data(). Signed values are
                    expected; unsigned arrays would wrap on subtraction.
    threshold    -- minimum absolute difference that counts as a mismatch;
                    defaults to the module-level THRESHOLD.

    Returns a plain int between 0 and the vector length (0..400 for the
    default 20x20 block).
    """
    if threshold is None:
        threshold = THRESHOLD
    delta = numpy.asarray(data1) - numpy.asarray(data2)
    # Vectorised count instead of a Python-level loop over array elements.
    return int(numpy.count_nonzero(numpy.abs(delta) > threshold))

 def duplicates(dirname):
    """
    Finds groups of similar images in a directory.

    Only files matching *.jpg directly inside dirname are considered.
    The images are processed in random order. For each image one group is
    yielded: a list of (distance, filename) tuples, sorted ascending, with
    every image whose distance to it is below MAX_DISTANCE. The image
    itself is part of its own group.
    """
    pattern = os.path.join(dirname, '*.jpg')
    images = [(path, image_data(path)) for path in glob.glob(pattern)]
    random.shuffle(images)

    for _, reference in images:
        group = []
        for path, candidate in images:
            score = distance(reference, candidate)
            if score < MAX_DISTANCE:
                group.append((score, path))
        group.sort()
        yield group

 def html_group(group, width=None):
    """
    Renders one group of similar images as a run of <img> tags.

    group -- iterable of (distance, filename) pairs.
    width -- value for the width attribute of every tag; defaults to the
             module-level WIDTH.

    Returns a string holding, for each pair, an <img> tag whose src is the
    base name of the file, immediately followed by the distance. An empty
    group gives an empty string.
    """
    # Imported locally because the module-level name ``html`` is a function
    # in this file and would shadow the standard-library module.
    from html import escape

    if width is None:
        width = WIDTH
    # Escape the file name so quotes, angle brackets or ampersands in it
    # cannot break out of the src attribute.
    return ''.join(
        '<img src="%s" width="%s"/>%s'
        % (escape(os.path.basename(f), quote=True), width, dist)
        for dist, f in group
    )

 def html(groups):
    """
    Builds a complete HTML page from groups of image duplicates.

    Each group is rendered with html_group(); the rendered groups are
    separated by <hr/> and one more <hr/> closes the page body.
    """
    sections = [html_group(members) for members in groups]
    return '<html><body>' + '<hr/>'.join(sections) + '<hr/></body></html>'


 if __name__ == '__main__':
    import sys

    # Scan the directory given as the first command-line argument; with no
    # argument fall back to the author's default location.
    if len(sys.argv) > 1:
        target = sys.argv[1]
    else:
        target = '/home/maniac/Desktop/4554182'
    print(html(duplicates(target)))
	#!/usr/bin/env python
	# coding: utf-8

	import os
	import random
	import glob

	from PIL import Image
	import numpy


	# Side length, in pixels, of the square thumbnail every image is shrunk to
	# before comparison (used by image_data).
	BLOCK_SIZE = 20
	# A pixel counts as "different" in distance() when the absolute difference
	# of the two per-pixel values is strictly greater than this.
	THRESHOLD = 60
	# Value written into the width attribute of each <img> tag by html_group().
	WIDTH = 200
	# duplicates() keeps an image in a group only when its distance to the
	# reference image is strictly below this.
	MAX_DISTANCE = 220


	def image_data(filename):
	    """
	    Get data from image ready for comparison

	    The image is converted to RGB, shrunk to BLOCK_SIZE x BLOCK_SIZE pixels
	    with bilinear resampling, and every pixel is reduced to the sum of its
	    three channel values.

	    filename -- path of the image file to load.

	    Returns a 1-D numpy integer array with BLOCK_SIZE * BLOCK_SIZE entries
	    in row-major pixel order.
	    """
	    # The context manager releases the file handle as soon as the pixels
	    # have been read instead of leaving it to the garbage collector.
	    with Image.open(filename) as img:
	        # Normalise to RGB first: greyscale and palette images would yield
	        # bare ints per pixel (nothing to sum), and images with extra
	        # channels would produce sums on a different scale.
	        small = img.convert('RGB').resize(
	            (BLOCK_SIZE, BLOCK_SIZE), Image.BILINEAR
	        )
	    # (height, width, 3) uint8 -> per-pixel channel sum -> flat vector.
	    # dtype=int keeps the result signed so later subtraction cannot wrap.
	    return numpy.asarray(small).sum(axis=2, dtype=int).ravel()

	def distance(data1, data2, threshold=None):
	    """
	    Logical distance between two images.

	    Counts the positions at which the two data vectors differ by strictly
	    more than the threshold.

	    data1, data2 -- equal-length numeric arrays or sequences, such as the
	                    vectors returned by image_data(). Signed values are
	                    expected; unsigned arrays would wrap on subtraction.
	    threshold    -- minimum absolute difference that counts as a mismatch;
	                    defaults to the module-level THRESHOLD.

	    Returns a plain int between 0 and the vector length (0..400 for the
	    default 20x20 block).
	    """
	    if threshold is None:
	        threshold = THRESHOLD
	    delta = numpy.asarray(data1) - numpy.asarray(data2)
	    # Vectorised count instead of a Python-level loop over array elements.
	    return int(numpy.count_nonzero(numpy.abs(delta) > threshold))

	def duplicates(dirname):
	    """
	    Finds groups of similar images in a directory.

	    Only files matching *.jpg directly inside dirname are considered.
	    The images are processed in random order. For each image one group is
	    yielded: a list of (distance, filename) tuples, sorted ascending, with
	    every image whose distance to it is below MAX_DISTANCE. The image
	    itself is part of its own group.
	    """
	    pattern = os.path.join(dirname, '*.jpg')
	    images = [(path, image_data(path)) for path in glob.glob(pattern)]
	    random.shuffle(images)

	    # Only the pixel data of the reference image is needed here; its file
	    # name shows up through the inner loop like every other image.
	    for _, reference in images:
	        scored = ((distance(reference, d), f) for f, d in images)
	        yield sorted(pair for pair in scored if pair[0] < MAX_DISTANCE)

	def html_group(group, width=None):
	    """
	    Renders one group of similar images as a run of <img> tags.

	    group -- iterable of (distance, filename) pairs.
	    width -- value for the width attribute of every tag; defaults to the
	             module-level WIDTH.

	    Returns a string holding, for each pair, an <img> tag whose src is the
	    base name of the file, immediately followed by the distance. An empty
	    group gives an empty string.
	    """
	    # Imported locally because the module-level name ``html`` is a function
	    # in this file and would shadow the standard-library module.
	    from html import escape

	    if width is None:
	        width = WIDTH
	    # Escape the file name so quotes, angle brackets or ampersands in it
	    # cannot break out of the src attribute.
	    return ''.join(
	        '<img src="%s" width="%s"/>%s'
	        % (escape(os.path.basename(f), quote=True), width, dist)
	        for dist, f in group
	    )

	def html(groups):
	    """
	    Generates HTML from groups of image duplicates

	    Each group is rendered with html_group(); the rendered groups are
	    separated by <hr/> and one more <hr/> closes the page body.
	    """
	    body = '<hr/>'.join(html_group(g) for g in groups)
	    return '<html><body>%s<hr/></body></html>' % body


	if __name__ == '__main__':
	    import sys

	    # Scan the directory given as the first command-line argument; with no
	    # argument fall back to the author's default location.
	    if len(sys.argv) > 1:
	        target = sys.argv[1]
	    else:
	        target = '/home/maniac/Desktop/4554182'
	    print(html(duplicates(target)))
No results found