Skip to content

Instantly share code, notes, and snippets.

@u1735067
Last active June 14, 2021 12:52
Show Gist options
  • Save u1735067/a5167e2ddbd3749b9f779b09f5c878ff to your computer and use it in GitHub Desktop.
Save u1735067/a5167e2ddbd3749b9f779b09f5c878ff to your computer and use it in GitHub Desktop.
Script to hash image content only, ignoring differences in metadata and other non-pixel data (--[0-9A-F]+-- tag, etc.); relies on Pillow. Hopefully, Pillow will return the same bytes when the EXIF orientation changes and when an ICC profile is applied.
#!/usr/bin/env python3
''' BSD 3-Clause License — but if it was useful to you, you may tell me :)
Copyright (c) 2016, Alexandre Levavasseur
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the <organization> nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''
import sys, os, argparse, hashlib, datetime, glob
from collections import OrderedDict
from PIL import Image
from Cryptodome.Hash import BLAKE2b # Easiest way on Windows without compiling, sadly :(
'''
Extensions to look for
'''
# Case-insensitive match: get_files() compares file.lower() against this tuple.
extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff', '.gif', '.bmp')
def main():
    """Command-line entry point: parse arguments, collect the image files,
    then hash them either in CSV mode (-c) or one hash per line to stdout."""
    # Custom formatter that keeps newlines
    class MyFormat(argparse.HelpFormatter):
        def _fill_text(self, text, width, indent):
            # Wrap each input line as its own paragraph instead of reflowing
            # the whole description into a single block.
            return '\n'.join(map( (lambda line: self.__proceed_paragraph(width, indent, line) ) , text.splitlines()))
        def __proceed_paragraph(self, width, indent, text):
            import textwrap as _textwrap
            text = self._whitespace_matcher.sub(' ', text).strip()
            return _textwrap.fill(text, width, initial_indent=indent, subsequent_indent=indent)
    #class ArgumentFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
    #    pass
    # https://hg.python.org/cpython/file/3.5/Lib/argparse.py
    # https://pymotw.com/3/textwrap/
    parser = argparse.ArgumentParser(formatter_class=MyFormat, description=
        'Hash image content, avoiding differences due to metadatas and others unwanted stuffs (--[0-9A-F]+-- tag, ..).\n\n'+
        'Following file extensions will be searched: \n'+
        ', '.join(extensions)+'\n\n'+
        'Hopefully, Pillow, which this script relies on, will return the same bytes when EXIF orientation changes and when ICC profile is applied. This might change in the future.'
    )
    parser.add_argument('-r', '--recursive', action='store_true', help='recursivly look for images')
    parser.add_argument('-v', '--verify', action='store_true', help='') # TODO
    parser.add_argument('-c', '--csv', action='store_true', help='CSV mode : will output a CSV and a log file. Progress is printed every 5%% or 1000 files proceeded.')
    parser.add_argument('-s', '--csv-separator', default='|', help='CSV separator to use, defaults to |')
    parser.add_argument('-o', '--csv-outfile', type=argparse.FileType('a'), default=None, help='CSV output file, defaults to hashes.csv')
    parser.add_argument('-l', '--csv-logfile', type=argparse.FileType('a'), default=None, help='CSV log output mirror (tee)')
    parser.add_argument('-q', '--csv-quiet', action='store_true', help='No CSV log at all')
    parser.add_argument('-n', '--dry-run', action='store_true', help=argparse.SUPPRESS)
    parser.add_argument('-u', '--uniq', type=int, default=3, help=argparse.SUPPRESS)
    parser.add_argument('paths', nargs='+', metavar='<FILE or PATH>', help='File or path (content) to hash')
    args = parser.parse_args()
    # Get file list
    files = get_files(args.paths, args.recursive, args.uniq)
    if args.dry_run:
        # Hidden -n flag: just show what would be hashed, then stop.
        from pprint import pprint
        pprint(files)
        sys.exit(0)
    if args.csv:
        if not args.csv_outfile:
            args.csv_outfile = open('hashes.csv', 'a')
        if args.csv_logfile:
            # Mirror all stdout output into the log file for the rest of the run.
            sys.stdout = Logger(args.csv_logfile)
        print('{} : Starting'.format(horo()))
        print('{} : Found {} files'.format(horo(), len(files)))
        print('{} : Hashing files ...'.format(horo()))
        hash_list(files, args.csv_separator, args.csv_outfile)
        args.csv_outfile.close()
    else: # "Standalone mode"
        for file in files:
            hash_file(os.path.join(*file), args.verify)
        # NOTE(review): indentation was lost in this copy; the pause below is
        # assumed to belong to standalone mode only (keeps the console window
        # open when launched by double-click) — confirm intent.
        print('Press enter ..', end='')
        input()
'''
Shamelessly borrowed code; a better solution might exist
'''
class Logger(object):
    """Tee-style stream wrapper: everything written to it is mirrored to both
    the original stdout and a log file (used to duplicate CSV-mode progress
    output into --csv-logfile)."""
    def __init__(self, file):
        self.terminal = sys.stdout  # keep a handle on the real stdout
        self.log = file
    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)
    def flush(self):
        self.terminal.flush()
        self.log.flush()
    def close(self):
        # Fix: only close the log file. The original also closed
        # self.terminal — i.e. the interpreter's stdout — which would make
        # every later print() in the process fail.
        self.terminal.flush()
        self.log.close()
'''
Simple timestamping ("horodatage") helper
'''
def horo():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS' for log lines."""
    return '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
'''
List all eligible files according to given paths, trying to avoid duplicates in the resulting list
This can be disabled/controlled with -u
'''
def get_files(paths, recurse, uniq=3, exts=None):
    """List eligible image files for the given paths/patterns.

    paths   : list of file paths, directory paths or glob patterns
    recurse : walk directories recursively when True
    uniq    : de-duplication level (0 = none, >0 = input paths,
              >1 = also globbed paths, >2 = also resulting entries); see -u
    exts    : lowercase extensions to match; defaults to the module-level
              `extensions` tuple (new parameter, backward compatible)

    Returns a list of (directory, filename) tuples in discovery order.
    """
    if exts is None:
        exts = extensions
    # Step 1 : drop exact duplicates among the input paths (order preserved)
    if uniq > 0:
        paths = list(OrderedDict.fromkeys(paths))
    # Step 2 : expand glob patterns, optionally de-duplicating the results
    globbed_paths = []
    seen_globbed = set() if uniq > 1 else None
    for path in paths:
        # Don't glob paths that already exist, e.g. files with [] in the name
        candidates = [path] if os.path.exists(path) else glob.glob(path)
        for candidate in candidates:
            if seen_globbed is not None:
                if candidate in seen_globbed:
                    continue
                seen_globbed.add(candidate)
            globbed_paths.append(candidate)
    # Step 3 : collect (dir, file) entries, optionally de-duplicated
    entries = []
    seen_entries = set() if uniq > 2 else None
    def _add(entry):
        # Append honoring the requested de-duplication level.
        if seen_entries is not None:
            if entry in seen_entries:
                return
            seen_entries.add(entry)
        entries.append(entry)
    for path in globbed_paths:
        if os.path.isdir(path):
            if recurse:
                for root, dirs, files in os.walk(path):
                    for file in files:
                        if file.lower().endswith(exts):
                            _add((root, file))
            else:
                for file in os.listdir(path):
                    if os.path.isfile(os.path.join(path, file)) and file.lower().endswith(exts):
                        _add((path, file))
        else:
            # A plain file (or dangling path): split into (dir, name)
            _add(os.path.split(path))
    return entries
'''
Generate hash for a given file path
'''
def get_hash(file):
    """Return [content_hash, exif_hash] for the image at *file*.

    content_hash : BLAKE2b-512 hex digest of the decoded pixel bytes
                   (independent of metadata differences)
    exif_hash    : hex of the Exif RawImageDigest tag (0xC71C) when
                   present, else None
    """
    hash = [None, None]
    with open(file, 'rb') as fh:
        with Image.open(fh, mode='r') as im:
            # hashlib.blake2b (stdlib, Python 3.6+) with digest_size=64 yields
            # the same BLAKE2b-512 digest as Cryptodome's BLAKE2b, without the
            # third-party dependency; hashlib is already imported at the top.
            hash[0] = hashlib.blake2b(im.tobytes(), digest_size=64).hexdigest()
            try:
                hash[1] = im._getexif()[0xc71c].hex() # 50972 : Exif.Image.RawImageDigest
            except (AttributeError, KeyError, TypeError):
                # Narrowed from a bare except: non-JPEG images have no
                # _getexif() (AttributeError), _getexif() may return None
                # (TypeError), or the tag may be absent (KeyError).
                pass
    return hash
'''
Print hash for a given file path
'''
def hash_file(file, verify=False):
    """Print the content hash of *file* — or, with verify=True, compare it
    against the Exif RawImageDigest stored in the file and print the verdict.
    Any failure is reported on stderr instead of raising."""
    try:
        computed, stored = get_hash(file)
        if not verify:
            print('{} {}'.format(computed, file))
        elif stored is None:
            print('{} : {}'.format(file, 'failed to read exif hash'))
        else:
            verdict = 'match' if computed == stored else 'mismatch'
            print('{} : {}'.format(file, verdict))
    except Exception as e:
        print('Failed to hash "{}": {}'.format(file, e), file=sys.stderr)
'''
Output CSV of path, file and hash
'''
def hash_list(flist, sep, outfile):
    """Hash every (dir, file) entry of *flist* and write a CSV
    ('sep=' Excel hint line, then Path|File|Hash columns) to *outfile*.
    Progress lines and failures are reported on stdout; a summary is
    printed at the end."""
    print('sep={}'.format(sep), file=outfile)
    print('Path{0}File{0}Hash'.format(sep), file=outfile)
    total = len(flist)  # hoisted: len() was recomputed on every iteration
    i = 0
    failed = 0
    seen_percents = set()  # set membership instead of O(n) list scans
    for d, f in flist:
        i += 1
        curp = int(i / total * 100)
        fp = os.path.join(d, f)
        try:
            h = get_hash(fp)
            print('{1}{0}{2}{0}{3}'.format(sep, d, f, h), file=outfile)
            # Progress line every 1000 files, and at each new 5% step.
            if (i % 1000 == 0) or ((curp % 5 == 0) and (curp not in seen_percents)):
                print('{1} [{2}/{3}%|{4}] : {5}{0}{6}{0}{7}'.format(sep, horo(), i, curp, failed, d, f, h))
                seen_percents.add(curp)
        except Exception as e:
            failed += 1
            # '\\' instead of the original bare '\' — an invalid escape
            # sequence (deprecated since Python 3.6); printed output is
            # unchanged.
            print('{} : /!\\ File "{}" failed : {}'.format(horo(), fp, e))
    # 'Processed' fixes the 'Proceded' typo in the original summary line.
    print('{} : Processed {} files, {} failed'.format(horo(), i, failed))
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
'''
P:\hotos>py -3 imagehash.py test\1*
bef5e7acae400832cae425722a47de3e093f96d6f67bb4e594793f44e61fa61e3a8c76b9af2bc81aab384da902c3c065d384c6d7b56da17fc109c3d256d84ae3 test\1.jpg
bef5e7acae400832cae425722a47de3e093f96d6f67bb4e594793f44e61fa61e3a8c76b9af2bc81aab384da902c3c065d384c6d7b56da17fc109c3d256d84ae3 test\1+exif-orientation.jpg
P:\hotos>py -3 imagehash.py test\2.jpg "test\2+icc.jpg"
ce552bcfd2a2f660d1f1589d9bc489f810cf9d75b55e0f887296ddcad4449936f839b7a6ac641fc7c4f512f6590a7379b2c716b276490ce681b3e3f02f51358f test\2.jpg
ce552bcfd2a2f660d1f1589d9bc489f810cf9d75b55e0f887296ddcad4449936f839b7a6ac641fc7c4f512f6590a7379b2c716b276490ce681b3e3f02f51358f test\2+icc.jpg
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment