Skip to content

Instantly share code, notes, and snippets.

@u1735067
Last active June 14, 2021 12:52
Show Gist options
  • Save u1735067/a5167e2ddbd3749b9f779b09f5c878ff to your computer and use it in GitHub Desktop.
Save u1735067/a5167e2ddbd3749b9f779b09f5c878ff to your computer and use it in GitHub Desktop.
Script to hash image content only, ignoring differences in metadata and other non-pixel data (--[0-9A-F]+-- tag, etc.); relies on Pillow. Hopefully, Pillow will return the same bytes when the EXIF orientation changes and when an ICC profile is applied.
#!/usr/bin/env python3
''' BSD 3-Clause License — but if it was useful to you, you may tell me :)
Copyright (c) 2016, Alexandre Levavasseur
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the <organization> nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
'''
import sys, os, argparse, hashlib, datetime, glob
from collections import OrderedDict
from PIL import Image
from Cryptodome.Hash import BLAKE2b # Easiest way on Windows without compiling, sadly :(
'''
Extensions to look for
'''
# Case-insensitive match: get_files() compares file.lower() against this tuple.
extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff', '.gif', '.bmp')
def main():
    """Command-line entry point: parse arguments, collect the image files,
    then hash them either in CSV mode (-c) or one hash per line to stdout."""
    # Custom formatter that keeps newlines
    class MyFormat(argparse.HelpFormatter):
        def _fill_text(self, text, width, indent):
            # Wrap each input line as its own paragraph instead of reflowing
            # the whole description into a single block.
            return '\n'.join(map( (lambda line: self.__proceed_paragraph(width, indent, line) ) , text.splitlines()))
        def __proceed_paragraph(self, width, indent, text):
            import textwrap as _textwrap
            text = self._whitespace_matcher.sub(' ', text).strip()
            return _textwrap.fill(text, width, initial_indent=indent, subsequent_indent=indent)
    #class ArgumentFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter):
    #    pass
    # https://hg.python.org/cpython/file/3.5/Lib/argparse.py
    # https://pymotw.com/3/textwrap/
    parser = argparse.ArgumentParser(formatter_class=MyFormat, description=
        'Hash image content, avoiding differences due to metadatas and others unwanted stuffs (--[0-9A-F]+-- tag, ..).\n\n'+
        'Following file extensions will be searched: \n'+
        ', '.join(extensions)+'\n\n'+
        'Hopefully, Pillow, which this script relies on, will return the same bytes when EXIF orientation changes and when ICC profile is applied. This might change in the future.'
    )
    parser.add_argument('-r', '--recursive', action='store_true', help='recursivly look for images')
    parser.add_argument('-v', '--verify', action='store_true', help='') # TODO
    parser.add_argument('-c', '--csv', action='store_true', help='CSV mode : will output a CSV and a log file. Progress is printed every 5%% or 1000 files proceeded.')
    parser.add_argument('-s', '--csv-separator', default='|', help='CSV separator to use, defaults to |')
    parser.add_argument('-o', '--csv-outfile', type=argparse.FileType('a'), default=None, help='CSV output file, defaults to hashes.csv')
    parser.add_argument('-l', '--csv-logfile', type=argparse.FileType('a'), default=None, help='CSV log output mirror (tee)')
    parser.add_argument('-q', '--csv-quiet', action='store_true', help='No CSV log at all')
    parser.add_argument('-n', '--dry-run', action='store_true', help=argparse.SUPPRESS)
    parser.add_argument('-u', '--uniq', type=int, default=3, help=argparse.SUPPRESS)
    parser.add_argument('paths', nargs='+', metavar='<FILE or PATH>', help='File or path (content) to hash')
    args = parser.parse_args()
    # Get file list
    files = get_files(args.paths, args.recursive, args.uniq)
    if args.dry_run:
        # Hidden -n flag: just show what would be hashed, then stop.
        from pprint import pprint
        pprint(files)
        sys.exit(0)
    if args.csv:
        if not args.csv_outfile:
            args.csv_outfile = open('hashes.csv', 'a')
        if args.csv_logfile:
            # Mirror all stdout output into the log file for the rest of the run.
            sys.stdout = Logger(args.csv_logfile)
        print('{} : Starting'.format(horo()))
        print('{} : Found {} files'.format(horo(), len(files)))
        print('{} : Hashing files ...'.format(horo()))
        hash_list(files, args.csv_separator, args.csv_outfile)
        args.csv_outfile.close()
    else: # "Standalone mode"
        for file in files:
            hash_file(os.path.join(*file), args.verify)
        # NOTE(review): indentation was lost in this copy; the pause below is
        # assumed to belong to standalone mode only (keeps the console window
        # open when launched by double-click) — confirm intent.
        print('Press enter ..', end='')
        input()
'''
Shamelessly borrowed code; a better solution might exist
'''
class Logger(object):
    """Tee-style stream wrapper: everything written to it is mirrored to both
    the original stdout and a log file (used to duplicate CSV-mode progress
    output into --csv-logfile)."""
    def __init__(self, file):
        self.terminal = sys.stdout  # keep a handle on the real stdout
        self.log = file
    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)
    def flush(self):
        self.terminal.flush()
        self.log.flush()
    def close(self):
        # Fix: only close the log file. The original also closed
        # self.terminal — i.e. the interpreter's stdout — which would make
        # every later print() in the process fail.
        self.terminal.flush()
        self.log.close()
'''
Simple timestamping ("horodatage") helper
'''
def horo():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS' for log lines."""
    return '{:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now())
'''
List all eligible files according to given paths, trying to avoid duplicates in the resulting list
This can be disabled/controlled with -u
'''
def get_files(paths, recurse, uniq=3, exts=None):
    """List eligible image files for the given paths/patterns.

    paths   : list of file paths, directory paths or glob patterns
    recurse : walk directories recursively when True
    uniq    : de-duplication level (0 = none, >0 = input paths,
              >1 = also globbed paths, >2 = also resulting entries); see -u
    exts    : lowercase extensions to match; defaults to the module-level
              `extensions` tuple (new parameter, backward compatible)

    Returns a list of (directory, filename) tuples in discovery order.
    """
    if exts is None:
        exts = extensions
    # Step 1 : drop exact duplicates among the input paths (order preserved)
    if uniq > 0:
        paths = list(OrderedDict.fromkeys(paths))
    # Step 2 : expand glob patterns, optionally de-duplicating the results
    globbed_paths = []
    seen_globbed = set() if uniq > 1 else None
    for path in paths:
        # Don't glob paths that already exist, e.g. files with [] in the name
        candidates = [path] if os.path.exists(path) else glob.glob(path)
        for candidate in candidates:
            if seen_globbed is not None:
                if candidate in seen_globbed:
                    continue
                seen_globbed.add(candidate)
            globbed_paths.append(candidate)
    # Step 3 : collect (dir, file) entries, optionally de-duplicated
    entries = []
    seen_entries = set() if uniq > 2 else None
    def _add(entry):
        # Append honoring the requested de-duplication level.
        if seen_entries is not None:
            if entry in seen_entries:
                return
            seen_entries.add(entry)
        entries.append(entry)
    for path in globbed_paths:
        if os.path.isdir(path):
            if recurse:
                for root, dirs, files in os.walk(path):
                    for file in files:
                        if file.lower().endswith(exts):
                            _add((root, file))
            else:
                for file in os.listdir(path):
                    if os.path.isfile(os.path.join(path, file)) and file.lower().endswith(exts):
                        _add((path, file))
        else:
            # A plain file (or dangling path): split into (dir, name)
            _add(os.path.split(path))
    return entries
'''
Generate hash for a given file path
'''
def get_hash(file):
    """Return [content_hash, exif_hash] for the image at *file*.

    content_hash : BLAKE2b-512 hex digest of the decoded pixel bytes
                   (independent of metadata differences)
    exif_hash    : hex of the Exif RawImageDigest tag (0xC71C) when
                   present, else None
    """
    hash = [None, None]
    with open(file, 'rb') as fh:
        with Image.open(fh, mode='r') as im:
            # hashlib.blake2b (stdlib, Python 3.6+) with digest_size=64 yields
            # the same BLAKE2b-512 digest as Cryptodome's BLAKE2b, without the
            # third-party dependency; hashlib is already imported at the top.
            hash[0] = hashlib.blake2b(im.tobytes(), digest_size=64).hexdigest()
            try:
                hash[1] = im._getexif()[0xc71c].hex() # 50972 : Exif.Image.RawImageDigest
            except (AttributeError, KeyError, TypeError):
                # Narrowed from a bare except: non-JPEG images have no
                # _getexif() (AttributeError), _getexif() may return None
                # (TypeError), or the tag may be absent (KeyError).
                pass
    return hash
'''
Print hash for a given file path
'''
def hash_file(file, verify=False):
    """Print the content hash of *file* — or, with verify=True, compare it
    against the Exif RawImageDigest stored in the file and print the verdict.
    Any failure is reported on stderr instead of raising."""
    try:
        computed, stored = get_hash(file)
        if not verify:
            print('{} {}'.format(computed, file))
        elif stored is None:
            print('{} : {}'.format(file, 'failed to read exif hash'))
        else:
            verdict = 'match' if computed == stored else 'mismatch'
            print('{} : {}'.format(file, verdict))
    except Exception as e:
        print('Failed to hash "{}": {}'.format(file, e), file=sys.stderr)
'''
Output CSV of path, file and hash
'''
def hash_list(flist, sep, outfile):
    """Hash every (dir, file) entry of *flist* and write a CSV
    ('sep=' Excel hint line, then Path|File|Hash columns) to *outfile*.
    Progress lines and failures are reported on stdout; a summary is
    printed at the end."""
    print('sep={}'.format(sep), file=outfile)
    print('Path{0}File{0}Hash'.format(sep), file=outfile)
    total = len(flist)  # hoisted: len() was recomputed on every iteration
    i = 0
    failed = 0
    seen_percents = set()  # set membership instead of O(n) list scans
    for d, f in flist:
        i += 1
        curp = int(i / total * 100)
        fp = os.path.join(d, f)
        try:
            h = get_hash(fp)
            print('{1}{0}{2}{0}{3}'.format(sep, d, f, h), file=outfile)
            # Progress line every 1000 files, and at each new 5% step.
            if (i % 1000 == 0) or ((curp % 5 == 0) and (curp not in seen_percents)):
                print('{1} [{2}/{3}%|{4}] : {5}{0}{6}{0}{7}'.format(sep, horo(), i, curp, failed, d, f, h))
                seen_percents.add(curp)
        except Exception as e:
            failed += 1
            # '\\' instead of the original bare '\' — an invalid escape
            # sequence (deprecated since Python 3.6); printed output is
            # unchanged.
            print('{} : /!\\ File "{}" failed : {}'.format(horo(), fp, e))
    # 'Processed' fixes the 'Proceded' typo in the original summary line.
    print('{} : Processed {} files, {} failed'.format(horo(), i, failed))
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
'''
P:\hotos>py -3 imagehash.py test\1*
bef5e7acae400832cae425722a47de3e093f96d6f67bb4e594793f44e61fa61e3a8c76b9af2bc81aab384da902c3c065d384c6d7b56da17fc109c3d256d84ae3 test\1.jpg
bef5e7acae400832cae425722a47de3e093f96d6f67bb4e594793f44e61fa61e3a8c76b9af2bc81aab384da902c3c065d384c6d7b56da17fc109c3d256d84ae3 test\1+exif-orientation.jpg
P:\hotos>py -3 imagehash.py test\2.jpg "test\2+icc.jpg"
ce552bcfd2a2f660d1f1589d9bc489f810cf9d75b55e0f887296ddcad4449936f839b7a6ac641fc7c4f512f6590a7379b2c716b276490ce681b3e3f02f51358f test\2.jpg
ce552bcfd2a2f660d1f1589d9bc489f810cf9d75b55e0f887296ddcad4449936f839b7a6ac641fc7c4f512f6590a7379b2c716b276490ce681b3e3f02f51358f test\2+icc.jpg
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment