Last active
June 14, 2021 12:52
-
-
Save u1735067/a5167e2ddbd3749b9f779b09f5c878ff to your computer and use it in GitHub Desktop.
Script to hash image content, ignoring differences in metadata and other extraneous data (the --[0-9A-F]+-- tag, etc.); relies on Pillow. Hopefully, Pillow will return the same bytes when the EXIF orientation changes and when an ICC profile is applied.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
''' BSD 3-Clause License — but if it was useful to you, you may tell me :) | |
Copyright (c) 2016, Alexandre Levavasseur | |
All rights reserved. | |
Redistribution and use in source and binary forms, with or without | |
modification, are permitted provided that the following conditions are met: | |
* Redistributions of source code must retain the above copyright | |
notice, this list of conditions and the following disclaimer. | |
* Redistributions in binary form must reproduce the above copyright | |
notice, this list of conditions and the following disclaimer in the | |
documentation and/or other materials provided with the distribution. | |
* Neither the name of the <organization> nor the | |
names of its contributors may be used to endorse or promote products | |
derived from this software without specific prior written permission. | |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY | |
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
''' | |
import sys, os, argparse, hashlib, datetime, glob | |
from collections import OrderedDict | |
from PIL import Image | |
from Cryptodome.Hash import BLAKE2b # Easiest way on Windows without compiling, sadly :( | |
'''
Extensions to look for
'''
# Matched case-insensitively: get_files() lowercases each filename before
# testing it with str.endswith(extensions).
extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff', '.gif', '.bmp')
def main():
    """CLI entry point: parse arguments, collect image files, hash them.

    Two modes:
      * standalone (default) -- print one 'hash path' line per file, or a
        match/mismatch verdict with -v;
      * CSV (-c)             -- append rows to a CSV file, with progress
        output optionally tee'd to a log file.
    """

    # Help formatter that keeps explicit newlines in the description while
    # still wrapping each resulting paragraph to the terminal width.
    # References:
    #   https://hg.python.org/cpython/file/3.5/Lib/argparse.py
    #   https://pymotw.com/3/textwrap/
    class MyFormat(argparse.HelpFormatter):
        def _fill_text(self, text, width, indent):
            return '\n'.join(
                self.__wrap_paragraph(width, indent, line)
                for line in text.splitlines()
            )

        def __wrap_paragraph(self, width, indent, text):
            import textwrap as _textwrap
            text = self._whitespace_matcher.sub(' ', text).strip()
            return _textwrap.fill(text, width,
                                  initial_indent=indent, subsequent_indent=indent)

    parser = argparse.ArgumentParser(
        formatter_class=MyFormat,
        description=(
            # fix: typos in user-facing help ("metadatas and others unwanted stuffs")
            'Hash image content, avoiding differences due to metadata and other unwanted data (--[0-9A-F]+-- tag, ..).\n\n'
            'The following file extensions will be searched: \n'
            + ', '.join(extensions) + '\n\n'
            + 'Hopefully, Pillow, which this script relies on, will return the same bytes when EXIF orientation changes and when ICC profile is applied. This might change in the future.'
        )
    )
    parser.add_argument('-r', '--recursive', action='store_true', help='recursively look for images')
    parser.add_argument('-v', '--verify', action='store_true', help='compare the computed hash with the EXIF RawImageDigest tag')
    parser.add_argument('-c', '--csv', action='store_true', help='CSV mode : will output a CSV and a log file. Progress is printed every 5%% or 1000 files proceeded.')
    parser.add_argument('-s', '--csv-separator', default='|', help='CSV separator to use, defaults to |')
    parser.add_argument('-o', '--csv-outfile', type=argparse.FileType('a'), default=None, help='CSV output file, defaults to hashes.csv')
    parser.add_argument('-l', '--csv-logfile', type=argparse.FileType('a'), default=None, help='CSV log output mirror (tee)')
    # NOTE(review): -q is parsed but never read anywhere below -- confirm intent
    parser.add_argument('-q', '--csv-quiet', action='store_true', help='No CSV log at all')
    parser.add_argument('-n', '--dry-run', action='store_true', help=argparse.SUPPRESS)
    parser.add_argument('-u', '--uniq', type=int, default=3, help=argparse.SUPPRESS)
    parser.add_argument('paths', nargs='+', metavar='<FILE or PATH>', help='File or path (content) to hash')
    args = parser.parse_args()

    # Build the (directory, filename) work list
    files = get_files(args.paths, args.recursive, args.uniq)
    if args.dry_run:
        from pprint import pprint
        pprint(files)
        sys.exit(0)

    if args.csv:
        if not args.csv_outfile:
            args.csv_outfile = open('hashes.csv', 'a')
        if args.csv_logfile:
            # Tee all subsequent print() output to the log file as well
            sys.stdout = Logger(args.csv_logfile)
        print('{} : Starting'.format(horo()))
        print('{} : Found {} files'.format(horo(), len(files)))
        print('{} : Hashing files ...'.format(horo()))
        hash_list(files, args.csv_separator, args.csv_outfile)
        args.csv_outfile.close()
        if args.csv_logfile:
            args.csv_logfile.close()  # fix: log file handle was previously leaked
    else:  # "Standalone mode"
        for file in files:
            hash_file(os.path.join(*file), args.verify)
        print('Press enter ..', end='')
        input()
class Logger(object):
    """Tee-style stdout wrapper: every write goes both to the real terminal
    and to a log file.

    (Borrowed idiom -- a better solution might exist.)
    """

    def __init__(self, file):
        # Capture the current stdout so writes can be mirrored to it.
        self.terminal = sys.stdout
        self.log = file

    def write(self, message):
        # Mirror the message: terminal first, then the log file.
        for stream in (self.terminal, self.log):
            stream.write(message)

    def flush(self):
        for stream in (self.terminal, self.log):
            stream.flush()

    def close(self):
        for stream in (self.terminal, self.log):
            stream.close()
def horo():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS' (simple horodating
    for log-line prefixes)."""
    now = datetime.datetime.now()
    return '{:%Y-%m-%d %H:%M:%S}'.format(now)
def get_files(paths, recurse, uniq=3, exts=None):
    """List (directory, filename) pairs for every eligible file under *paths*,
    trying to avoid duplicates in the resulting list.

    paths   -- mix of file paths, directory paths and glob patterns
    recurse -- when True, descend into sub-directories of directory arguments
    uniq    -- de-duplication level (controllable via -u):
               > 0 drops exact duplicate arguments,
               > 1 also de-duplicates globbed paths,
               > 2 also de-duplicates the final (dir, file) pairs
    exts    -- lowercase extension tuple to match (generalized; defaults to the
               module-level ``extensions`` constant)

    Note: a path argument that names an existing file is included as-is,
    without the extension filter (only directory listings are filtered).
    """
    if exts is None:
        exts = extensions

    # Step 1 : drop exact duplicate arguments (order preserved)
    if uniq > 0:
        paths = list(OrderedDict.fromkeys(paths))

    # Step 2 : glob paths & optionally remove duplicates after globbing
    # - could be yield .. but visited paths must be kept anyway ..
    if uniq > 1:
        globbed_paths = OrderedDict()  # ordered-set of paths after globbing
        def glob_add(elt):
            globbed_paths[elt] = None
        def get_glob_list():
            return list(globbed_paths)
    else:
        globbed_paths = []
        def glob_add(elt):
            globbed_paths.append(elt)
        def get_glob_list():
            return globbed_paths
    for path in paths:
        if os.path.exists(path):  # Don't glob real paths, eg. files with [] in name
            glob_add(path)
            continue
        for globbed_path in glob.glob(path):
            glob_add(globbed_path)
    globbed_paths = get_glob_list()

    # Step 3 : collect unique (directory, filename) pairs
    if uniq > 2:
        uniq_paths = OrderedDict()
        def add(elt):
            uniq_paths[elt] = None
        def get_list():
            return list(uniq_paths)
    else:
        flist = []
        def add(elt):
            flist.append(elt)
        def get_list():
            return flist
    for path in globbed_paths:
        if os.path.isdir(path):
            if recurse:
                for root, dirs, files in os.walk(path):
                    for file in files:
                        if file.lower().endswith(exts):
                            add((root, file))
            else:
                for file in os.listdir(path):
                    if os.path.isfile(os.path.join(path, file)) and file.lower().endswith(exts):
                        add((path, file))
        else:
            # Explicit file argument: keep it regardless of extension
            add(os.path.split(path))
    return get_list()
def get_hash(file):
    """Return ``[content_digest, exif_digest]`` for the image at *file*.

    content_digest -- hex BLAKE2b-512 of the decoded pixel bytes, so it is
                      independent of metadata differences
    exif_digest    -- hex value of EXIF tag 0xC71C (50972,
                      Exif.Image.RawImageDigest), or None when the tag or the
                      EXIF data is absent/unreadable
    """
    digests = [None, None]  # renamed from 'hash': avoid shadowing the builtin
    with open(file, 'rb') as fh:
        with Image.open(fh, mode='r') as im:
            digests[0] = BLAKE2b.new(digest_bits=512).update(im.tobytes()).hexdigest()
            try:
                digests[1] = im._getexif()[0xc71c].hex()  # 50972 : Exif.Image.RawImageDigest
            except Exception:
                # fix: bare 'except' also swallowed KeyboardInterrupt/SystemExit.
                # Missing EXIF / missing tag is expected; leave digests[1] None.
                pass
    return digests
def hash_file(file, verify=False):
    """Print the content hash of *file*; with verify=True, print whether it
    matches the EXIF RawImageDigest instead. Errors go to stderr."""
    try:
        digests = get_hash(file)
        if verify:
            if digests[1] is None:
                print('{} : {}'.format(file, 'failed to read exif hash'))
            else:
                outcome = 'match' if digests[0] == digests[1] else 'mismatch'
                print('{} : {}'.format(file, outcome))
        else:
            print('{} {}'.format(digests[0], file))
    except Exception as e:
        print('Failed to hash "{}": {}'.format(file, e), file=sys.stderr)
def hash_list(flist, sep, outfile):
    """Write 'Path<sep>File<sep>Hash' CSV rows to *outfile* for every
    (directory, filename) pair in *flist*.

    Progress is echoed to stdout every 1000 files or every 5%; failures are
    counted and reported but do not stop the run.
    """
    print('sep={}'.format(sep), file=outfile)
    print('Path{0}File{0}Hash'.format(sep), file=outfile)
    i = 0
    failed = 0
    seen_percent = set()  # fix: was a list -- O(n) membership test per file
    for d, f in flist:
        i += 1
        curp = int(i / len(flist) * 100)
        fp = os.path.join(d, f)
        try:
            h = get_hash(fp)
            # fix: write the content hash h[0] (as hash_file does), not the
            # Python repr of the [content, exif] list the old code emitted.
            print('{1}{0}{2}{0}{3}'.format(sep, d, f, h[0]), file=outfile)
            if (i % 1000 == 0) or ((curp % 5 == 0) and (curp not in seen_percent)):
                print('{1} [{2}/{3}%|{4}] : {5}{0}{6}{0}{7}'.format(sep, horo(), i, curp, failed, d, f, h[0]))
                seen_percent.add(curp)
        except Exception as e:
            # fix: '\ ' was an invalid escape sequence (SyntaxWarning on 3.12+)
            print('{} : /!\\ File "{}" failed : {}'.format(horo(), fp, e))
            failed += 1
    print('{} : Processed {} files, {} failed'.format(horo(), i, failed))
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
''' | |
P:\hotos>py -3 imagehash.py test\1* | |
bef5e7acae400832cae425722a47de3e093f96d6f67bb4e594793f44e61fa61e3a8c76b9af2bc81aab384da902c3c065d384c6d7b56da17fc109c3d256d84ae3 test\1.jpg | |
bef5e7acae400832cae425722a47de3e093f96d6f67bb4e594793f44e61fa61e3a8c76b9af2bc81aab384da902c3c065d384c6d7b56da17fc109c3d256d84ae3 test\1+exif-orientation.jpg | |
P:\hotos>py -3 imagehash.py test\2.jpg "test\2+icc.jpg" | |
ce552bcfd2a2f660d1f1589d9bc489f810cf9d75b55e0f887296ddcad4449936f839b7a6ac641fc7c4f512f6590a7379b2c716b276490ce681b3e3f02f51358f test\2.jpg | |
ce552bcfd2a2f660d1f1589d9bc489f810cf9d75b55e0f887296ddcad4449936f839b7a6ac641fc7c4f512f6590a7379b2c716b276490ce681b3e3f02f51358f test\2+icc.jpg | |
''' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment