Last active
September 4, 2020 11:50
-
-
Save u1735067/642186956632bdecd84989d710d1a813 to your computer and use it in GitHub Desktop.
Unix-like checksummer allowing to use any included Python algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from __future__ import print_function, unicode_literals | |
import sys, errno, os, argparse, re, stat, hashlib | |
from collections import OrderedDict | |
# Map every algorithm hashlib guarantees on this interpreter to its constructor.
hash_algos = dict((name, getattr(hashlib, name)) for name in hashlib.algorithms_guaranteed)

# Try to use pyblake2 if available and not included in hashlib
if not {'blake2b', 'blake2s'}.issubset(hash_algos):
    try:
        import pyblake2
    except ImportError:
        pass
    else:
        hash_algos['blake2b'] = pyblake2.blake2b
        hash_algos['blake2s'] = pyblake2.blake2s

# Sort & filter shake algos because of variable length
# Unix : https://metacpan.org/source/MSHELOR/Digest-SHA3-1.04/src/sha3.h#L70
#        https://metacpan.org/source/MSHELOR/Digest-SHA3-1.04/sha3sum#L220
#        -> output 1344b (336 hexchr) & 1088b (272 hexchr)
#        Why ??
#          https://keccak.team/software.html
#          https://crypto.stackexchange.com/questions/43718/if-the-output-size-of-shake128-256-is-variable-why-is-the-security-fixed-at-128
#          Because of https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Algorithm-Validation-Program/documents/sha3/sha3vs.pdf ?
# OpenSSL https://github.com/openssl/openssl/blob/OpenSSL_1_1_1g/crypto/evp/m_sha3.c#L382
#         https://www.openssl.org/docs/man1.1.1/man3/EVP_sha3_224.html
#         -> output 128b (32 hexchr) & 256b (64 hexchr)
hash_algos = OrderedDict(
    (name, factory)
    for name, factory in sorted(hash_algos.items())
    if 'shake' not in name
)
# Tri-state flag: None when not running on Windows, otherwise a bool telling
# whether ntdll reports long paths as enabled for this process.
LongPathsEnabled = None
if os.name == 'nt':
    import ctypes
    ntdll = ctypes.WinDLL('ntdll')
    # The export may be missing from ntdll; treat that as "not enabled".
    _long_paths_probe = getattr(ntdll, 'RtlAreLongPathsEnabled', None)
    if _long_paths_probe is None:
        LongPathsEnabled = False
    else:
        _long_paths_probe.restype = ctypes.c_ubyte
        _long_paths_probe.argtypes = ()
        LongPathsEnabled = bool(_long_paths_probe())
class ArgumentDefaultsFileHelpFormatter(argparse.HelpFormatter):
    """Help formatter that appends each argument's default to its help text.

    Same idea as ``argparse.ArgumentDefaultsHelpFormatter``, except that a
    file-like default (such as ``sys.stdout``) is displayed through its
    ``name`` attribute instead of its ``repr``.
    """

    def _get_help_string(self, action):
        """Return the help template of *action*, with a default suffix when relevant.

        The returned string is %-formatted by argparse afterwards, which is
        why the generic case appends a ``%(default)s`` placeholder and why a
        literal file name has its ``%`` characters doubled.
        """
        help_text = action.help
        if help_text is None:
            # Nothing to search in or append to; avoids a TypeError below
            help_text = ''

        default = action.default
        if '%(default)' in help_text or default is argparse.SUPPRESS or default is None:
            return help_text

        # Only options and optional positionals can actually fall back to a default
        takes_default = action.option_strings or action.nargs in (argparse.OPTIONAL, argparse.ZERO_OR_MORE)
        if not takes_default:
            return help_text

        # Duck-typed stream detection: also covers Python 2 `file` objects and
        # TextIOWrapper subclasses, which a class-name comparison would miss.
        is_stream = hasattr(default, 'name') and (hasattr(default, 'read') or hasattr(default, 'write'))
        if is_stream:
            stream_name = '{}'.format(default.name).replace('%', '%%')
            return help_text + ' (default: {})'.format(stream_name)
        return help_text + ' (default: %(default)s)'
def main():
    """Command-line entry point: parse arguments, hash every base path, print stats.

    Hash lines go to the file chosen with ``--output`` (stdout by default);
    warnings and the optional statistics go to stderr.
    """
    if LongPathsEnabled is False:
        print(r'''
/!\ LongPathsEnabled is not set, you'll face issue for paths longer than 255 characters.
If you are under Win10 >= 1607, please consider setting
HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem\LongPathsEnabled=1
More informations at the end of the sources of this script.
''', file=sys.stderr)

    parser = argparse.ArgumentParser(
        formatter_class=ArgumentDefaultsFileHelpFormatter,
        description='Unix-like checksummer allowing to use any included Python algorithm',
        epilog='Files are read in binary mode, text mode makes no sense ..'
    )
    parser.add_argument('base_paths', metavar='BASE_PATH', nargs='+',  # type=str, nargs=None,
        help='base directory for the logging')
    parser.add_argument('-a', '--algos', nargs='+',
        choices=sorted(hash_algos, key=OrderedBrowser.sort_natural),
        # A plain list of names, so that --help shows the names rather than
        # the repr of the name -> constructor mapping
        default=list(hash_algos),
        help='algorithm(s) to hash files with')
    parser.add_argument('-r', '--recurse', action='store_true', help='process sub-directories')
    parser.add_argument('-L', '--dereference', action='store_true', help='follow symbolic links')
    parser.add_argument('--tag', action='store_true', help='use BSD style output "algo (path) = hash"')

    parser_order = parser.add_argument_group('processing order')
    parser_order.add_argument('-s', '--sort', choices=['no', 'lexicographical', 'natural'], default='natural',
        help='elements sorting method')
    parser_order.add_argument('-f', '--folders', choices=['first', 'last', 'with-files'], default='with-files',
        help='handling of the folders in the output')
    parser_order.add_argument('-d', '--dot-folders', choices=['same', 'separated', 'excepted'], default='same',
        help='with separated, dot folders will be placed with dot files (first or last _in_ dot files); '
            'with excepted, they will be treated like dot files')

    parser_output = parser.add_argument_group('output related')
    parser_output.add_argument('-v', '--stats', action='store_true', help='display stats on stderr at the end')
    # https://docs.python.org/3/library/functions.html#open
    # NOTE(review): FileType('w') is given no encoding here, so the
    # "UTF-8 with BOM" wording of the help text looks unenforced - confirm.
    parser_output.add_argument('-o', '--output', metavar='FILE', type=argparse.FileType('w'), default=sys.stdout,
        help='output the hash to a file (UTF-8 with BOM)')

    args = parser.parse_args()

    chksum = Checksummer(
        algos=args.algos,
        browser=OrderedBrowser(policy=args),
        recurse=args.recurse,
        # Without this the -L/--dereference flag was parsed but never applied
        dereference=args.dereference,
        out=args.output,
        tag_style=args.tag
    )
    for base_path in args.base_paths:
        chksum.checksum(base_path)

    if args.stats:
        stats = chksum.get_stats()
        if args.recurse:
            print('Hashed {} files in {} directories for a total of {} {}'.format(
                    stats['files'], stats['dirs'], *size_to_human(stats['bytes'])
                ), file=sys.stderr
            )
        else:
            print('Hashed {} files for a total of {} {}'.format(
                    stats['files'], *size_to_human(stats['bytes'])
                ), file=sys.stderr
            )
def size_to_human(size, formatted_number=True, units_iec=True):
    """Scale a byte count to the largest unit that keeps the value readable.

    :param size: number of bytes
    :param formatted_number: when true, return the number as a string with at
        most two decimals and no superfluous zeros; otherwise return the raw
        (possibly float) number
    :param units_iec: when true use 1024-based units (KiB, MiB, ...), otherwise
        1000-based units spelled without the "i" (KB, MB, ...)
    :return: a ``(number, unit)`` tuple
    """
    prefixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    prefix_index = 0
    # A float divisor forces true division: this file does not import
    # `division` from __future__, so int / int would truncate on Python 2.
    multiplier = 1024.0 if units_iec else 1000.0

    # Stop at the last known unit even if the value is still >= multiplier
    while size >= multiplier and prefix_index < len(prefixes) - 1:
        size /= multiplier
        prefix_index += 1

    # 3.00 -> 3, 3.141 => 3.14 - https://stackoverflow.com/questions/2440692/formatting-floats-in-python-without-superfluous-zeros
    if formatted_number:
        size = format(size, '.2f').rstrip('0').rstrip('.')

    prefix = prefixes[prefix_index]
    if not units_iec:
        prefix = prefix.replace('i', '')
    return (size, prefix)
class OrderedBrowser:
    """List a directory and hand its entries to a callback in a configurable order.

    The ordering policy is made of three settings:

    - ``sort_algo``: key function applied to entry names, or None to keep the
      order returned by the listing;
    - ``folders``: 'first' or 'last' to group directories before/after files,
      anything else to leave them mixed with the files;
    - ``dot_folders``: how entries whose name starts with a dot are grouped
      ('same', 'separated' or 'excepted').
    """

    def __init__(self, policy=None):
        """Build the browser from *policy*.

        :param policy: object exposing ``sort``, ``folders`` and ``dot_folders``
            attributes (main() passes the argparse namespace), or None for the
            defaults: natural sort, folders last, dot entries treated the same.
        :raises Exception: if ``policy.sort`` is not a known sorting method
        """
        if policy is None:
            self.sort_algo = self.sort_natural
            self.folders = 'last'
            self.dot_folders = 'same'
        else:
            if policy.sort == 'no':
                self.sort_algo = None
            elif policy.sort == 'natural':
                self.sort_algo = self.sort_natural
            elif policy.sort == 'lexicographical':
                self.sort_algo = self.sort_lexicographical
            else:
                raise Exception('Unknown sort')
            # Check done by argparse
            self.folders = policy.folders
            self.dot_folders = policy.dot_folders

        # os.scandir is not available on every interpreter this file targets
        self.scandir_method = os.scandir if hasattr(os, 'scandir') else self._fake_scandir

    @staticmethod
    def sort_natural(s, _nsre=re.compile('([0-9]+)')):
        """Return a sort key ordering embedded numbers by value ("a2" < "a10").

        The name is split on runs of ASCII digits. Because the pattern has a
        single capturing group, odd positions of the split are exactly those
        digit runs (converted to int) and even positions are the surrounding
        text (lower-cased). Deciding by position rather than ``str.isdigit()``
        keeps non-ASCII digit characters as text, so keys always alternate
        str/int and stay comparable with each other.
        """
        key = []
        for i, text in enumerate(re.split(_nsre, s)):
            if i % 2:
                key.append(int(text))
            elif i > 0 or text:
                key.append(text.lower())
            else:
                # Empty leading text, i.e. the name starts with a digit (or is empty)
                key.append('0')
        return key

    @staticmethod
    def sort_lexicographical(s):
        """Return *s* unchanged, i.e. sort by plain string comparison."""
        return s

    @staticmethod
    def _fake_scandir(path):
        """Fallback for os.scandir: wrap every name of os.listdir in a FakeDirEntry."""
        return [FakeDirEntry(os.sep.join((path, name))) for name in os.listdir(path)]

    def scandir(self, path, callback):
        """List *path* and pass its entries to *callback*, group by group.

        :param path: directory to list
        :param callback: callable receiving a list of entries; called once with
            everything when folders are mixed with files, otherwise once per
            group (some groups may be empty), in the order set by the policy

        A listing failure is reported on stderr and the directory is skipped.
        """
        try:
            # Materialize inside the try so that errors raised while iterating
            # a lazy listing are reported here as well
            entries = list(self.scandir_method(path))
        except Exception as e:
            print('Failed to list "{}": {}'.format(path, e), file=sys.stderr)
            return

        if self.sort_algo is not None:
            entries.sort(key=lambda dir_entry: self.sort_algo(dir_entry.name))

        if self.folders not in ('first', 'last'):
            callback(entries)
            return

        dirs = []
        files = []
        # Aliasing the buckets keeps the dispatch loop below free of policy checks
        if self.dot_folders == 'same':  # dot entries go with the regular ones
            dot_dirs = dirs
            dot_files = files
        elif self.dot_folders == 'excepted':  # dot dirs are handled like dot files
            dot_files = []
            dot_dirs = dot_files
        else:
            dot_dirs = []
            dot_files = []

        # Separate elements
        for entry in entries:
            is_dot = entry.name[0] == '.'
            is_dir = entry.is_dir(follow_symlinks=False)
            if is_dot:
                bucket = dot_dirs if is_dir else dot_files
            else:
                bucket = dirs if is_dir else files
            bucket.append(entry)

        # Handle logic
        if self.dot_folders == 'excepted':
            callback(dot_files)

        if self.folders == 'first':
            if self.dot_folders == 'separated':
                callback(dot_dirs)
                callback(dot_files)
            callback(dirs)
            callback(files)
        else:
            if self.dot_folders == 'separated':
                callback(dot_files)
                callback(dot_dirs)
            callback(files)
            callback(dirs)
# To allow simpler bootstrap – https://docs.python.org/3/library/os.html#os.DirEntry
class FakeDirEntry:
    """Minimal stand-in for ``os.DirEntry`` built from a plain path.

    Exposes ``path``, ``name``, ``inode()``, ``is_dir()``, ``is_file()``,
    ``is_symlink()`` and ``stat()``, plus a ``parent`` attribute. Stat results
    are cached separately for the symlink-following and non-following cases.
    """

    def __init__(self, file_path):
        """Normalize *file_path* and stat it right away.

        :param file_path: path of the entry; trailing slashes/backslashes are dropped
        :raises OSError: if the path cannot be lstat'ed (e.g. it does not exist)
        """
        from os.path import basename

        is_drive_root = os.name == 'nt' and len(file_path) == 3 and file_path[1] == ':'
        if is_drive_root:
            # "C:\" must keep its separator
            self.path = file_path
        else:
            # A path made only of separators (e.g. "/") would strip down to an
            # empty string, which cannot be stat'ed: keep it as given instead
            self.path = file_path.rstrip(r'\/') or file_path

        self.name = basename(self.path)
        # Everything before the separator preceding the name
        self.parent = self.path[:-(len(self.name) + 1)]

        # Indexed by the follow_symlinks boolean: [lstat result, stat result]
        self._cached_stat = [None, None]
        self.stat(follow_symlinks=False)  # Populate cache & exception directly if invalid path

    def inode(self):
        """Return the inode number of the entry itself (symlinks not followed)."""
        return self.stat(follow_symlinks=False).st_ino

    def is_dir(self, follow_symlinks=True):
        """Tell whether the entry is a directory."""
        return stat.S_ISDIR(self.stat(follow_symlinks=follow_symlinks).st_mode)

    def is_file(self, follow_symlinks=True):
        """Tell whether the entry is a regular file."""
        return stat.S_ISREG(self.stat(follow_symlinks=follow_symlinks).st_mode)

    def is_symlink(self):
        """Tell whether the entry itself is a symbolic link."""
        return stat.S_ISLNK(self.stat(follow_symlinks=False).st_mode)

    def stat(self, follow_symlinks=True):
        """Return the (cached) os.stat result, or os.lstat result when not following symlinks."""
        cached = self._cached_stat[follow_symlinks]
        if cached is None:
            cached = os.stat(self.path) if follow_symlinks else os.lstat(self.path)
            self._cached_stat[follow_symlinks] = cached
        return cached
class Checksummer:
    """Hash files with one or more algorithms and print one line per file and algorithm.

    Counters for directories, files and bytes processed are accumulated over
    all calls to checksum() and exposed through get_stats().
    """

    chunk_size = 64 * 1024  # 64k

    def __init__(self, algos, browser=None, recurse=False, dereference=False, out=None, tag_style=False):
        """
        :param algos: iterable of algorithm names, each a key of ``hash_algos``
        :param browser: object providing ``scandir(path, callback)``; a default
            OrderedBrowser is created per instance when omitted
        :param recurse: descend into the sub-directories of a base directory
        :param dereference: follow symbolic links when testing entry types
        :param out: file object receiving the hash lines; the current
            ``sys.stdout`` when omitted
        :param tag_style: use the BSD style "algo (path) = hash"; this style is
            also used whenever more than one algorithm is requested
        """
        self.algos = OrderedDict((algo, self._algo_code2tag(algo)) for algo in algos)
        # Resolved at call time: defaults written in the signature would be
        # evaluated only once, when the class body is executed
        self.browser = OrderedBrowser() if browser is None else browser
        self.recurse = recurse
        self.dereference = dereference
        self.out = sys.stdout if out is None else out

        if tag_style or len(self.algos) > 1:
            self.out_template = '{algo} ({path}) = {hash}'
        else:
            self.out_template = '{hash} {path}'
        self.tag_style = tag_style

        # Init stats
        self.total_dirs = self.total_files = self.total_files_size = 0

    def checksum(self, path):
        """Hash *path* if it is a file, or the entries it contains if it is a directory.

        :raises Exception: if *path* is neither a regular file nor a directory
            (typically a symlink while ``dereference`` is off)
        :raises OSError: if *path* cannot be stat'ed
        """
        first = FakeDirEntry(path)
        if first.is_file(follow_symlinks=self.dereference):
            self._process_entry(first, recurse=False)
        elif first.is_dir(follow_symlinks=self.dereference):
            self.browser.scandir(path, lambda entries: self._process_entries(entries, self.recurse))
        else:
            raise Exception("Probably trying to run on a symlink, sorry they're not followed unless you ask for it")

    @staticmethod
    def _algo_code2tag(algo):
        """Turn an algorithm code into its display tag.

        Underscores become dashes and everything before the first digit is
        upper-cased: 'sha3_256' -> 'SHA3-256', 'blake2b' -> 'BLAKE2b'. A name
        without any digit is upper-cased entirely.
        """
        algo = algo.replace('_', '-')
        for i, c in enumerate(algo):
            if c.isdigit():
                return algo[:i].upper() + algo[i:]
        return algo.upper()

    def get_stats(self):
        """Return the counters accumulated so far, as a dict with 'dirs', 'files' and 'bytes' keys."""
        return {
            'dirs': self.total_dirs,
            'files': self.total_files,
            'bytes': self.total_files_size,
        }

    def _process_entries(self, entries, recurse=True):
        """Process every entry of *entries* in order."""
        for entry in entries:
            self._process_entry(entry, recurse)

    def _process_entry(self, entry, recurse=True):
        """Recurse into *entry* if it is a directory (and *recurse* is set), hash it if it is a file.

        Entries that are neither are ignored. A failure while reading a file is
        reported on stderr and the file is skipped without being counted.
        """
        if recurse and entry.is_dir(follow_symlinks=self.dereference):
            self.total_dirs += 1
            self.browser.scandir(entry.path, self._process_entries)

        elif entry.is_file(follow_symlinks=self.dereference):
            try:
                hash_objs = {algo: hash_algos[algo]() for algo in self.algos}
                with open(entry.path, 'rb') as entry_handle:
                    # read() would try to fit the file in memory ..
                    # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
                    # https://docs.python.org/3/library/functions.html#iter
                    for chunk in iter(lambda: entry_handle.read(self.chunk_size), b''):
                        for hash_obj in hash_objs.values():
                            hash_obj.update(chunk)
                    entry_size = entry_handle.tell()  # Try to avoid to stat as we already know the size
            except Exception as e:
                print('Error hashing "{}": {}({})'.format(entry.path, e.__class__.__name__, e), file=sys.stderr)
            else:
                for algo_code, algo_tag in self.algos.items():
                    print(self.out_template.format(
                            algo=algo_tag, path=entry.path, hash=hash_objs[algo_code].hexdigest()
                        ),
                        file=self.out
                    )
                if len(self.algos) > 1:
                    # Blank line between the groups of lines of two files
                    print(file=self.out)

                self.total_files += 1
                self.total_files_size += entry_size
# Script entry point: Ctrl-C and a closed output pipe both end the run quietly.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        pass
    except IOError as exc:
        # Only a broken pipe (e.g. output piped into `head`) is tolerated
        if exc.errno == errno.EPIPE:
            pass
        else:
            raise
r''' | |
About long paths on Windows: | |
- https://bugs.python.org/issue18199 | |
- https://bugs.python.org/issue27731 | |
- https://blogs.msdn.microsoft.com/jeremykuhne/2016/06/21/more-on-new-net-path-handling/ | |
- https://blogs.msdn.microsoft.com/jeremykuhne/2016/07/30/net-4-6-2-and-long-paths-on-windows-10/ | |
- https://lifehacker.com/windows-10-allows-file-names-longer-than-260-characters-1785201032 | |
- https://betanews.com/2016/05/29/long-paths-windows-10/ | |
- https://social.msdn.microsoft.com/Forums/en-US/fc85630e-5684-4df6-ad2f-5a128de3deef/260-character-explorer-path-length-limit?forum=windowsgeneraldevelopmentissues | |
- https://msdn.microsoft.com/en-us/library/aa365247%28VS.85%29.aspx?f=255&MSPPError=-2147217396#maxpath | |
- https://msdn.microsoft.com/en-us/library/windows/desktop/aa374191(v=vs.85).aspx | |
- https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx#maxpath / https://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx#maxpath | |
- https://github.com/python/cpython/blob/master/PC/python.manifest | |
- https://stackoverflow.com/questions/36219317/pathname-too-long-to-open/36219497 | |
Contrary to what aa365247 says ("can also"), both the registry key AND the manifest are required .. | |
Alternative solutions (workaround / hacks): | |
- https://docs.python.org/3/library/os.html#os.listdir | |
handle paths manually, with listdir for example, adding '\\?\' when required ; this is heavy, you must carry the path | |
- https://docs.python.org/3/library/os.html#os.scandir | |
scandir cannot be used directly with '\\?\', you lose the relative path, or | |
using it with relative calls (cd each time) + other calls using '\\?\' you lose the interest of scandir | |
(and it may not return the information without \\.\ on long paths) | |
- https://docs.python.org/3/library/pathlib.html / https://www.python.org/dev/peps/pep-0428/ / https://github.com/python/cpython/blob/3.6/Lib/pathlib.py | |
pathlib doesn't handle '\\?\' and doesn't allow to chdir by itself ; also follows symlinks by default and doesn't use scandir | |
Also, even if you cd for each browse to make open('short-name'), the full path can still cause issues. The '\\.\' notation has to be used. | |
And there's normalization things required as explained in the issues / blogs. | |
And there's probably more, see https://bugs.python.org/issue18199#msg191035 | |
''' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment