Skip to content

Instantly share code, notes, and snippets.

@u1735067
Last active September 4, 2020 11:50
Show Gist options
  • Save u1735067/642186956632bdecd84989d710d1a813 to your computer and use it in GitHub Desktop.
Save u1735067/642186956632bdecd84989d710d1a813 to your computer and use it in GitHub Desktop.
Unix-like checksummer allowing to use any included Python algorithm
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
import sys, errno, os, argparse, re, stat, hashlib
from collections import OrderedDict
# Start from every algorithm hashlib guarantees on this interpreter: name -> constructor
hash_algos = dict(
    (name, getattr(hashlib, name))
    for name in hashlib.algorithms_guaranteed
)

# Fall back on the pyblake2 package when hashlib lacks one of the BLAKE2 variants
if not all(blake_name in hash_algos for blake_name in ('blake2b', 'blake2s')):
    try:
        import pyblake2
        hash_algos.update({
            'blake2b': pyblake2.blake2b,
            'blake2s': pyblake2.blake2s,
        })
    except ImportError:
        # pyblake2 is optional: just go on without the BLAKE2 algorithms
        pass

# Sort the algorithms and leave the SHAKE ones out because their digest length is variable,
# and the existing tools do not agree on the length to output:
#   Unix:    https://metacpan.org/source/MSHELOR/Digest-SHA3-1.04/src/sha3.h#L70
#            https://metacpan.org/source/MSHELOR/Digest-SHA3-1.04/sha3sum#L220
#            -> output 1344b (336 hexchr) & 1088b (272 hexchr)
#            Why?
#            https://keccak.team/software.html
#            https://crypto.stackexchange.com/questions/43718/if-the-output-size-of-shake128-256-is-variable-why-is-the-security-fixed-at-128
#            Because of https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Algorithm-Validation-Program/documents/sha3/sha3vs.pdf ?
#   OpenSSL: https://github.com/openssl/openssl/blob/OpenSSL_1_1_1g/crypto/evp/m_sha3.c#L382
#            https://www.openssl.org/docs/man1.1.1/man3/EVP_sha3_224.html
#            -> output 128b (32 hexchr) & 256b (64 hexchr)
hash_algos = OrderedDict(
    (name, hash_algos[name])
    for name in sorted(name for name in hash_algos if 'shake' not in name)
)
# Tri-state flag: None when not running on Windows, otherwise whether ntdll
# reports long paths as enabled (False when ntdll does not expose the query).
LongPathsEnabled = None
if os.name == 'nt':
    import ctypes
    ntdll = ctypes.WinDLL('ntdll')
    if not hasattr(ntdll, 'RtlAreLongPathsEnabled'):
        LongPathsEnabled = False
    else:
        # Declare the prototype before calling: no argument, one byte returned
        ntdll.RtlAreLongPathsEnabled.argtypes = ()
        ntdll.RtlAreLongPathsEnabled.restype = ctypes.c_ubyte
        LongPathsEnabled = bool(ntdll.RtlAreLongPathsEnabled())
class ArgumentDefaultsFileHelpFormatter(argparse.HelpFormatter):
    """Help formatter appending the default value of an argument to its help text.

    When the default's class is named 'TextIOWrapper', the ``name`` attribute of
    the default is shown instead of the object itself.
    """

    def _get_help_string(self, action):
        """Return the help of `action`, completed with its default when relevant."""
        text = action.help

        # The help already mentions the default: nothing to add
        if '%(default)' in text:
            return text

        fallback = action.default
        if fallback is argparse.SUPPRESS or fallback is None:
            return text

        # Only options and optional positionals can actually fall back on a default
        uses_default = action.option_strings or action.nargs in (argparse.OPTIONAL, argparse.ZERO_OR_MORE)
        if not uses_default:
            return text

        if fallback.__class__.__name__ == 'TextIOWrapper':
            return text + ' (default: {})'.format(fallback.name)
        return text + ' (default: %(default)s)'
def main():
    """Command-line entry point: parse the arguments, hash the paths, print the stats.

    A warning is printed on stderr when running on Windows without long paths
    enabled. Every BASE_PATH is then handed to a Checksummer configured from the
    options, and with -v/--stats the totals are reported on stderr at the end.
    """
    if LongPathsEnabled is False:
        print(r'''
/!\ LongPathsEnabled is not set, you'll face issue for paths longer than 255 characters.
If you are under Win10 >= 1607, please consider setting
HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\FileSystem\LongPathsEnabled=1
More informations at the end of the sources of this script.
''', file=sys.stderr)

    parser = argparse.ArgumentParser(
        formatter_class=ArgumentDefaultsFileHelpFormatter,
        description='Unix-like checksummer allowing to use any included Python algorithm',
        epilog='Files are read in binary mode, text mode makes no sense ..'
    )
    parser.add_argument('base_paths', metavar='BASE_PATH', nargs='+',  # type=str, nargs=None,
                        help='file or directory to hash')
    parser.add_argument('-a', '--algos', nargs='+',
                        choices=sorted(hash_algos, key=OrderedBrowser.sort_natural),
                        default=hash_algos,
                        help='algorithm(s) to hash files with')
    parser.add_argument('-r', '--recurse', action='store_true', help='process sub-directories')
    parser.add_argument('-L', '--dereference', action='store_true', help='follow symbolic links')
    parser.add_argument('--tag', action='store_true', help='use BSD style output "algo (path) = hash"')

    parser_order = parser.add_argument_group('processing order')
    parser_order.add_argument('-s', '--sort', choices=['no', 'lexicographical', 'natural'], default='natural',
                              help='elements sorting method')
    parser_order.add_argument('-f', '--folders', choices=['first', 'last', 'with-files'], default='with-files',
                              help='handling of the folders in the output')
    parser_order.add_argument('-d', '--dot-folders', choices=['same', 'separated', 'excepted'], default='same',
                              help='with separated, dot folders will be placed with dot files (first or last _in_ dot files); '
                                   'with excepted, they will be treated like dot files')

    parser_output = parser.add_argument_group('output related')
    parser_output.add_argument('-v', '--stats', action='store_true', help='display stats on stderr at the end')
    # https://docs.python.org/3/library/functions.html#open
    parser_output.add_argument('-o', '--output', metavar='FILE', type=argparse.FileType('w'), default=sys.stdout,
                               help='output the hash to a file (UTF-8 with BOM)')

    args = parser.parse_args()

    chksum = Checksummer(
        algos=args.algos,
        browser=OrderedBrowser(policy=args),
        recurse=args.recurse,
        # Forward -L/--dereference, otherwise the option is parsed but has no effect
        dereference=args.dereference,
        out=args.output,
        tag_style=args.tag
    )
    for base_path in args.base_paths:
        chksum.checksum(base_path)

    if args.stats:
        stats = chksum.get_stats()
        if args.recurse:
            print('Hashed {} files in {} directories for a total of {} {}'.format(
                stats['files'], stats['dirs'], *size_to_human(stats['bytes'])
            ), file=sys.stderr)
        else:
            print('Hashed {} files for a total of {} {}'.format(
                stats['files'], *size_to_human(stats['bytes'])
            ), file=sys.stderr)
def size_to_human(size, formatted_number=True, units_iec=True):
    """Scale a byte count to the largest fitting unit.

    size: amount of bytes
    formatted_number: when true, the value is returned as text with at most two
        decimals and no superfluous zeros (3.00 -> 3, 3.141 -> 3.14); otherwise
        the scaled number itself is returned
    units_iec: when true, scale by 1024 and use KiB, MiB, ...; otherwise scale
        by 1000 and use KB, MB, ...

    Returns a (value, unit) tuple.
    """
    units = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')
    step = 1024 if units_iec else 1000
    last_rank = len(units) - 1

    # Climb the units while the value does not fit, stopping at the biggest one
    rank = 0
    while rank < last_rank and size >= step:
        size /= step
        rank += 1

    unit = units[rank]
    if not units_iec:
        unit = unit.replace('i', '')

    # https://stackoverflow.com/questions/2440692/formatting-floats-in-python-without-superfluous-zeros
    if formatted_number:
        size = format(size, '.2f').rstrip('0').rstrip('.')

    return (size, unit)
class OrderedBrowser:
    """List directories and hand their entries to a callback in a configurable order.

    The order is driven by three settings:
      - sort_algo: key function applied to the entry names, or None to keep the listing order
      - folders: 'first' / 'last' to group the directories before / after the files,
        anything else to leave them mixed with the files
      - dot_folders: 'same' (dot entries are handled like the others), 'separated'
        (dot directories and dot files are emitted before the regular entries) or
        'excepted' (dot directories are emitted together with the dot files)
    """

    def __init__(self, policy=None):
        """Configure the browser.

        policy: object exposing `sort`, `folders` and `dot_folders` attributes
            (the parsed command-line arguments); when None, natural sort with
            the folders last and the dot entries handled like the others.

        Raises Exception when policy.sort is not a known sorting method.
        """
        if policy is not None:
            if policy.sort == 'no':
                self.sort_algo = None
            elif policy.sort == 'natural':
                self.sort_algo = self.sort_natural
            elif policy.sort == 'lexicographical':
                self.sort_algo = self.sort_lexicographical
            else:
                raise Exception('Unknown sort')
            # Check done by argparse
            self.folders = policy.folders
            self.dot_folders = policy.dot_folders
        else:
            self.sort_algo = self.sort_natural
            self.folders = 'last'
            self.dot_folders = 'same'

        # os.scandir is not available on every interpreter: fall back on listdir
        self.scandir_method = os.scandir if hasattr(os, 'scandir') else self._fake_scandir

    @staticmethod
    def sort_natural(s, _nsre=re.compile('([0-9]+)')):
        """Return a sort key ordering the digit runs of `s` by value, ignoring case.

        The key alternates text chunks (even positions) and integers (odd
        positions), so two keys never compare a string with a number. An empty
        leading chunk (name starting with a digit) is replaced by '0'.
        """
        # Splitting on a capturing group puts the captured digit runs at the odd
        # indexes: rely on that rather than on str.isdigit(), which also accepts
        # characters the pattern does not capture (and int() may reject them).
        return [
            int(text) if i % 2 else (text.lower() if (i > 0 or text) else '0')
            for i, text in enumerate(_nsre.split(s))
        ]

    @staticmethod
    def sort_lexicographical(s):
        """Return `s` itself as the sort key."""
        return s

    @staticmethod
    def _fake_scandir(path):
        """Replacement for os.scandir built on os.listdir and FakeDirEntry."""
        return map(lambda entry: FakeDirEntry(os.sep.join((path, entry))), os.listdir(path))

    def scandir(self, path, callback):
        """List `path` and pass its entries to `callback`, one group at a time.

        callback: callable receiving an iterable of directory entries; it is
            called once when directories stay mixed with the files, several
            times otherwise (some groups may be empty).

        A failure to list the directory is reported on stderr and the directory
        is skipped.
        """
        try:
            entries = self.scandir_method(path)
        except Exception as e:
            print('Failed to list "{}": {}'.format(path, e), file=sys.stderr)
            return

        if self.sort_algo is not None:
            entries = list(entries)
            entries.sort(key=lambda dir_entry: self.sort_algo(dir_entry.name))

        if self.folders not in ['first', 'last']:
            # Directories are kept with the files: a single group
            callback(entries)
            return

        dirs = []
        files = []
        dot_dirs = []
        dot_files = []
        # Alias the lists rather than branching on the policy for every entry
        if self.dot_folders == 'excepted':  # dot_dirs will actually fill dot_files
            dot_dirs = dot_files
        if self.dot_folders == 'same':  # actually fill dirs and files
            dot_dirs = dirs
            dot_files = files

        # Separate elements
        for entry in entries:
            is_dot = entry.name[0] == '.'
            is_dir = entry.is_dir(follow_symlinks=False)
            if is_dot:
                group = dot_dirs if is_dir else dot_files
            else:
                group = dirs if is_dir else files
            group.append(entry)

        # Handle logic
        if self.dot_folders == 'excepted':
            callback(dot_files)
        if self.folders == 'first':
            if self.dot_folders == 'separated':
                callback(dot_dirs)
                callback(dot_files)
            callback(dirs)
            callback(files)
        else:
            if self.dot_folders == 'separated':
                callback(dot_files)
                callback(dot_dirs)
            callback(files)
            callback(dirs)
# To allow simpler bootstrap – https://docs.python.org/3/library/os.html#os.DirEntry
class FakeDirEntry:
    """Minimal stand-in for os.DirEntry, built from a plain path.

    Exposes the same `path` / `name` attributes and the `inode`, `is_dir`,
    `is_file`, `is_symlink` and `stat` methods, with the stat results cached.
    """

    def __init__(self, file_path):
        """Wrap `file_path`; raises the error of os.lstat() when the path is invalid."""
        from os.path import basename

        # A Windows drive root ('C:\') must keep its trailing separator
        is_drive_root = os.name == 'nt' and len(file_path) == 3 and file_path[1] == ':'
        if is_drive_root:
            self.path = file_path
        else:
            # A path made only of separators (e.g. '/') must not be stripped down to ''
            self.path = file_path.rstrip(r'\/') or file_path
        self.name = basename(self.path)
        self.parent = self.path[:-(len(self.name) + 1)]
        # Indexed by the follow_symlinks flag: [lstat result, stat result]
        self._cached_stat = [None, None]
        self.stat(follow_symlinks=False)  # Populate cache & exception directly if invalid path

    def inode(self):
        """Return the inode number of the entry (symlinks not followed)."""
        return self.stat(follow_symlinks=False).st_ino

    def is_dir(self, follow_symlinks=True):
        """Tell whether the entry is a directory."""
        return stat.S_ISDIR(self.stat(follow_symlinks=follow_symlinks).st_mode)

    def is_file(self, follow_symlinks=True):
        """Tell whether the entry is a regular file."""
        return stat.S_ISREG(self.stat(follow_symlinks=follow_symlinks).st_mode)

    def is_symlink(self):
        """Tell whether the entry itself is a symbolic link."""
        return stat.S_ISLNK(self.stat(follow_symlinks=False).st_mode)

    def stat(self, follow_symlinks=True):
        """Return the os.stat() (or os.lstat()) result of the entry, computed once."""
        if self._cached_stat[follow_symlinks] is None:
            self._cached_stat[follow_symlinks] = os.stat(self.path) if follow_symlinks else os.lstat(self.path)
        return self._cached_stat[follow_symlinks]
class Checksummer:
    """Hash files with one or several algorithms and print the digests.

    The totals of processed directories, files and bytes are tracked and can be
    retrieved with get_stats().
    """
    chunk_size = 64 * 1024  # 64k, amount read from a file at once

    def __init__(self, algos, browser=None, recurse=False, dereference=False, out=sys.stdout, tag_style=False):
        """Configure the checksummer.

        algos: iterable of algorithm codes, looked up in the module-level hash_algos
        browser: object providing scandir(path, callback); a default
            OrderedBrowser is created when None
        recurse: whether the sub-directories of a base directory are processed
        dereference: whether symbolic links are followed
        out: stream the digests are printed to
        tag_style: use the 'algo (path) = hash' format even with a single
            algorithm (it is always used with several algorithms)
        """
        self.algos = OrderedDict((algo, self._algo_code2tag(algo)) for algo in algos)
        # Created here rather than in the signature, where the default would be
        # evaluated once at definition time and shared by every instance
        self.browser = OrderedBrowser() if browser is None else browser
        self.recurse = recurse
        self.dereference = dereference
        self.out = out
        if tag_style or len(self.algos) > 1:
            self.out_template = '{algo} ({path}) = {hash}'
        else:
            self.out_template = '{hash} {path}'
        self.tag_style = tag_style
        # Init stats
        self.total_dirs = self.total_files = self.total_files_size = 0

    def checksum(self, path):
        """Hash the file `path`, or the content of the directory `path`.

        Raises Exception when `path` is neither a regular file nor a directory
        (typically a symlink while links are not followed).
        """
        first = FakeDirEntry(path)
        if first.is_file(follow_symlinks=self.dereference):
            self._process_entry(first, recurse=False)
        elif first.is_dir(follow_symlinks=self.dereference):
            self.browser.scandir(path, lambda entries: self._process_entries(entries, self.recurse))
        else:
            raise Exception("Probably trying to run on a symlink, sorry they're not followed unless you ask for it")

    @staticmethod
    def _algo_code2tag(algo):
        """Turn an algorithm code into its display tag, e.g. 'sha3_256' -> 'SHA3-256'.

        Underscores become dashes and everything before the first digit is
        upper-cased; a name without any digit is upper-cased entirely.
        """
        algo = algo.replace('_', '-')
        for i, c in enumerate(algo):
            if c.isdigit():
                return algo[:i].upper() + algo[i:]
        return algo.upper()

    def get_stats(self):
        """Return the totals as a dict with the 'dirs', 'files' and 'bytes' keys."""
        return {
            'dirs': self.total_dirs,
            'files': self.total_files,
            'bytes': self.total_files_size,
        }

    def _process_entries(self, entries, recurse=True):
        """Process every entry of `entries` in order."""
        for entry in entries:
            self._process_entry(entry, recurse)

    def _process_entry(self, entry, recurse=True):
        """Browse `entry` when it is a directory (and `recurse` is set), hash it when it is a file.

        A file that cannot be hashed is reported on stderr and left out of the
        output and of the stats.
        """
        if recurse and entry.is_dir(follow_symlinks=self.dereference):
            self.total_dirs += 1
            self.browser.scandir(entry.path, self._process_entries)
        elif entry.is_file(follow_symlinks=self.dereference):
            try:
                hash_objs = {algo: hash_algos[algo]() for algo in self.algos}
                with open(entry.path, 'rb') as entry_handle:
                    # read() would try to fit the file in memory ..
                    # https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file
                    # https://docs.python.org/3/library/functions.html#iter
                    for chunk in iter(lambda: entry_handle.read(self.chunk_size), b''):
                        for hash_obj in hash_objs.values():
                            hash_obj.update(chunk)
                    entry_size = entry_handle.tell()  # Try to avoid to stat as we already know the size
            except Exception as e:
                print('Error hashing "{}": {}({})'.format(entry.path, e.__class__.__name__, e), file=sys.stderr)
            else:
                for algo_code, algo_tag in self.algos.items():
                    print(
                        self.out_template.format(
                            algo=algo_tag, path=entry.path, hash=hash_objs[algo_code].hexdigest()
                        ),
                        file=self.out
                    )
                # With several algorithms, separate the files with an empty line
                if len(self.algos) > 1:
                    print(file=self.out)
                self.total_files += 1
                self.total_files_size += entry_size
# Script entry point
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Interrupted by the user: leave quietly, without a traceback
        pass
    except IOError as io_error:
        # A broken pipe (reader of the output gone) is ignored, anything else is a real error
        if io_error.errno == errno.EPIPE:
            pass
        else:
            raise
r'''
About long paths on Windows:
- https://bugs.python.org/issue18199
- https://bugs.python.org/issue27731
- https://blogs.msdn.microsoft.com/jeremykuhne/2016/06/21/more-on-new-net-path-handling/
- https://blogs.msdn.microsoft.com/jeremykuhne/2016/07/30/net-4-6-2-and-long-paths-on-windows-10/
- https://lifehacker.com/windows-10-allows-file-names-longer-than-260-characters-1785201032
- https://betanews.com/2016/05/29/long-paths-windows-10/
- https://social.msdn.microsoft.com/Forums/en-US/fc85630e-5684-4df6-ad2f-5a128de3deef/260-character-explorer-path-length-limit?forum=windowsgeneraldevelopmentissues
- https://msdn.microsoft.com/en-us/library/aa365247%28VS.85%29.aspx?f=255&MSPPError=-2147217396#maxpath
- https://msdn.microsoft.com/en-us/library/windows/desktop/aa374191(v=vs.85).aspx
- https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx#maxpath / https://msdn.microsoft.com/en-us/library/aa365247(VS.85).aspx#maxpath
- https://github.com/python/cpython/blob/master/PC/python.manifest
- https://stackoverflow.com/questions/36219317/pathname-too-long-to-open/36219497
Contrary to what aa365247 says ("can also"), both the registry key AND the manifest are required ..
Alternative solutions (workaround / hacks):
- https://docs.python.org/3/library/os.html#os.listdir
handle paths manually, with listdir for example, adding '\\?\' when required ; this is heavy, you must carry the path
- https://docs.python.org/3/library/os.html#os.scandir
scandir cannot be used directly with '\\?\', you lose the relative path, or
using it with relative calls (cd each time) + other calls using '\\?\' you lose the interest of scandir
(and it may not return the information without \\.\ on long paths)
- https://docs.python.org/3/library/pathlib.html / https://www.python.org/dev/peps/pep-0428/ / https://github.com/python/cpython/blob/3.6/Lib/pathlib.py
pathlib doesn't handle '\\?\' and doesn't allow to chdir by itself ; also follows symlinks by default and doesn't use scandir
Also, even if you cd for each browse to make open('short-name'), the full path can still cause issues. The '\\.\' notation has to be used.
And there are normalization steps required, as explained in the issues / blogs.
And there's probably more, see https://bugs.python.org/issue18199#msg191035
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment