Skip to content

Instantly share code, notes, and snippets.

@whypro
Last active July 5, 2022 15:42
Show Gist options
  • Select an option

  • Save whypro/1c9d86171223aa0424ea19401459dba8 to your computer and use it in GitHub Desktop.

Select an option

Save whypro/1c9d86171223aa0424ea19401459dba8 to your computer and use it in GitHub Desktop.
Simple and powerful checksum tool for backup.
#!/bin/env python3
import os
import sys
import hashlib
import datetime
import fnmatch
def generate(checksum_file_path, excludes=None):
    """Compute MD5 checksums for the files below the current directory.

    Walks ``./`` recursively. A file is skipped when its path matches one of
    the ``fnmatch`` patterns in *excludes*, or when its path is already
    recorded in *checksum_file_path*; every other file is hashed. The merged
    entries are written back to *checksum_file_path* whether the walk
    finishes or is interrupted (including by Ctrl-C), so a later run can
    continue from the entries already recorded.

    Args:
        checksum_file_path: Checksum file to load existing entries from and
            to write the updated entries to.
        excludes: Optional iterable of ``fnmatch`` patterns tested against
            each walked path (paths start with ``./``). ``None`` means no
            exclusions.
    """
    # Avoid a shared mutable default; None stands for "no patterns".
    patterns = () if excludes is None else excludes
    md5_dict = load_checksum(checksum_file_path)
    try:
        for entry in walk('./'):
            matched = [p for p in patterns if fnmatch.fnmatch(entry.path, p)]
            for exclude in matched:
                print(f'{entry.path} exclude for pattern {exclude}.')
            if matched:
                continue
            # Entries are keyed by the MD5 of the path itself.
            path_md5 = hashlib.md5(entry.path.encode('utf-8')).hexdigest()
            if path_md5 in md5_dict:
                print(f'{entry.path} skip md5sum.')
                continue
            content_md5 = md5_for_file(entry.path)
            md5_dict[path_md5] = (entry.path, content_md5)
            print(f'{entry.path} {content_md5}')
    finally:
        # Persist progress on success and on any exception alike; an
        # in-flight exception keeps propagating after the dump.
        dump_checksum(md5_dict, checksum_file_path)
def md5_for_file(path, block_size=2**20):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is opened in binary mode and consumed *block_size* bytes at a
    time, so the whole file is never held in memory at once.
    """
    with open(path, 'rb') as stream:
        digest = hashlib.md5()
        # iter() with a sentinel keeps calling read() until it returns b''.
        for chunk in iter(lambda: stream.read(block_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
def dump_checksum(md5_dict, checksum_file_path):
    """Write the checksum entries to *checksum_file_path*, sorted by path.

    *md5_dict* maps a key to a ``(path, content_md5)`` tuple; only the values
    are written, one ``<content_md5> <path>`` line per entry. Any existing
    file is overwritten.
    """
    with open(checksum_file_path, 'w') as out:
        ordered = sorted(md5_dict.values(), key=lambda record: record[0])
        for file_path, digest in ordered:
            out.write(f'{digest} {file_path}\n')
def load_checksum(checksum_file_path):
    """Load checksum entries from *checksum_file_path*.

    Each non-blank line holds a 32-character MD5 digest in columns 0-31, one
    separator character, and then the file path (surrounding whitespace is
    stripped from the path). Blank lines are ignored.

    Args:
        checksum_file_path: Path of the checksum file to read.

    Returns:
        A dict mapping the MD5 hex digest of each path (UTF-8 encoded) to a
        ``(path, content_md5)`` tuple. Empty when the file does not exist.

    Raises:
        ValueError: If a non-blank line does not start with a 32-character
            digest field.
    """
    md5_dict = dict()
    if not os.path.exists(checksum_file_path):
        return md5_dict
    with open(checksum_file_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # rejecting the whole file.
            if not line.strip():
                continue
            content_md5 = line[0:32].strip()
            if len(content_md5) != 32:
                raise ValueError(
                    f'invalid checksum file: {checksum_file_path}, '
                    f'line {line_number}'
                )
            path = line[32+1:].strip()
            path_md5 = hashlib.md5(path.encode('utf-8')).hexdigest()
            md5_dict[path_md5] = (path, content_md5)
    return md5_dict
def walk(path):
    """Recursively yield an ``os.DirEntry`` for every non-directory in *path*.

    Directories are descended into without following symlinks, so a symlink
    to a directory is yielded as an entry rather than traversed. Entries come
    in whatever order ``os.scandir`` returns them.

    Args:
        path: Directory to scan.

    Yields:
        ``os.DirEntry`` objects for files, symlinks and other non-directory
        entries.
    """
    # Use scandir as a context manager so the directory handle is closed
    # promptly, even when the generator is abandoned or an error interrupts
    # the walk.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                yield from walk(entry.path)
            else:
                yield entry
def dump_result(result_dict, result_file_path):
    """Write the verification results to *result_file_path*, sorted by path.

    *result_dict* maps a key to a ``(result, path)`` tuple; only the values
    are written, one ``<result> <path>`` line per entry. Any existing file is
    overwritten.
    """
    with open(result_file_path, 'w') as out:
        records = list(result_dict.values())
        records.sort(key=lambda record: record[1])
        for status, file_path in records:
            out.write(f'{status} {file_path}\n')
def load_result(result_file_path):
    """Load previously recorded verification results.

    Each recognised line starts with one of ``OK``, ``FAILED`` or ``MISSING``
    followed by a separator character and the file path. Lines starting with
    anything else are ignored.

    Args:
        result_file_path: Path of the result file to read.

    Returns:
        A dict mapping the MD5 hex digest of each path (UTF-8 encoded) to a
        ``(result, path)`` tuple. Empty when the file does not exist.
    """
    if not os.path.exists(result_file_path):
        return dict()
    result_dict = dict()
    with open(result_file_path, 'r') as f:
        for line in f:
            line = line.strip()
            for expected_result in ('OK', 'FAILED', 'MISSING'):
                if line.startswith(expected_result):
                    path = line[len(expected_result)+1:].strip()
                    # hexdigest must be *called*: the key has to be the
                    # digest string, the same form of key that verify()
                    # looks up, not a bound-method object.
                    path_md5 = hashlib.md5(path.encode('utf-8')).hexdigest()
                    result_dict[path_md5] = (expected_result, path)
                    break
    return result_dict
def verify(checksum_file_path, result_file_path):
    """Verify files on disk against the entries in *checksum_file_path*.

    Entries that already have a result in *result_file_path* are skipped, so
    an interrupted verification can be resumed. Every other entry is recorded
    as ``MISSING`` (path does not exist), ``FAILED`` (digest mismatch) or
    ``OK``. The results are written to *result_file_path* whether the loop
    finishes or is interrupted (including by Ctrl-C).

    Args:
        checksum_file_path: Checksum file holding the expected digests.
        result_file_path: Result file to load earlier results from and to
            write the updated results to.
    """
    md5_dict = load_checksum(checksum_file_path)
    result_dict = load_result(result_file_path)
    try:
        # Unpack the entry up front so `path` is bound before the SKIPPED
        # message uses it.
        for path_md5, (path, md5) in md5_dict.items():
            if path_md5 in result_dict:
                print(f'{path}: SKIPPED')
                continue
            if not os.path.exists(path):
                print(f'{path}: FAILED, file not exists')
                result_dict[path_md5] = ('MISSING', path)
                continue
            content_md5 = md5_for_file(path)
            if content_md5 != md5:
                print(f'{path}: FAILED, checksum mismatch')
                result_dict[path_md5] = ('FAILED', path)
                continue
            print(f'{path}: OK')
            result_dict[path_md5] = ('OK', path)
    finally:
        # Persist progress on success and on any exception alike; an
        # in-flight exception keeps propagating after the dump.
        dump_result(result_dict, result_file_path)
def usage():
    """Print the command-line synopsis to stdout and exit with status 1."""
    synopsis = (
        'Usage:',
        '\tchecksum generate [CHECKSUM_FILE_PREFIX] [--excludes=PATTERN[,PATTERN,...]]',
        '\tchecksum verify [CHECKSUM_FILE_PREFIX]',
    )
    for text in synopsis:
        print(text)
    sys.exit(1)
if __name__ == "__main__":
    # Without an explicit prefix, files are named after today's date:
    # YYYYMMDD.md5 and YYYYMMDD.result.
    today = datetime.datetime.now().strftime('%Y%m%d')
    if len(sys.argv) < 2:
        usage()
    command = sys.argv[1]

    # Separate "--option" arguments from positional ones so that, as the
    # usage text advertises, --excludes can be given with or without a
    # CHECKSUM_FILE_PREFIX instead of being mistaken for the prefix.
    rest = sys.argv[2:]
    options = [arg for arg in rest if arg.startswith('--')]
    positionals = [arg for arg in rest if not arg.startswith('--')]
    prefix = positionals[0] if positionals else today

    if command == "generate":
        if len(positionals) > 1:
            usage()
        # Never checksum this tool's own output files or the .deleted area.
        excludes = ['./*.md5', './*.result', './.deleted/*']
        for option in options:
            if not option.startswith('--excludes'):
                usage()
            exclude_args = option[len('--excludes'):].lstrip('=')
            # Drop empty patterns (e.g. a bare "--excludes" or a stray comma).
            excludes.extend(p for p in exclude_args.split(',') if p)
        generate(checksum_file_path=f'{prefix}.md5', excludes=excludes)
    elif command == "verify":
        # verify takes no options; reject them rather than silently ignore.
        if options:
            usage()
        verify(checksum_file_path=f'{prefix}.md5',
               result_file_path=f'{prefix}.result')
    else:
        usage()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment