Last active
July 5, 2022 15:42
-
-
Save whypro/1c9d86171223aa0424ea19401459dba8 to your computer and use it in GitHub Desktop.
Simple and powerful checksum tool for backup.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/env python3 | |
| import os | |
| import sys | |
| import hashlib | |
| import datetime | |
| import fnmatch | |
def generate(checksum_file_path, excludes=None):
    """Compute MD5 checksums for the files under './' and store them.

    Entries already present in ``checksum_file_path`` (matched by path) are
    not hashed again, so an interrupted run can be resumed.

    Args:
        checksum_file_path: Checksum file to load existing entries from and
            to write the updated entries to.
        excludes: Optional iterable of ``fnmatch`` patterns; an entry whose
            path matches any of them is skipped. ``None`` means no patterns.

    Raises:
        Whatever the walk or hashing raises (including KeyboardInterrupt);
        the checksums collected so far are written out before re-raising.
    """
    # Copy into a tuple: avoids a shared mutable default and lets callers
    # pass any iterable.
    patterns = tuple(excludes) if excludes is not None else ()
    md5_dict = load_checksum(checksum_file_path)
    try:
        for entry in walk('./'):
            # Report only the first matching pattern; one match is enough
            # to exclude the entry.
            matched = next(
                (p for p in patterns if fnmatch.fnmatch(entry.path, p)),
                None,
            )
            if matched is not None:
                print(f'{entry.path} exclude for pattern {matched}.')
                continue
            # Entries are keyed by the MD5 of the path itself.
            path_md5 = hashlib.md5(entry.path.encode('utf-8')).hexdigest()
            if path_md5 in md5_dict:
                print(f'{entry.path} skip md5sum.')
                continue
            content_md5 = md5_for_file(entry.path)
            md5_dict[path_md5] = (entry.path, content_md5)
            print(f'{entry.path} {content_md5}')
    finally:
        # Persist progress on success and on any failure or interrupt.
        dump_checksum(md5_dict, checksum_file_path)
def md5_for_file(path, block_size=2**20):
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is opened in binary mode and read ``block_size`` bytes at a
    time, so the whole content is never held in memory at once.
    """
    with open(path, 'rb') as stream:
        digest = hashlib.md5()
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(block_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
def dump_checksum(md5_dict, checksum_file_path):
    """Write the checksum entries to ``checksum_file_path``, sorted by path.

    ``md5_dict`` values are ``(path, content_md5)`` tuples; each is written
    as one ``<content_md5> <path>`` line. The file is overwritten.
    """
    with open(checksum_file_path, 'w') as out:
        records = list(md5_dict.values())
        records.sort(key=lambda record: record[0])
        out.writelines(
            f'{digest} {file_path}\n' for file_path, digest in records
        )
def load_checksum(checksum_file_path):
    """Load a checksum file into a dict keyed by the MD5 of each path.

    Each non-blank line must start with a 32-character content digest; the
    character after it is a separator and the rest of the line, stripped of
    surrounding whitespace, is the file path. Blank lines are ignored.

    Args:
        checksum_file_path: Path of the checksum file to read.

    Returns:
        A dict mapping ``md5(path)`` (hex, UTF-8 encoded path) to a
        ``(path, content_md5)`` tuple. Empty if the file does not exist.

    Raises:
        ValueError: If a non-blank line does not begin with a 32-character
            digest field.
    """
    md5_dict = {}
    try:
        # Open directly instead of testing os.path.exists first: no window
        # between the check and the open.
        with open(checksum_file_path, 'r') as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. left by hand editing).
                    continue
                content_md5 = line[0:32].strip()
                if len(content_md5) != 32:
                    raise ValueError('invalid checksum file')
                # Skip the single separator character after the digest.
                path = line[32+1:].strip()
                path_md5 = hashlib.md5(path.encode('utf-8')).hexdigest()
                md5_dict[path_md5] = (path, content_md5)
    except FileNotFoundError:
        return {}
    return md5_dict
def walk(path):
    """Recursively yield an ``os.DirEntry`` for each non-directory entry.

    Directories are descended into; symbolic links to directories are not
    followed and are yielded as entries themselves.

    Args:
        path: Directory to start scanning from.
    """
    # Use the scandir iterator as a context manager so its directory handle
    # is closed even when the caller stops iterating early or an error
    # propagates through the generator.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_dir(follow_symlinks=False):
                yield from walk(entry.path)
            else:
                yield entry
def dump_result(result_dict, result_file_path):
    """Write the verification results to ``result_file_path``, sorted by path.

    ``result_dict`` values are ``(result, path)`` tuples; each is written as
    one ``<result> <path>`` line. The file is overwritten.
    """
    with open(result_file_path, 'w') as out:
        records = list(result_dict.values())
        records.sort(key=lambda record: record[1])
        out.writelines(
            f'{status} {file_path}\n' for status, file_path in records
        )
def load_result(result_file_path):
    """Load a result file into a dict keyed by the MD5 of each path.

    Lines are stripped and must start with one of ``OK``, ``FAILED`` or
    ``MISSING``; the character after the status is a separator and the rest
    of the line is the path. Other lines are ignored.

    Args:
        result_file_path: Path of the result file to read.

    Returns:
        A dict mapping ``md5(path)`` (hex string, UTF-8 encoded path) to a
        ``(result, path)`` tuple. Empty if the file does not exist.
    """
    result_dict = {}
    try:
        with open(result_file_path, 'r') as f:
            for line in f:
                line = line.strip()
                for expected_result in ('OK', 'FAILED', 'MISSING'):
                    if line.startswith(expected_result):
                        path = line[len(expected_result)+1:].strip()
                        # Call hexdigest(): the key must be the digest
                        # string so it matches the keys that
                        # load_checksum builds.
                        path_md5 = hashlib.md5(path.encode('utf-8')).hexdigest()
                        result_dict[path_md5] = (expected_result, path)
                        break
    except FileNotFoundError:
        return {}
    return result_dict


def verify(checksum_file_path, result_file_path):
    """Check every file in the checksum file against its recorded MD5.

    Files already listed in ``result_file_path`` are reported as SKIPPED and
    not re-hashed, so an interrupted verification can be resumed. Each
    other file is recorded as ``MISSING`` (path does not exist), ``FAILED``
    (digest differs) or ``OK``.

    Args:
        checksum_file_path: Checksum file holding the expected digests.
        result_file_path: Result file to load earlier results from and to
            write the updated results to.

    Raises:
        Whatever hashing raises (including KeyboardInterrupt); the results
        collected so far are written out before re-raising.
    """
    md5_dict = load_checksum(checksum_file_path)
    result_dict = load_result(result_file_path)
    try:
        # Unpack in the loop header so ``path`` is bound before the SKIPPED
        # message uses it.
        for path_md5, (path, md5) in md5_dict.items():
            if path_md5 in result_dict:
                print(f'{path}: SKIPPED')
                continue
            if not os.path.exists(path):
                print(f'{path}: FAILED, file not exists')
                result_dict[path_md5] = ('MISSING', path)
                continue
            content_md5 = md5_for_file(path)
            if content_md5 != md5:
                print(f'{path}: FAILED, checksum mismatch')
                result_dict[path_md5] = ('FAILED', path)
                continue
            print(f'{path}: OK')
            result_dict[path_md5] = ('OK', path)
    finally:
        # Persist progress on success and on any failure or interrupt.
        dump_result(result_dict, result_file_path)
def usage():
    """Print the command-line usage text and exit with status 1."""
    help_lines = (
        'Usage:',
        '\tchecksum generate [CHECKSUM_FILE_PREFIX] [--excludes=PATTERN[,PATTERN,...]]',
        '\tchecksum verify [CHECKSUM_FILE_PREFIX]',
    )
    for text in help_lines:
        print(text)
    sys.exit(1)
if __name__ == "__main__":
    # Command-line entry point: dispatch on the first argument.
    argv = sys.argv
    command = argv[1] if len(argv) > 1 else None
    # File names use the given prefix, or today's date (YYYYMMDD) when no
    # prefix argument is supplied.
    if len(argv) > 2:
        prefix = argv[2]
    else:
        prefix = datetime.datetime.now().strftime('%Y%m%d')

    if command == "generate":
        # Built-in excludes: the tool's own output files and ./.deleted/.
        patterns = ['./*.md5', './*.result', './.deleted/*']
        if len(argv) > 3:
            option = argv[3]
            if not option.startswith('--excludes'):
                usage()
            option_value = option[len('--excludes'):].lstrip('=')
            patterns.extend(option_value.split(','))
        generate(checksum_file_path=f'{prefix}.md5', excludes=patterns)
    elif command == "verify":
        verify(
            checksum_file_path=f'{prefix}.md5',
            result_file_path=f'{prefix}.result',
        )
    else:
        usage()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment