Last active
January 21, 2018 05:12
-
-
Save ArjunRayGA/4475053bf248d8c762950072f2edaeee to your computer and use it in GitHub Desktop.
compare SHA checksum values between two files/dirs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| ''' | |
| SHA FILE/TREE COMPARATOR | |
| takes two args: | |
| file/dir string 1 | |
| file/dir string 2 | |
| compares all files in both locations for SHA checksum matches. | |
| // Only verified working on Darwin Kernel Version 17.3.0 // | |
| ''' | |
| import subprocess | |
| import sys | |
| from termcolor import cprint | |
| __author__ = "Arjun Ray" | |
| __license__ = "MIT" | |
| __version__ = "1.0.0" | |
| __email__ = "deconstructionalism@gmail.com" | |
# templates printed for every compared file; filled in with str.format using
# the keys 'fa', 'fb', 'fa_path', 'fb_path' and 'sha1'
MATCH_SUCCESS_STR = (
    u'\u2713 "{fa}": {fa_path}\n'
    u'== (SHARED SHA: {sha1})\n'
    u'\u2713 "{fb}": {fb_path}\n'
)
MATCH_FAILURE_STR = (
    u'\u2717 "{fa}": {fa_path}\n'
    u'not in ({fa} SHA: {sha1})\n'
    u'\u2717 "{fb}"\n'
)
# colored print helpers built on termcolor's cprint; written as real functions
# (not lambdas bound to names) so tracebacks show the helper's name


def PRINT_ERR_BOLD(x):
    '''print x in bold red'''
    return cprint(x, 'red', attrs=['bold'])


def PRINT_ERR(x):
    '''print x in red'''
    return cprint(x, 'red')


def PRINT_SUC_BOLD(x):
    '''print x in bold green'''
    return cprint(x, 'green', attrs=['bold'])


def PRINT_SUC(x):
    '''print x in green'''
    return cprint(x, 'green')


def PRINT_BOLD(x):
    '''print x in bold, default color'''
    return cprint(x, attrs=['bold'])
# modify these strings if you want to use a different find and sha shell method
# SHELL_FIND_STR is split on whitespace and '{arg}' is then filled in, token by
# token, with the file/dir arg. its stdout is piped into the SHELL_SHA_STR
# command, whose output is parsed as one '<sha> <path>' line per file
SHELL_FIND_STR = 'find ./{arg} -type f -print0'
SHELL_SHA_STR = 'xargs -0 shasum'
class CompareSHA(object):
    '''
    Compares the SHA checksums of all files found under two file/dir args.

    "public" methods:
        'get_shas' collects sha info for all files in both args
        'compare_shas' compares all sha checksums between all files and
        prints the result

    "public" attributes:
        'files_dict' per-arg checksum data, filled in by 'get_shas'
        'matches' / 'no_matches' counters updated by 'compare_shas'
    '''

    def __init__(self, arg1, arg2):
        '''
        stores both file/dir args and sets up empty result state

        raises ValueError if both args are the same string
        '''
        if arg1 == arg2:
            raise ValueError('both file/dir args are the same')
        self.arg1 = arg1
        self.arg2 = arg2
        self.__sha_outputs = []
        self.files_dict = {}
        # a set because it is only ever used for membership tests
        self.__compared = set()
        self.matches = 0
        self.no_matches = 0

    def get_shas(self):
        '''
        get sha info for all files in both args, saved in 'self.files_dict'

        returns self so that calls can be chained
        '''
        self.__run_shas()
        self.__output_to_dict()
        return self

    def __run_shas(self):
        '''
        runs the command in SHELL_FIND_STR, pipes result to SHELL_SHA_STR and
        captures the output lines in 'self.__sha_outputs', one entry per arg
        of format {'arg': arg, 'output': [line1, line2, ...]}

        returns self
        '''
        # start clean so calling 'get_shas' again does not pile up duplicates
        self.__sha_outputs = []
        for arg in [self.arg1, self.arg2]:
            # format AFTER splitting so an arg containing spaces stays a
            # single command line token
            find_args = [el.format(arg=arg) for el in SHELL_FIND_STR.split()]
            find = subprocess.Popen(find_args, stdout=subprocess.PIPE)
            try:
                sha = subprocess.Popen(SHELL_SHA_STR.split(),
                                       stdin=find.stdout,
                                       stdout=subprocess.PIPE)
                # the sha process owns its own copy of the pipe now; closing
                # ours lets 'find' get SIGPIPE if the sha process exits early
                find.stdout.close()
                raw_out = sha.communicate()[0]
            finally:
                # never leave the pipe open or the 'find' process unreaped,
                # even when starting the sha process failed
                find.stdout.close()
                find.wait()
            read_out = raw_out.decode("utf-8").strip().split('\n')
            self.__sha_outputs.append({
                'arg': arg,
                'output': read_out
            })
        return self

    def __output_to_dict(self):
        '''
        converts data in 'self.__sha_outputs' to a dict at 'self.files_dict' of
        format:
            {
                arg1: {
                    'shas': {
                        'sha_val1': 'file_path1',
                        'sha_val2': 'file_path2',
                        ...
                        'sha_valn': 'file_pathn'
                    },
                    'num_files' [number of entries in 'shas' key]
                },
                arg2: {
                    ""
                }
            }

        files sharing a sha within one arg collapse into a single entry, so
        'num_files' counts distinct sha values

        raises ImportError (and empties 'self.files_dict') if any output line
        for an arg has no sha value, i.e. no files were found there

        returns self
        '''
        # numbered from 1 so the error message lines up with arg1/arg2
        for i, output_dict in enumerate(self.__sha_outputs, 1):
            arg = output_dict['arg']
            sha_output = []
            for line in output_dict['output']:
                # partition never fails to unpack, even on a line that has
                # no space in it
                sha, _, file_path = line.partition(' ')
                sha_output.append((sha.strip(), file_path.strip()))
            if any(sha == '' for sha, _ in sha_output):
                self.files_dict = {}
                raise ImportError('no files found at ARG{}: "{}"'
                                  .format(i, arg))
            shas = dict(sha_output)
            self.files_dict[arg] = {
                'shas': shas,
                'num_files': len(shas)
            }
        return self

    def compare_shas(self):
        '''
        for each sha/file entry in 'self.files_dict', will compare sha keys for
        matches. if a match is found, the paths are printed as matches along
        with the sha value and the sha value is removed from matching. matching
        starts by comparing all the files from the argument with more files,
        then compares the remaining files for the other argument's files.
        finishes by printing a summary of the match/no-match counts

        raises AssertionError if 'get_shas' has not filled 'self.files_dict'
        '''
        # raised explicitly (instead of an assert statement) so the check
        # still runs under 'python -O'
        if not self.files_dict:
            raise AssertionError('you must run \'get_shas\' first')
        # biggest file count first; ties fall back to reverse name order
        args_by_len = sorted([[v['num_files'], k]
                              for k, v in self.files_dict.items()],
                             reverse=True)
        fa_name = args_by_len[0][1]
        fb_name = args_by_len[1][1]
        file_a = self.files_dict[fa_name]
        file_b = self.files_dict[fb_name]
        self.__compare_shas_per_files(file_a, file_b, fa_name, fb_name)
        # second pass in the other direction reports what is left over
        self.__compare_shas_per_files(file_b, file_a, fb_name, fa_name)
        PRINT_BOLD('SUMMARY\n{}'.format('-' * 20))
        PRINT_SUC_BOLD('{} MATCHES'.format(self.matches))
        if self.no_matches > 0:
            PRINT_ERR_BOLD('{} NO MATCHES'.format(self.no_matches))

    def __compare_shas_per_files(self, file_a, file_b, fa_name, fb_name):
        '''
        tries to match sha values in file_a to sha values in file_b, and prints
        match/no-match. sha values already handled by an earlier call are
        skipped; nothing is printed if every sha in file_a was already handled

        file_a, file_b: entries of 'self.files_dict' (dicts with a 'shas' key)
        fa_name, fb_name: the args those entries belong to, used in the output
        '''
        if self.__compared.issuperset(file_a['shas']):
            return
        PRINT_BOLD('COMPARING FILES IN "{}" TO "{}"\n{}'
                   .format(fa_name, fb_name, '-' * 40))
        for sha, file_path in file_a['shas'].items():
            if sha in self.__compared:
                continue
            self.__compared.add(sha)
            format_dict = {
                'fa': fa_name,
                'fb': fb_name,
                'fa_path': file_path,
                'sha1': sha
            }
            if sha in file_b['shas']:
                format_dict['fb_path'] = file_b['shas'][sha]
                PRINT_SUC(MATCH_SUCCESS_STR.format(**format_dict))
                self.matches += 1
            else:
                PRINT_ERR(MATCH_FAILURE_STR.format(**format_dict))
                self.no_matches += 1
def main(arg1, arg2):
    '''
    builds a 'CompareSHA' from the two command line args, collects the
    checksums with 'get_shas' and prints the comparison with 'compare_shas'
    '''
    comparator = CompareSHA(arg1, arg2)
    comparator.get_shas()
    comparator.compare_shas()
if __name__ == '__main__':
    # exactly two positional args are expected after the script name
    if len(sys.argv) != 3:
        PRINT_ERR_BOLD('2 args must be passed: file1/path1 and file2/path2')
        # non-zero exit status so a calling shell/script can detect the
        # usage error instead of seeing a silent success
        sys.exit(2)
    main(sys.argv[1], sys.argv[2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment