Skip to content

Instantly share code, notes, and snippets.

@ArjunRayGA
Last active January 21, 2018 05:12
Show Gist options
  • Select an option

  • Save ArjunRayGA/4475053bf248d8c762950072f2edaeee to your computer and use it in GitHub Desktop.

Select an option

Save ArjunRayGA/4475053bf248d8c762950072f2edaeee to your computer and use it in GitHub Desktop.
compare SHA checksum values between two files/dirs
#!/usr/bin/env python
'''
SHA FILE/TREE COMPARATOR
takes two args:
file/dir string 1
file/dir string 2
compares all files in both locations for SHA checksum matches.
// Only verified working on Darwin Kernel Version 17.3.0 //
'''
import subprocess
import sys
from termcolor import cprint
# module metadata
__author__ = "Arjun Ray"
__license__ = "MIT"
__version__ = "1.0.0"
__email__ = "deconstructionalism@gmail.com"

# report template for a checksum of "fa" that is also present in "fb";
# expects the format keys fa, fa_path, fb, fb_path and sha1
MATCH_SUCCESS_STR = (
    u'\u2713 "{fa}": {fa_path}\n'
    u'== (SHARED SHA: {sha1})\n'
    u'\u2713 "{fb}": {fb_path}\n'
)

# report template for a checksum of "fa" that is missing from "fb";
# expects the format keys fa, fa_path, fb and sha1
MATCH_FAILURE_STR = (
    u'\u2717 "{fa}": {fa_path}\n'
    u'not in ({fa} SHA: {sha1})\n'
    u'\u2717 "{fb}"\n'
)
# Coloured print helpers built on termcolor's 'cprint'. They are real
# functions rather than lambdas bound to names, so each one carries its own
# name in tracebacks and has a docstring.
def PRINT_ERR_BOLD(x):
    '''print 'x' in bold red'''
    return cprint(x, 'red', attrs=['bold'])


def PRINT_ERR(x):
    '''print 'x' in red'''
    return cprint(x, 'red')


def PRINT_SUC_BOLD(x):
    '''print 'x' in bold green'''
    return cprint(x, 'green', attrs=['bold'])


def PRINT_SUC(x):
    '''print 'x' in green'''
    return cprint(x, 'green')


def PRINT_BOLD(x):
    '''print 'x' in bold, without changing the colour'''
    return cprint(x, attrs=['bold'])
# modify these strings if you want to use a different find and sha shell method
# SHELL_FIND_STR is split on whitespace and '{arg}' is then filled in per token
# with one file/dir argument, so an argument containing spaces stays a single
# token. The stdout of that command is piped into the SHELL_SHA_STR command.
# NOTE(review): the './' prefix looks like it makes every argument relative to
# the current working directory - confirm before passing absolute paths.
SHELL_FIND_STR = 'find ./{arg} -type f -print0'
SHELL_SHA_STR = 'xargs -0 shasum'
class CompareSHA(object):
    '''
    Compares the SHA checksums of all files found under two file/dir args.

    "Public" methods are 'get_shas', which collects sha info for all files in
    both args, and 'compare_shas', which compares all sha checksums between
    all files and prints the result.

    Attributes:
        arg1, arg2: the two file/dir strings being compared
        files_dict: per-arg checksum data filled in by 'get_shas'
        matches: number of checksums found under both args
        no_matches: number of checksums found under only one of the args
    '''

    def __init__(self, arg1, arg2):
        '''
        stores both args and resets all result state

        raises ValueError if both args are the same
        '''
        if arg1 == arg2:
            raise ValueError('both file/dir args are the same')
        self.arg1 = arg1
        self.arg2 = arg2
        self.__sha_outputs = []
        self.files_dict = {}
        # a set, because membership is tested once per file while comparing
        self.__compared = set()
        self.matches = 0
        self.no_matches = 0

    def get_shas(self):
        '''
        get sha info for all files in both args, saved in 'self.files_dict'

        returns self so that calls can be chained
        '''
        self.__run_shas()
        self.__output_to_dict()
        return self

    def __run_shas(self):
        '''
        runs bash command in SHELL_FIND_STR, pipes result to SHELL_SHA_STR and
        captures the output in 'self.__sha_outputs' as one
        {'arg': ..., 'output': [lines]} dict per arg

        returns self
        '''
        # start fresh so a repeated call does not pile up stale output
        self.__sha_outputs = []
        for arg in (self.arg1, self.arg2):
            # format after splitting so an arg with spaces stays one token
            find_args = [el.format(arg=arg) for el in SHELL_FIND_STR.split()]
            find = subprocess.Popen(find_args, stdout=subprocess.PIPE)
            try:
                sha = subprocess.Popen(SHELL_SHA_STR.split(),
                                       stdin=find.stdout,
                                       stdout=subprocess.PIPE)
                # the sha process holds its own copy of the pipe; closing
                # ours lets 'find' get SIGPIPE if the sha process exits early
                find.stdout.close()
                raw_out = sha.communicate()[0]
            finally:
                # closing twice is harmless; waiting reaps the 'find' child
                find.stdout.close()
                find.wait()
            read_out = raw_out.decode("utf-8").strip().split('\n')
            self.__sha_outputs.append({
                'arg': arg,
                'output': read_out
            })
        return self

    def __output_to_dict(self):
        '''
        converts data in 'self.__sha_outputs' to a dict at 'self.files_dict' of
        format:
        {
            arg1: {
                'shas': {
                    'sha_val1': 'file_path1',
                    'sha_val2': 'file_path2',
                    ...
                    'sha_valn': 'file_pathn'
                },
                'num_files': [number of entries in 'shas' key]
            },
            arg2: {
                ""
            }
        }

        returns self

        raises ImportError (after emptying 'self.files_dict') if any output
        line for an arg starts with an empty sha value, which is what an
        empty command output looks like
        '''
        # count from 1 so the message matches the 'arg1'/'arg2' naming
        for position, output_dict in enumerate(self.__sha_outputs, 1):
            # each line is '<sha> <path>'; split on the first space only so
            # that paths containing spaces stay intact
            sha_output = [[el.strip() for el in line.split(' ', 1)]
                          for line in output_dict['output']]
            arg = output_dict['arg']
            has_files = all(item[0] != '' for item in sha_output)
            if not has_files:
                self.files_dict = {}
                raise ImportError('no files found at ARG{}: "{}"'
                                  .format(position, arg))
            # keyed by sha, so files with identical content share one entry
            shas = dict(sha_output)
            self.files_dict[arg] = {
                'shas': shas,
                'num_files': len(shas)
            }
        return self

    def compare_shas(self):
        '''
        for each sha/file entry in 'self.files_dict', will compare sha keys for
        matches. if a match is found, the paths are printed as matches along
        with the sha value and the sha value is removed from matching. matching
        starts by comparing all the files from the argument with more files,
        then compares the remaining files for the other argument's files.
        a summary with the match/no-match counts is printed at the end

        raises AssertionError if 'get_shas' has not filled 'self.files_dict'
        '''
        # raised explicitly so the check also runs under 'python -O'
        if not self.files_dict:
            raise AssertionError('you must run \'get_shas\' first')
        # largest file count first; equal counts fall back to the arg name
        args_by_len = sorted([[v['num_files'], k]
                              for k, v in self.files_dict.items()],
                             reverse=True)
        fa_name = args_by_len[0][1]
        fb_name = args_by_len[1][1]
        file_a = self.files_dict[fa_name]
        file_b = self.files_dict[fb_name]
        self.__compare_shas_per_files(file_a, file_b, fa_name, fb_name)
        # second pass in the other direction reports what only 'fb' has
        self.__compare_shas_per_files(file_b, file_a, fb_name, fa_name)
        PRINT_BOLD('SUMMARY\n{}'.format('-' * 20))
        PRINT_SUC_BOLD('{} MATCHES'.format(self.matches))
        if self.no_matches > 0:
            PRINT_ERR_BOLD('{} NO MATCHES'.format(self.no_matches))

    def __compare_shas_per_files(self, file_a, file_b, fa_name, fb_name):
        '''
        tries to match sha values in file_a to sha values in file_b, and prints
        match/no-match

        file_a and file_b are entries of 'self.files_dict'; fa_name and
        fb_name are the args they belong to. sha values that an earlier pass
        already handled are skipped, and nothing is printed at all when every
        sha value in file_a was already handled
        '''
        if set(file_a['shas']).issubset(self.__compared):
            return
        PRINT_BOLD('COMPARING FILES IN "{}" TO "{}"\n{}'
                   .format(fa_name, fb_name, '-' * 40))
        format_dict = {
            'fa': fa_name,
            'fb': fb_name
        }
        for sha, file_path in file_a['shas'].items():
            if sha in self.__compared:
                continue
            self.__compared.add(sha)
            format_dict.update({
                'fa_path': file_path,
                'sha1': sha
            })
            if sha in file_b['shas']:
                format_dict.update({
                    'fb_path': file_b['shas'][sha]
                })
                PRINT_SUC(MATCH_SUCCESS_STR.format(**format_dict))
                self.matches += 1
            else:
                PRINT_ERR(MATCH_FAILURE_STR.format(**format_dict))
                self.no_matches += 1
def main(arg1, arg2):
    '''
    builds a 'CompareSHA' from the two command line args, collects the
    checksums with 'get_shas' and reports the result with 'compare_shas'
    '''
    comparator = CompareSHA(arg1, arg2)
    comparator.get_shas()
    comparator.compare_shas()
# script entry point: expects exactly two file/dir args after the script name
if __name__ == '__main__':
    if len(sys.argv) != 3:
        PRINT_ERR_BOLD('2 args must be passed: file1/path1 and file2/path2')
        # exit non-zero so shells and scripts can detect the usage error
        sys.exit(2)
    main(sys.argv[1], sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment