Last active
December 26, 2015 18:39
-
-
Save PeterDing/7195865 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2 | |
# vim: set fileencoding=utf8 | |
import os, sys, json, datetime | |
# Terminal colour template: fill with (ANSI colour code, text) to get the
# text wrapped in a bold colour escape sequence followed by a reset.
s = u'\x1b[1;%dm%s\x1b[0m'

# File extensions that are excluded from hashing (both letter cases listed,
# because the lookup is an exact, case-sensitive membership test).
types = (
    '.MKV', '.FLV', '.AVI', '.RMVB', '.MPG', '.WMV', '.RM',
    '.mkv', '.flv', '.avi', '.rmvb', '.mpg', '.wmv', '.rm',
)
class compare(object):
    """Group the files under a directory tree by MD5 digest to find duplicates.

    Digests are cached in ``~/.files_md5`` (a JSON object mapping file path
    to hex digest) so that files already seen are not hashed again.

    NOTE(review): cached digests are never invalidated here, so a file that
    is modified after it was first hashed keeps its old digest until its
    cache entry is removed.
    """

    def __init__(self, dir_):
        """Remember the directory to scan and load the digest cache.

        dir_ -- root directory to scan.  Cache keys are the paths produced
                by walking it, so pass an absolute path for stable keys.
        """
        self.dir_ = dir_
        self.data = {}    # digest -> list of display paths sharing that digest
        self.yeast = 0    # number of duplicate groups counted by report()
        self.files_md5 = self._load_cache()

    @staticmethod
    def _home_file(name):
        """Return the path of *name* inside the user's home directory."""
        return os.path.join(os.path.expanduser('~'), name)

    @staticmethod
    def _to_text(value):
        """Return *value* as text, decoding UTF-8 byte strings if needed."""
        if isinstance(value, bytes):
            return value.decode('utf8')
        return value

    def _load_cache(self):
        """Read ``~/.files_md5``; a missing or empty file yields ``{}``."""
        cache_path = self._home_file('.files_md5')
        if not os.path.exists(cache_path):
            return {}
        with open(cache_path) as f:
            raw = f.read()
        # Malformed JSON still raises ValueError: silently discarding a
        # user's cache would hide the corruption.
        return json.loads(raw) if raw.strip() else {}

    def get_md5(self, file_path):
        """Return the hex MD5 digest of *file_path* as text.

        Returns an empty string when the file cannot be opened or read.
        The file is hashed in-process, so shell metacharacters in the name
        (quotes, backticks, ``$``) are harmless.
        """
        import hashlib

        digest = hashlib.md5()
        try:
            with open(file_path, 'rb') as f:
                # Read in 1 MiB chunks so large files are not loaded whole.
                for chunk in iter(lambda: f.read(1 << 20), b''):
                    digest.update(chunk)
        except (IOError, OSError):
            return u''
        return u'%s' % digest.hexdigest()

    def report(self):
        """Write the duplicate groups to a timestamped file in the home dir.

        Every digest in ``self.data`` that maps to more than one path forms
        a group.  ``self.yeast`` is set to the number of groups written, and
        the path of the report file is returned.
        """
        stamp = str(datetime.datetime.now())
        report_path = self._home_file('duplicate_files_finder %s' % stamp)
        chunks = [
            u'## generating at %s\n' % stamp,
            u'## duplicate files at %s\n' % self._to_text(self.dir_),
            u'#####################################\n\n',
        ]
        groups = 0
        # Sorted by digest so that two runs over the same data produce the
        # same report.
        for digest in sorted(self.data):
            paths = self.data[digest]
            if len(paths) > 1:
                groups += 1
                chunks.append(
                    u'-' * 20 + u'\n' + u'\n'.join(paths) + u'\n\n')
        # Encode once and write bytes: identical on Python 2 and 3.
        with open(report_path, 'wb') as g:
            g.write(u''.join(chunks).encode('utf8'))
        # Assigned, not incremented, so calling report() twice does not
        # double the count.
        self.yeast = groups
        return self._to_text(report_path)

    def backup(self):
        """Save the digest cache back to ``~/.files_md5`` as indented JSON."""
        with open(self._home_file('.files_md5'), 'w') as g:
            g.write(json.dumps(self.files_md5, sort_keys=True, indent=4))

    def filter_type(self, file_path):
        """Return True when *file_path* must be excluded from hashing.

        A file is excluded when its name ends in ``~`` or when its
        extension is listed in the module-level ``types`` tuple.
        """
        if file_path.endswith('~'):
            return True
        # splitext() returns (root, ext); only the extension is compared.
        return os.path.splitext(file_path)[1] in types

    def analysis(self):
        """Walk ``self.dir_`` and fill ``self.data`` with digest groups.

        Symbolic links are ignored.  A usable cached digest is taken as-is;
        otherwise the file is hashed unless ``filter_type`` excludes it.
        Files that cannot be read are reported on stdout and skipped, so
        they are neither grouped nor cached.
        """
        root = self._to_text(self.dir_)
        prefix = root if root.endswith(u'/') else root + u'/'
        for parent, _subdirs, names in os.walk(self.dir_):
            for name in names:
                file_path = self._to_text(os.path.join(parent, name))
                if os.path.islink(file_path):
                    continue
                # An empty cached digest is treated as "not cached" so the
                # file gets hashed again.
                md5 = self.files_md5.get(file_path)
                if not md5:
                    if self.filter_type(file_path):
                        continue
                    md5 = self.get_md5(file_path)
                    if not md5:
                        print(u'%s %s' % (
                            s % (91, ' Warning'),
                            u': %s could not be read; skipped.' % file_path))
                        continue
                    self.files_md5[file_path] = md5
                # Shorten only the leading root prefix; a later occurrence
                # of the same text inside the path is left alone.
                shown = file_path
                if file_path.startswith(prefix):
                    shown = u'./' + file_path[len(prefix):]
                self.data.setdefault(md5, []).append(shown)
def main(argv):
    """Scan the directory named in ``argv[1]`` and report duplicate files.

    argv -- full argument vector, program name first (as in ``sys.argv``).

    Returns the process exit status: 0 on success, 2 when the directory
    argument is missing or does not name a directory.
    """
    if len(argv) < 2:
        prog = os.path.basename(argv[0]) if argv else 'duplicate_files_finder'
        sys.stderr.write('usage: %s DIRECTORY\n' % prog)
        return 2
    dir_ = os.path.abspath(argv[1])
    if not os.path.isdir(dir_):
        sys.stderr.write('%s is not a directory\n' % dir_)
        return 2

    # Make sure the digest cache exists and holds valid JSON before the
    # scanner reads it.
    cache_path = os.path.join(os.path.expanduser('~'), '.files_md5')
    if not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0:
        with open(cache_path, 'w') as g:
            g.write(json.dumps({}))

    x = compare(dir_)
    x.analysis()
    # report() may return the path it wrote; fall back to the generic
    # location when it returns nothing.
    report_path = x.report() or u'~/duplicate_files_finder'
    x.backup()

    print(u' '.join([
        s % (92, ' finding'),
        s % (91, '%d' % x.yeast),
        s % (92, 'teams where each file has same md5 value.'),
    ]))
    print(s % (94, u' see report at %s' % report_path))
    print(s % (92, ' # over'))
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment