Skip to content

Instantly share code, notes, and snippets.

@PeterDing
Last active December 26, 2015 18:39
Show Gist options
  • Save PeterDing/7195865 to your computer and use it in GitHub Desktop.
Save PeterDing/7195865 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python2
# vim: set fileencoding=utf8
import os, sys, json, datetime
# ANSI escape template used for colored status messages: s % (color_code, text).
s = u'\x1b[1;%dm%s\x1b[0m' # terminal color template
# File extensions (upper- and lower-case spellings) that compare.filter_type()
# checks a path's extension against.
types = ('.MKV', '.FLV', '.AVI', '.RMVB', '.MPG', '.WMV', '.RM', '.mkv', '.flv', '.avi', '.rmvb', '.mpg', '.wmv', '.rm')
class compare(object):
    """Find files with identical MD5 digests below a directory.

    Digests are cached between runs in ``~/.files_md5``, a JSON object that
    maps file path -> hex digest.

    NOTE: a cached digest is reused whenever the path is present in the
    cache; the file is not re-hashed, so changes to an already cached file
    are not detected.

    Attributes:
        dir_: directory to scan, as passed to the constructor.
        data: dict mapping digest -> list of './'-relative paths having it.
        yeast: number of duplicate groups counted by the last report().
        files_md5: the digest cache (path -> digest).
    """

    def __init__(self, dir_):
        """Remember the directory to scan and load the digest cache."""
        self.dir_ = dir_
        self.data = {}
        self.yeast = 0
        self.files_md5 = self._load_cache()

    @staticmethod
    def _cache_path():
        """Return the location of the persistent digest cache."""
        return os.path.join(os.path.expanduser('~'), '.files_md5')

    @staticmethod
    def _to_unicode(value):
        """Decode a UTF-8 byte string; text passes through unchanged."""
        if isinstance(value, bytes):
            return value.decode('utf8')
        return value

    def _load_cache(self):
        """Read the digest cache; a missing or empty file gives ``{}``."""
        cache_path = self._cache_path()
        if not os.path.exists(cache_path):
            return {}
        # Use a context manager so the handle is closed deterministically.
        with open(cache_path) as f:
            content = f.read()
        if not content.strip():
            return {}
        return json.loads(content)

    def get_md5(self, file_path):
        """Return the hex MD5 digest of a file's content.

        The file is hashed in-process instead of through a shell command, so
        names containing quotes, backticks or ``$`` need no escaping.

        Args:
            file_path: path of the file; text is encoded as UTF-8.

        Returns:
            The digest as text, or ``u''`` when the file cannot be read.
        """
        # Function-scope import: hashlib is not imported at module level.
        import hashlib

        path = file_path
        if not isinstance(path, bytes):
            path = path.encode('utf8')
        digest = hashlib.md5()
        try:
            with open(path, 'rb') as f:
                # Read in 1 MiB chunks so large files are never held in memory.
                for chunk in iter(lambda: f.read(1 << 20), b''):
                    digest.update(chunk)
        except (IOError, OSError):
            return u''
        return self._to_unicode(digest.hexdigest())

    def report(self):
        """Write the duplicate groups to a timestamped file in the home dir.

        Every digest in ``self.data`` that has more than one path becomes one
        group in the report. ``self.yeast`` is set to the number of groups.
        """
        # One timestamp for both the file name and the header line.
        now = str(datetime.datetime.now())
        report_path = os.path.join(os.path.expanduser('~'), 'duplicate_files_finder %s' % now)
        chunks = [
            u'## generating at %s\n' % now,
            u'## duplicate files at %s\n' % self._to_unicode(self.dir_),
            u'#####################################\n\n',
        ]
        # Restart the count so calling report() twice does not double it.
        self.yeast = 0
        for paths in self.data.values():
            if len(paths) > 1:
                self.yeast += 1
                chunks.append(u'-' * 20 + u'\n' + u'\n'.join(paths) + u'\n\n')
        # Encode explicitly and write bytes so the output is always UTF-8.
        with open(report_path, 'wb') as g:
            g.write(u''.join(chunks).encode('utf8'))

    def backup(self):
        """Persist the digest cache to ``~/.files_md5`` as indented JSON."""
        with open(self._cache_path(), 'w') as g:
            g.write(json.dumps(self.files_md5, sort_keys=True, indent=4))

    def filter_type(self, file_path):
        """Tell whether a file should be left out of hashing.

        Returns:
            True when the path ends with ``~`` or its extension is listed in
            the module-level ``types`` tuple, otherwise False.
        """
        # splitext() returns (root, ext); only the extension is compared.
        extension = os.path.splitext(file_path)[1]
        return file_path.endswith('~') or extension in types

    def analysis(self):
        """Walk ``self.dir_`` and group the files found by MD5 digest.

        Symbolic links are ignored. A usable cached digest is taken as is;
        otherwise the file is hashed unless filter_type() excludes it. Files
        that cannot be hashed are reported on stdout and skipped, so they are
        neither grouped together nor written to the cache.
        """
        root = self._to_unicode(self.dir_)
        for parent, _subdirs, names in os.walk(self.dir_):
            for name in names:
                raw_path = os.path.join(parent, name)
                if os.path.islink(raw_path):
                    continue
                file_path = self._to_unicode(raw_path)
                # An empty cached digest (left by a failed hash) is a miss.
                md5 = self.files_md5.get(file_path)
                if not md5:
                    if self.filter_type(file_path):
                        continue
                    md5 = self.get_md5(file_path)
                    if not md5:
                        # An empty digest means hashing failed, not that the
                        # file is empty, hence the corrected wording.
                        print(u' '.join([s % (91, ' Warning'), u': %s could not be read; skipped.' % file_path]))
                        continue
                    self.files_md5[file_path] = md5
                # relpath() only strips the leading directory; a plain
                # str.replace() would also rewrite later occurrences.
                relative = os.path.join(u'.', os.path.relpath(file_path, root))
                self.data.setdefault(md5, []).append(relative)
if __name__ == '__main__':
    # The directory to scan is the first command-line argument.
    if len(sys.argv) < 2:
        sys.exit('usage: %s DIRECTORY' % sys.argv[0])
    dir_ = os.path.abspath(sys.argv[1])
    if not os.path.isdir(dir_):
        sys.exit('%s is not a directory' % dir_)

    # compare() parses ~/.files_md5 as JSON when it is constructed, so the
    # cache is (re)initialised to an empty object if it is missing or empty.
    cache_path = os.path.join(os.path.expanduser('~'), '.files_md5')
    if not os.path.exists(cache_path) or os.path.getsize(cache_path) == 0:
        with open(cache_path, 'w') as g:
            g.write(json.dumps({}))

    x = compare(dir_)
    x.analysis()
    x.report()
    x.backup()

    # Each print takes one parenthesized expression; the pieces are joined
    # with a single space, which matches what a comma-separated print emits.
    print(u' '.join([
        s % (92, ' finding'),
        s % (91, '%d' % x.yeast),
        s % (92, 'teams where each file has same md5 value.'),
    ]))
    print(s % (94, ' see report at ~/duplicate_files_finder'))
    print(s % (92, ' # over'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment