Created
September 18, 2012 23:42
-
-
Save norrs/3746757 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/home/rockj/pysh/bin/python | |
"""Remove duplicate emails | |
Script that does its best to delete duplicate mails, based on the
'message-id' mail header and on diffing the mail bodies of
'message-id' duplicates.
This script does the following:
Loops through every mail file and adds it to a hash which is keyed on the
'message-id' header. If multiple files are found with the same header,
they are appended to the list for that key.
After this process is done, we loop through the keys that have more
than one entry and do the following:
Fetch the first mail and use it as the base to compare against the others.
Comparing is done by extracting the mail body and diffing it against
each of the other bodies.
If the bodies differ, we skip deleting that file.
If they are the same, it is safe to delete the file you are comparing to.
author: Roy Sindre Norangshol <roy.sindre at norangshol .no> | |
LICENSE: public domain, | |
NB: This script contains cute shoulder dragons, in other words: | |
I take no responsibility for whatever damage | |
this script might cause blabalbal. | |
""" | |
import sys | |
import sh | |
from sh import find, ls, grep, cut, sed, awk, sha512sum, file, rm | |
# Root directory that is searched for mail folders.
my_dir = "/home/rockj/Maildir/"
#my_dir = "/home/rockj/Maildir/.notice.norskenettbutikker/"

# Output of `find`: every directory below my_dir whose name matches "cur"
# (case-insensitively, because of -iname).
maildirs = find(
    my_dir,
    "-type", "d",
    "-iname", "cur",
)
#import codecs | |
#import locale | |
#import sys | |
#locale.setlocale(locale.LC_ALL, '') | |
#lang, encoding = locale.getdefaultlocale() | |
#sys.stdout = codecs.getwriter(encoding)(sys.stdout) | |
def _get_body(file):
    """Return the body of the mail stored in *file* as unicode text.

    The body is produced by running ``sed`` with a script that deletes
    everything from line 1 through the first line that is empty or holds
    only spaces, which removes the header section.

    The sed output is first converted with ``unicode()`` without an explicit
    encoding. If that raises UnicodeDecodeError, sed is run again with
    ``_encoding='iso-8859-1'``.

    Returns None when both attempts raise UnicodeDecodeError; in that case
    the file name and the error are printed. Any other exception (for
    example a failing sed command) is not handled here and propagates.
    """
    # NOTE: the parameter name shadows the `file` command imported from sh
    # at module level; it is kept unchanged so existing callers still work.
    strip_headers = '1,/^[ ]*$/d'

    try:
        return unicode(sed(strip_headers, file))
    except UnicodeDecodeError:
        # Fall through to the iso-8859-1 retry below.
        pass

    try:
        return unicode(sed(strip_headers, file, _encoding='iso-8859-1'))
    except UnicodeDecodeError as err:
        print("Unicode decode error on: {0}".format(file))
        print(err)
    return None
def _group_by_message_id(mfiles):
    """Map every matched Message-ID header line to the files containing it.

    *mfiles* is an iterable of file paths (surrounding whitespace is
    stripped). Each file is grepped for lines starting with "Message-ID"
    (the letters M, I and D may be upper or lower case). The stripped line
    is the dictionary key; the value is the list of files, in input order
    and without repeats, in which that exact line occurs.

    Files for which grep exits with status 1 (no match) are reported and
    skipped. Other grep failures are not handled and propagate.
    """
    groups = {}
    for mfile in mfiles:
        mfile = mfile.strip()
        if not mfile:
            # A blank entry is not a path; grepping it would only fail.
            continue
        try:
            # The iteration stays inside the try block, as in the original,
            # in case sh reports the exit status while the output is read.
            for mid in grep("^[Mm]essage-[iI][dD]", mfile):
                paths = groups.setdefault(mid.strip(), [])
                if mfile not in paths:
                    paths.append(mfile)
        except sh.ErrorReturnCode_1:
            print("[skip] No message id found for %s" % mfile)
    return groups


def _wipe(maildir, mfiles):
    """Find duplicate mails among *mfiles* and return how many were found.

    Files are grouped by their Message-ID header line. Within each group
    the first file serves as the base, and every other file whose body (as
    returned by ``_get_body``) is non-empty and equal to the base body is
    reported as a duplicate.

    A file is reported and counted at most once, even when it shares
    several matching header lines with its base, and a file already marked
    as a duplicate is never used as the base of another group.

    NOTE: the ``rm`` call is commented out, so nothing is deleted; the
    function only prints and counts what would be removed.

    *maildir* is used only in the progress messages.
    """
    print("Working on %s" % maildir)
    found = 0
    wiped = set()  # files already marked as duplicates during this call

    for files in _group_by_message_id(mfiles).values():
        candidates = [path for path in files if path not in wiped]
        if len(candidates) < 2:
            continue

        base_file = candidates[0]
        #tmp = p_encoding.match(unicode(file("--mime", base_file)))
        base_body = _get_body(base_file)
        if not base_body:
            # Without a usable base body nothing in this group can match.
            continue

        for file_n in candidates[1:]:
            if _get_body(file_n) != base_body:
                # Different body: keep the file.
                continue
            print("Base {0} equals {1} wiping file".format(base_file, file_n))
            found += 1
            wiped.add(file_n)
            #rm("-f", file_n)

    print("{0} wiped {1} duplicates".format(maildir, found))
    return found
# Process every directory listed in maildirs: collect its regular files,
# hand them to _wipe and add up the per-directory duplicate counts.
count_total = 0
for maildir in maildirs:
    maildir = maildir.strip()
    mfiles = find(maildir, "-type", "f")
    count_total = count_total + _wipe(maildir, mfiles)

print("Wiped total of {0} emails".format(count_total))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment