Skip to content

Instantly share code, notes, and snippets.

@norrs
Created September 18, 2012 23:42
Show Gist options
  • Save norrs/3746757 to your computer and use it in GitHub Desktop.
Save norrs/3746757 to your computer and use it in GitHub Desktop.
#!/home/rockj/pysh/bin/python
"""Remove duplicate emails
Script that tries as well as it can to delete duplicates based
on the 'message-id' mail header and on comparing the mail bodies of
'message-id' duplicates.
This script does the following:
It loops through every mail file and adds it to a hash keyed on the
'message-id' header. If multiple files are found with the same header,
each one is appended to the list for that key.
After this process is done, we loop through the keys that have more
than one entry and do the following:
Fetch the first mail and use it as a base to compare against the others.
Comparing is done by extracting the mail bodies and checking each of the
other bodies for equality with the base body.
If a body is different, we skip deleting that file.
If they are the same, it's safe to delete the file you're comparing to.
author: Roy Sindre Norangshol <roy.sindre at norangshol .no>
LICENSE: public domain,
NB: This script contains cute shoulder dragons, in other words:
I take no responsibility for whatever damage
this script might cause blabalbal.
"""
import sys
import sh
from sh import find, ls, grep, cut, sed, awk, sha512sum, file, rm
# Root of the Maildir tree to scan.  It defaults to the original hard-coded
# location and can be overridden with the first command-line argument, e.g.
#   <script> /home/rockj/Maildir/.notice.norskenettbutikker/
my_dir = sys.argv[1] if len(sys.argv) > 1 else "/home/rockj/Maildir/"
# Every directory below my_dir whose name matches "cur" (case-insensitive).
maildirs = find(my_dir, "-type", "d", "-iname", "cur")
#import codecs
#import locale
#import sys
#locale.setlocale(locale.LC_ALL, '')
#lang, encoding = locale.getdefaultlocale()
#sys.stdout = codecs.getwriter(encoding)(sys.stdout)
def _get_body(file):
    """Return the body of the mail stored in *file* as unicode text.

    ``sed`` is asked to delete everything from line 1 through the first
    line that is empty or contains only spaces, which leaves the part
    after the header section.  The output is converted to unicode without
    an explicit encoding first; if that raises UnicodeDecodeError the
    command is re-run with ``_encoding='iso-8859-1'``.

    Returns None when no body could be obtained: either both decoding
    attempts failed, or ``sed`` exited with an error (for example because
    the file was renamed or removed while the scan was running).  Callers
    treat a None body as "cannot compare".
    """
    # Extra keyword arguments for each sed attempt, in order of preference.
    attempts = ({}, {'_encoding': 'iso-8859-1'})
    decode_error = None
    for extra in attempts:
        try:
            return unicode(sed('1,/^[ ]*$/d', file, **extra))
        except UnicodeDecodeError as err:
            # Remember the failure and fall through to the next encoding.
            decode_error = err
        except sh.ErrorReturnCode:
            # A failing sed must not abort the whole run; report the file
            # and let the caller skip it.
            print("[skip] Could not read body of {0}".format(file))
            return None
    print("Unicode decode error on: {0}".format(file))
    print(decode_error)
    return None
def _wipe(maildir, mfiles):
print "Working on %s" % maildir
check = {}
found = 0
for mfile in mfiles:
mfile = mfile.strip()
try:
message_ids = grep("^[Mm]essage-[iI][dD]", mfile)
for mid in message_ids:
mid = mid.strip()
if mid in check:
if not mfile in check[mid]:
check[mid].append(mfile)
else:
check[mid] = [mfile]
except sh.ErrorReturnCode_1:
print "[skip] No message id found for %s" % mfile
for mid,files in check.items():
if len(files)>1:
base_file = files[0].strip()
#tmp = p_encoding.match(unicode(file("--mime", base_file)))
base_body = _get_body(base_file)
for file_n in files[1:]:
file_n = file_n.strip()
file_body = _get_body(file_n)
if base_body and file_body:
if base_body == file_body:
print "Base {0} equals {1} wiping file".format(base_file, file_n)
found += 1
#rm("-f", file_n)
print "{0} wiped {1} duplicates".format(maildir, found)
return found
# Run _wipe over the regular files of every directory listed in maildirs
# and add up the per-directory counts it returns.
per_dir_counts = []
for maildir in (entry.strip() for entry in maildirs):
    mfiles = find(maildir, "-type", "f")
    per_dir_counts.append(_wipe(maildir, mfiles))
count_total = sum(per_dir_counts)
print("Wiped total of {0} emails".format(count_total))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment