Skip to content

Instantly share code, notes, and snippets.

@faried
Created February 9, 2011 21:37
Show Gist options
  • Save faried/819350 to your computer and use it in GitHub Desktop.
Save faried/819350 to your computer and use it in GitHub Desktop.
Parse out a few headers, store doc in db and then attach email attachments.
#!/usr/bin/env python
"""Store messages in a Maildir into a couchdb database."""
import couchdb
from email.header import decode_header
from email.utils import parseaddr, getaddresses
import logging
from mailbox import Maildir
from optparse import OptionParser
import os
from pprint import pprint
import sys
# from uuid import uuid4
__author__ = 'Faried Nawaz'
__email__ = '[email protected]'
__license__ = 'Modified BSD License'
__status__ = 'Prototype'
def utf8decode(thing):
    """Convert a byte string to Unicode, guessing at the charset.

    UTF-8 is tried first; if that fails the bytes are decoded as
    ISO-8859-15 instead.  A value that is already Unicode text is
    returned unchanged.

    TODO: try to decode using the message's charset instead
    TODO: of blindly trying utf-8/iso-8859-15.

    Logs an error and exits the process with status 1 if neither
    decoding succeeds.
    """
    if isinstance(thing, type(u'')):
        # Already text.  Under Python 2, calling .decode() on a unicode
        # object first encodes it as ASCII, which raises
        # UnicodeEncodeError for any non-ASCII character.
        return thing

    try:
        return thing.decode('utf-8')
    except UnicodeDecodeError:
        # many badly created messages will be like this
        pass

    try:
        return thing.decode('iso-8859-15')
    except UnicodeDecodeError as exc:
        # NOTE(review): this has never been seen to fire, most likely
        # because the iso-8859-15 codec assigns a character to every
        # byte value -- confirm before relying on this branch.
        logging.error('failed to decode %s: %s', thing, str(exc))
        sys.exit(1)
def rfc2047decode(name):
    """Decode a header value that may contain RFC 2047 encoded words.

    Use as

        subject = rfc2047decode(message.get('Subject', ''))

    Details at http://tools.ietf.org/html/rfc2047.html

    Each piece returned by decode_header() is decoded with the charset
    it declares.  Pieces without a charset, and pieces whose declared
    charset is unknown or does not match their bytes, are passed to
    utf8decode() instead.

    Note: does not preserve whitespace.  The decoded pieces are joined
    with a single space, so for example

        Re: [Loungers] =?utf-8?q?Korea?=

    decodes to

        Re: [Loungers] Korea

    TODO: check if this behavior is appropriate.

    An empty or None value is returned unchanged.
    """
    if not name:
        return name

    pieces = []
    for raw, charset in decode_header(name):
        if not charset:
            pieces.append(utf8decode(raw))
            continue
        try:
            pieces.append(raw.decode(charset))
        except (UnicodeDecodeError, LookupError) as exc:
            # LookupError: the message names a charset Python has no
            # codec for; treat it like any other undecodable piece.
            logging.warning('unable to decode %s as %s: %s',
                            raw, charset, str(exc))
            pieces.append(utf8decode(raw))
    return u' '.join(pieces)
def getheaders(message):
    """Collect all headers of *message* into a dict of Unicode values.

    Returns a dict that maps each header name, spelled exactly as it
    appears in the message, to the list of that header's values in
    order of appearance.  Every value is passed through utf8decode().
    """
    rawheaders = {}
    # items() is the public accessor for the (name, value) header pairs.
    for name, value in message.items():
        rawheaders.setdefault(name, []).append(utf8decode(value))
    return rawheaders
def putattachments(database, doc, message):
    """Decode and attach mail attachments to the document.

    Walks every part of *message* and uploads each non-multipart part
    to *doc* through database.put_attachment().  A part without a
    filename is named 'attach-NNNNNNNN' after its running index.

    If the server rejects an attachment, the document is deleted from
    the database again, the failure is logged, and the process exits
    with status 1.
    """
    count = 0
    for part in message.walk():
        if part.get_content_maintype() == 'multipart':
            # Skip the container itself; walk() visits its sub-parts.
            continue

        ctype = part.get_content_type() or 'application/binary'
        fname = part.get_filename() or 'attach-%08d' % count
        fname = fname.replace('\\', '.')
        # NOTE(review): presumably CouchDb refuses attachment names
        # that begin with these characters -- confirm.
        if fname.startswith(('_', '.', '/')):
            fname = 'FIXED' + fname
        content = part.get_payload(decode=True) or ''

        try:
            database.put_attachment(doc, content,
                                    filename=utf8decode(fname),
                                    content_type=utf8decode(ctype))
        except couchdb.client.ServerError as exc:
            # Remember the name before removing the half-imported doc.
            filename = doc['filename']
            del database[doc['_id']]
            print('%s %s %d' % (ctype, fname, len(content)))
            logging.error('cannot attach %s (%s) to %s: %s',
                          fname, ctype, filename, str(exc))
            sys.exit(1)

        count += 1
def _builddoc(fname, message):
    """Build the dict stored in CouchDb for one maildir message."""
    # from and subject headers can be encoded using RFC 2047
    # TODO: what if there's more than one From header?
    # TODO: similarly with Subject.
    fromname, fromaddr = parseaddr(message.get('From', ''))

    # message metadata
    data = {'type': 'email',
            'rawheaders': getheaders(message),
            'filename': fname,
            'from': {'name': rfc2047decode(fromname), 'addr': fromaddr},
            'subject': rfc2047decode(message.get('Subject', '')),
            'message-id': utf8decode(message.get('message-id', '')),
            'date': utf8decode(message.get('date', '')),
            'flags': message.get_flags(),
            # TODO: set folder names based on maildir names perhaps
            'folders': ['INBOX']}

    for header in ('delivered-to', 'in-reply-to', 'references'):
        data[header] = [utf8decode(val)
                        for val in message.get_all(header, [])]

    for header in ('to', 'cc', 'bcc'):
        # getaddresses() yields (realname, address) pairs, the same
        # order parseaddr() uses for the From header above.
        recipients = getaddresses(message.get_all(header, []))
        data[header] = [{'name': rfc2047decode(realname), 'addr': address}
                        for realname, address in recipients]

    return data


def insertmessages(database, maildir, verbose):
    """Put each maildir message in CouchDb.

    The maildir key of a message is used as its document id.  Messages
    whose key is already present in *database* are skipped with a
    warning.  For every other message a metadata document is stored and
    its parts are uploaded with putattachments().

    When *verbose* is true, each message is logged as it is parsed and
    inserted.  Totals are logged at the end.  A server error while
    storing a document dumps the document, logs the error and exits the
    process with status 1.
    """
    inserted = 0
    ignored = 0
    count = 0

    for fname, message in maildir.iteritems():
        if verbose:
            logging.info('parsing %s', fname)
        count += 1

        # is it already in there?
        if database.get(fname):
            logging.warning('%s already in db; skipping', fname)
            ignored += 1
            continue

        data = _builddoc(fname, message)
        doc_id = fname
        try:
            database[doc_id] = data
        except couchdb.client.ServerError as exc:
            pprint(data)
            logging.error('insertion exception for %s:, %s', fname, str(exc))
            sys.exit(1)
        else:
            inserted += 1
            if verbose:
                logging.info('inserted %s', fname)
            # Re-read the stored document and hand that copy to
            # putattachments().
            doc = database[doc_id]
            putattachments(database, doc, message)

        if count % 100 == 0:
            logging.info('messages processed: %d', count)

    logging.info('inserted messages: %d ignored: %d', inserted, ignored)
def main():
    """Parse the command line, open the maildir and database, and import.

    Usage errors (no database name, inaccessible maildir) are reported
    through the option parser.  Failure to reach the server or to open
    the named database is logged and ends the process with status 1.
    """
    home = os.environ.get('HOME', os.getcwd())

    parser = OptionParser('usage: %prog -d dbname [other options]')
    parser.add_option('-v', '--verbose', dest='verbose', action='store_true',
                      help='enable loquacious mode. like, duh.')
    parser.add_option('-d', '--database', dest='database',
                      help='mail database in CouchDb [required]')
    parser.add_option('-s', '--server', dest='server',
                      help='CouchDb URI [default: %default]',
                      default='http://localhost:5984/')
    parser.add_option('-m', '--maildir', dest='maildir',
                      help='path to the maildir to import [default: %default]',
                      default='%s/Maildir/' % home)
    (opts, _args) = parser.parse_args()

    if not opts.database:
        parser.error('a database name is required.')

    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s')

    # Both the directory and its 'new' subdirectory must exist.
    newdir = os.path.join(opts.maildir, 'new')
    if not (os.access(opts.maildir, os.F_OK) and
            os.access(newdir, os.F_OK)):
        parser.error('cannot access maildir %s' % opts.maildir)

    maildir = Maildir(opts.maildir, factory=None)
    server = couchdb.Server(opts.server)

    try:
        # NOTE(review): reading the attribute is used as a connection
        # probe, and AttributeError is treated as "cannot connect".
        # Confirm this matches the installed couchdb-python release.
        _version = server.version
    except AttributeError:
        logging.error('cannot connect to %s', opts.server)
        sys.exit(1)

    try:
        database = server[opts.database]
    except couchdb.client.ResourceNotFound:
        logging.error('invalid database name: %s', opts.database)
        sys.exit(1)

    logging.info('starting...')
    insertmessages(database, maildir, opts.verbose)
# Run the import only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# eof
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment