Created
February 9, 2011 21:37
-
-
Save faried/819350 to your computer and use it in GitHub Desktop.
Parse out a few headers, store doc in db and then attach email attachments.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| """Store messages in a Maildir into a couchdb database.""" | |
| import couchdb | |
| from email.header import decode_header | |
| from email.utils import parseaddr, getaddresses | |
| import logging | |
| from mailbox import Maildir | |
| from optparse import OptionParser | |
| import os | |
| from pprint import pprint | |
| import sys | |
| # from uuid import uuid4 | |
| __author__ = 'Faried Nawaz' | |
| __email__ = '[email protected]' | |
| __license__ = 'Modified BSD License' | |
| __status__ = 'Prototype' | |
def utf8decode(thing):
    """Convert a byte string to Unicode text.

    UTF-8 is tried first; input that is not valid UTF-8 is decoded as
    ISO-8859-15 instead.  A value that is already Unicode text is returned
    unchanged instead of being pushed through ``decode`` a second time.

    TODO: try to decode using the message's charset instead
    TODO: of blindly trying utf-8/iso-8859-15.

    Args:
        thing: byte string (or already-decoded text) to convert.

    Returns:
        The value as Unicode text.

    Raises:
        SystemExit: if neither codec can decode the bytes.
    """
    if isinstance(thing, type(u'')):
        # Already text.  Calling .decode() on it would implicitly encode
        # with the default codec first (Python 2) or fail outright because
        # text has no .decode() (Python 3).
        return thing
    try:
        return thing.decode('utf-8')
    except UnicodeDecodeError:
        # many badly created messages will be like this
        try:
            return thing.decode('iso-8859-15')
        except UnicodeDecodeError as exc:
            # NOTE(review): CPython's iso-8859-15 codec appears to map every
            # byte value, so this branch is not expected to fire; it is kept
            # as a safety net.
            logging.error('failed to decode %s: %s', thing, str(exc))
            sys.exit(1)
def rfc2047decode(name):
    """Decode a header value that may contain RFC 2047 encoded words.

    Use as

        subject = rfc2047decode(message.get('Subject', ''))

    Details at http://tools.ietf.org/html/rfc2047.html

    The value is split with ``email.header.decode_header``; every chunk is
    decoded with its declared charset and the chunks are joined with a
    single space.  A chunk without a charset, or one whose declared charset
    is unknown or does not match its bytes, is decoded with ``utf8decode``.

    Note: whitespace between chunks is not preserved exactly, because the
    chunks are always re-joined with one space.
    TODO: check if this behavior is appropriate.

    Args:
        name: raw header value; may be empty or None.

    Returns:
        The decoded Unicode text, or *name* itself when it is empty/None.
    """
    if not name:
        return name

    pieces = []
    for raw, charset in decode_header(name):
        if not charset:
            pieces.append(utf8decode(raw))
            continue
        try:
            pieces.append(raw.decode(charset))
        except (UnicodeDecodeError, LookupError) as exc:
            # LookupError: the header names a charset Python does not know.
            # Treat it like undecodable bytes instead of aborting the run.
            logging.warning('unable to decode %s as %s: %s',
                            raw, charset, str(exc))
            pieces.append(utf8decode(raw))
    return u' '.join(pieces)
def getheaders(message):
    """Collect every header of *message* into a dict of decoded values.

    Keys are the header names exactly as stored on the message.  Each value
    is a list with one ``utf8decode``-d entry per occurrence of that header,
    in the order the headers appear, so repeated headers are all kept.
    """
    collected = {}
    for name, value in message._headers:
        collected.setdefault(name, []).append(utf8decode(value))
    return collected
def putattachments(database, doc, message):
    """Decode and attach mail attachments to the document.

    Every non-multipart part of *message* is uploaded to *database* as an
    attachment of *doc*.  Parts without a file name are named
    ``attach-NNNNNNNN`` from a running counter.  Backslashes in a name are
    replaced by dots, and a name starting with ``_``, ``.`` or ``/`` gets a
    ``FIXED`` prefix (presumably to keep the name acceptable to CouchDB --
    TODO confirm).

    Args:
        database: CouchDB database object the document lives in.
        doc: the stored document; its ``_id`` and ``filename`` keys are read
            when an upload fails.
        message: the parsed mail message whose parts are walked.

    Raises:
        SystemExit: if the server rejects an attachment.  The document is
            deleted from the database before exiting.
    """
    count = 0
    for part in message.walk():
        if part.get_content_maintype() == 'multipart':
            # Containers only group other parts; nothing to upload.
            continue

        ctype = part.get_content_type() or 'application/binary'
        fname = part.get_filename() or 'attach-%08d' % count
        fname = fname.replace('\\', '.')
        if fname.startswith(('_', '.', '/')):
            fname = 'FIXED' + fname
        content = part.get_payload(decode=True) or b''

        try:
            database.put_attachment(doc, content,
                                    filename=utf8decode(fname),
                                    content_type=utf8decode(ctype))
        except couchdb.client.ServerError as exc:
            filename = doc['filename']
            # Remove the half-imported document before bailing out.
            del database[doc['_id']]
            # Single formatted string: same output as the old print
            # statement, but valid on both Python 2 and Python 3.
            print('%s %s %s' % (ctype, fname, len(content)))
            logging.error('cannot attach %s (%s) to %s: %s',
                          fname, ctype, filename, str(exc))
            sys.exit(1)
        count += 1
def insertmessages(database, maildir, verbose):
    """Put each maildir message in CouchDb.

    For every ``(key, message)`` pair in *maildir* a metadata document is
    stored under the maildir key, then the message parts are uploaded with
    ``putattachments``.  Keys that already exist in *database* are skipped.
    Progress is logged every 100 messages and a summary is logged at the end.

    Args:
        database: CouchDB database object (mapping-style access is used).
        maildir: mailbox object providing ``iteritems()``.
        verbose: when true, log each message as it is parsed and inserted.

    Raises:
        SystemExit: if the server rejects a document.
    """
    inserted = 0
    ignored = 0
    count = 0

    for fname, message in maildir.iteritems():
        if verbose:
            logging.info('parsing %s', fname)
        count += 1

        # is it already in there?
        if database.get(fname):
            logging.warning('%s already in db; skipping', fname)
            ignored += 1
            continue

        # from and subject headers can be encoded using RFC 2047
        # TODO: what if there's more than one From header?
        # TODO: similarly with Subject.
        fromname, fromaddr = parseaddr(message.get('From', ''))
        fromname = rfc2047decode(fromname)
        subject = rfc2047decode(message.get('Subject', ''))

        # message metadata
        data = {'type': 'email',
                'rawheaders': getheaders(message),
                'filename': fname,
                'from': {'name': fromname, 'addr': fromaddr},
                'subject': subject,
                'message-id': utf8decode(message.get('message-id', '')),
                'date': utf8decode(message.get('date', '')),
                'flags': message.get_flags(),
                # TODO: set folder names based on maildir names perhaps
                'folders': ['INBOX']}

        for header in ('delivered-to', 'in-reply-to', 'references'):
            data[header] = [utf8decode(val)
                            for val in message.get_all(header, [])]

        for header in ('to', 'cc', 'bcc'):
            # getaddresses() yields (realname, address) pairs, the same
            # order parseaddr() uses for the From header above.
            recipients = getaddresses(message.get_all(header, []))
            data[header] = [{'addr': addr, 'name': rfc2047decode(realname)}
                            for realname, addr in recipients]

        # The maildir key doubles as the document id.
        doc_id = fname
        try:
            database[doc_id] = data
        except couchdb.client.ServerError as exc:
            pprint(data)
            logging.error('insertion exception for %s:, %s', fname, str(exc))
            sys.exit(1)

        inserted += 1
        if verbose:
            logging.info('inserted %s', fname)
        # Re-read the stored document before attaching to it.
        doc = database[doc_id]
        putattachments(database, doc, message)

        if count % 100 == 0:
            logging.info('messages processed: %d', count)

    logging.info('inserted messages: %d ignored: %d', inserted, ignored)
def main():
    """Parse the command line, open the maildir and database, run the import.

    Exits through ``parser.error`` when the database name is missing or the
    maildir (or its ``new`` subdirectory) cannot be accessed, and with
    status 1 when the server or the named database cannot be reached.
    """
    default_maildir = '%s/Maildir/' % os.environ.get('HOME', os.getcwd())

    parser = OptionParser('usage: %prog -d dbname [other options]')
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose',
                      help='enable loquacious mode. like, duh.')
    parser.add_option('-d', '--database', dest='database',
                      help='mail database in CouchDb [required]')
    parser.add_option('-s', '--server', dest='server',
                      default='http://localhost:5984/',
                      help='CouchDb URI [default: %default]')
    parser.add_option('-m', '--maildir', dest='maildir',
                      default=default_maildir,
                      help='path to the maildir to import [default: %default]')
    opts, _ = parser.parse_args()

    if not opts.database:
        parser.error('a database name is required.')

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG)

    # A usable maildir must exist and contain a "new" subdirectory.
    if (not os.access(opts.maildir, os.F_OK) or
            not os.access(os.sep.join([opts.maildir, 'new']), os.F_OK)):
        parser.error('cannot access maildir %s' % opts.maildir)

    maildir = Maildir(opts.maildir, factory=None)
    server = couchdb.Server(opts.server)

    # Probe the server by reading its version attribute.
    try:
        _ = server.version
    except AttributeError:
        logging.error('cannot connect to %s', opts.server)
        sys.exit(1)

    try:
        database = server[opts.database]
    except couchdb.client.ResourceNotFound:
        logging.error('invalid database name: %s', opts.database)
        sys.exit(1)

    logging.info('starting...')
    insertmessages(database, maildir, opts.verbose)
# Run the importer only when this file is executed as a script.
if __name__ == '__main__':
    main()
| # eof |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment