Skip to content

Instantly share code, notes, and snippets.

@mjbommar
Created April 21, 2012 12:35
Show Gist options
  • Save mjbommar/2436913 to your computer and use it in GitHub Desktop.
Save mjbommar/2436913 to your computer and use it in GitHub Desktop.
Generate AWS CloudSearch SDF from RFC822 email messages: Enron sample
import codecs
import email
import email.parser
import glob
import json
import os
import os.path
import sys
def parseFile(fileName):
    '''
    Parse an RFC 822 message file and return a JSON SDF dict.

    fileName is the path of the message file to read.

    The returned dict is an SDF "add" operation with these keys:
      type    -- always 'add'
      id      -- fileName with every 'data/maildir/' occurrence removed,
                 '/' and '-' replaced by '_', and '.' removed
      version -- always 1
      lang    -- always 'en'
      fields  -- dict holding the 'date', 'subject', 'from' and 'to'
                 headers (None when a header is absent) and 'content'

    For a non-multipart message 'content' is the payload exactly as
    returned by get_payload().  For a multipart message it is the payloads
    of all text/* leaf parts joined with newlines; non-text parts are
    omitted.
    '''
    # Open and parse file.  The with-block closes the handle even when
    # parsing raises, so long file lists do not leak descriptors.
    parser = email.parser.Parser()
    with open(fileName, 'r') as messageFile:
        message = parser.parse(messageFile)

    # get_payload() on a multipart message returns a list of Message
    # objects, which json.dumps cannot serialize; flatten the text leaves
    # into a single string instead.
    if message.is_multipart():
        content = '\n'.join(part.get_payload()
                            for part in message.walk()
                            if not part.is_multipart()
                            and part.get_content_maintype() == 'text')
    else:
        content = message.get_payload()

    # Derive the document id from the file path.
    documentId = fileName.replace('data/maildir/', '')
    documentId = documentId.replace("/", "_").replace("-", "_").replace(".", "")

    # Build JSON data
    jsonData = {}
    jsonData['type'] = 'add'
    jsonData['id'] = documentId
    jsonData['version'] = 1
    jsonData['lang'] = 'en'
    jsonData['fields'] = {'date': message['date'],
                          'subject': message['subject'],
                          'from': message['from'],
                          'to': message['to'],
                          'content': content}
    return jsonData
if __name__ == "__main__":
    # Parse single-argument command line.
    if len(sys.argv) != 2:
        # Trailing newline keeps the shell prompt off the usage line.
        sys.stderr.write("Usage: generateSDF.py <rfc822 path expression>\n")
        sys.exit(-1)

    # Build the file list: expand the glob expression and keep regular
    # files only (directories matched by the pattern are skipped).
    fileExpression = sys.argv[1]
    fileList = [fileName for fileName in glob.glob(fileExpression)
                if os.path.isfile(fileName)]

    # Parse every file into a concrete list; a lazy map() object is not
    # JSON-serializable on Python 3.
    jsonDataList = [parseFile(fileName) for fileName in fileList]

    # Emit the whole SDF batch as one JSON array on stdout.  Writing to
    # sys.stdout works on both Python 2 and 3, unlike the print statement.
    sys.stdout.write(json.dumps(jsonDataList, indent=4) + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment