Created
April 21, 2012 12:35
-
-
Save mjbommar/2436913 to your computer and use it in GitHub Desktop.
Generate AWS CloudSearch SDF from RFC822 email messages: Enron sample
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import codecs | |
import email | |
import email.parser | |
import glob | |
import json | |
import os | |
import os.path | |
import sys | |
def parseFile(fileName):
    '''
    Parse an RFC822 message file and return a JSON SDF "add" document dict.

    fileName is the path of the message file. It is also used to derive the
    document id: the 'data/maildir/' prefix is dropped, '/' and '-' become
    '_', and '.' is removed.

    The returned dict has the keys 'type', 'id', 'version', 'lang' and
    'fields'; 'fields' holds the 'date', 'subject', 'from' and 'to' headers
    plus the message payload under 'content'. A header that is absent from
    the message is stored as None.

    NOTE: for a multipart message get_payload() returns a list of message
    objects rather than a string, which json.dumps cannot serialize; this
    function passes the payload through unchanged.
    '''
    # Open and parse file. The with-block closes the handle as soon as the
    # message is parsed, so globbing a large maildir does not leak one open
    # file descriptor per message.
    parser = email.parser.Parser()
    with codecs.open(fileName, 'r') as messageFile:
        message = parser.parse(messageFile)

    # Derive the document id from the path.
    documentId = fileName.replace('data/maildir/', '')
    documentId = documentId.replace("/", "_").replace("-", "_").replace(".", "")

    # Build JSON data
    return {
        'type': 'add',
        'id': documentId,
        'version': 1,
        'lang': 'en',
        'fields': {
            'date': message['date'],
            'subject': message['subject'],
            'from': message['from'],
            'to': message['to'],
            'content': message.get_payload(),
        },
    }
if __name__ == "__main__":
    # Script entry point: expand the glob expression given as the single
    # command-line argument, parse every matching regular file with
    # parseFile, and print the resulting list of documents as indented JSON.

    # Parse single-argument command line.
    if len(sys.argv) != 2:
        # Trailing newline keeps the shell prompt off the usage line.
        sys.stderr.write("Usage: generateSDF.py <rfc822 path expression>\n")
        sys.exit(-1)

    # Build and parse file list; directories matched by the glob are skipped.
    fileExpression = sys.argv[1]
    fileList = [fileName for fileName in glob.glob(fileExpression) if os.path.isfile(fileName)]

    # A list comprehension (rather than map) yields a real list on both
    # Python 2 and Python 3; json.dumps cannot serialize a lazy map object.
    jsonDataList = [parseFile(fileName) for fileName in fileList]

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(json.dumps(jsonDataList, indent=4))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment