Created
September 9, 2018 11:40
-
-
Save afm-sayem/b3904def820a09edb255e2efcb1c2a5a to your computer and use it in GitHub Desktop.
Parse emails from text files and separate out their key components (date, sender, recipient, subject, body).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, re, email, sqlite3 | |
# Open (or create) the SQLite database in the current working directory and
# keep one module-level connection/cursor pair for the rest of the script.
conn = sqlite3.connect('mails.db')
c = conn.cursor()

# One row per stored message body.  IF NOT EXISTS keeps the script re-runnable:
# a plain CREATE TABLE raises sqlite3.OperationalError once mails.db already
# contains the table from an earlier run.
c.execute('''CREATE TABLE IF NOT EXISTS emails
             (date text, frm text, recipient text, subject text, body text)''')
def cleanup(message):
    """Strip HTML tags from *message* and normalise non-breaking spaces.

    Args:
        message: Decoded text of an e-mail body, possibly containing HTML
            markup.

    Returns:
        The text with every ``<...>`` tag removed and each non-breaking
        space (U+00A0) replaced by an ordinary space.
    """
    # Non-greedy, and "<" is excluded from the tag body, so each tag is
    # removed on its own and a lone "<" in running text is left untouched.
    without_tags = re.sub(r'<[^<]+?>', '', message)
    # Written as an escape so the character being replaced is visible.
    return without_tags.replace('\xa0', ' ')
def writedata(mail, body, table):
    """Insert one message into the ``emails`` table and commit it.

    Args:
        mail: Parsed ``email.message.Message`` whose headers supply the
            date, sender, recipient and subject columns.  A header that is
            absent is stored as NULL (``Message.get`` returns ``None``).
        body: Text to store in the ``body`` column.
        table: ``sqlite3.Cursor`` on the database that holds ``emails``.
    """
    date = mail.get('Date')
    frm = mail.get('From')
    # The recipient column holds the addressee, i.e. the "To" header;
    # "In-Reply-To" carries the message-id of the parent message instead.
    to = mail.get('To')
    subject = mail.get('Subject')
    table.execute('INSERT INTO emails VALUES (?,?,?,?,?)', (date, frm, to, subject, body))
    # Commit on the connection that owns the cursor we were handed, so the
    # function does not depend on a module-level connection object.
    table.connection.commit()
# Import every file in the current directory whose name ends in "txt".
# Each file is an archive of several messages separated by a line of "="
# characters; every decodable part of every message becomes one row.
MESSAGE_SEPARATOR = '========================================================================='

try:
    for filename in os.listdir():
        if not filename.endswith("txt"):
            continue
        with open(filename, 'r', encoding="ISO-8859-1") as f:
            content = f.read()
        for m in content.split(MESSAGE_SEPARATOR):
            m = m.strip()
            if not m:
                # split() yields empty chunks around leading/trailing
                # separators; parsing them would insert all-NULL rows.
                continue
            mail = email.message_from_string(m)
            # walk() yields the message itself when it is not multipart, so
            # one loop handles both single-part and multipart messages.
            for part in mail.walk():
                body = part.get_payload(decode=True)
                if body is None:
                    # Multipart containers have no payload of their own.
                    continue
                data = body.decode("ISO-8859-1")
                data = cleanup(data)
                # Headers always come from the top-level message, not the part.
                writedata(mail, data, c)
finally:
    # Release the database handle even if a file fails to parse.
    conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment