Skip to content

Instantly share code, notes, and snippets.

@wckdouglas
Created September 30, 2016 15:46
Show Gist options
  • Save wckdouglas/ec96a02b19cebc9b67f5b62d99569033 to your computer and use it in GitHub Desktop.
Save wckdouglas/ec96a02b19cebc9b67f5b62d99569033 to your computer and use it in GitHub Desktop.
From the Apple Mail database, find all UTexas GSAF emails and extract the sequencing job number and sample names
#!/usr/bin/env python
from bs4 import BeautifulSoup
import pandas as pd
from HTMLParser import HTMLParseError
import os
def extractTable(soup):
    """Build a (job_number, SampleName) table from a parsed approval email.

    Args:
        soup: BeautifulSoup document of the email.

    Returns:
        pandas.DataFrame with columns ``job_number`` and ``SampleName``,
        one row per sample cell collected from the email's table rows.

    Raises:
        IndexError: fewer than two space-delimited words start with 'JA'.
        AttributeError: the document contains no ``<table>``.
        KeyError: the table has no ``SampleName`` header.
    """
    # The job number is taken from the SECOND space-delimited word that
    # starts with 'JA' (the first occurrence is skipped).
    ja_words = [word for word in soup.getText().split(' ')
                if word.startswith('JA')]
    JOB_num = ja_words[1]
    # Drop trailing periods and the quoted-reply residue that can be glued
    # onto the token.
    JOB_num = JOB_num.strip('.').replace('\n>=20\n>', '')

    table = soup.find('table')
    # Materialise the header names as a list: they are iterated once to
    # build the column dict and again for every row, so a one-shot iterator
    # (Python 3 ``map``) would silently yield no data.
    header = [th.text for th in table.find_all('th')]
    data_frame = {h: [] for h in header}

    for row in soup.find_all('tr'):
        # Only rows WITHOUT a 'class' attribute carry sample data.
        if row.has_attr('class'):
            continue
        # '=\n' is removed from each cell (presumably quoted-printable soft
        # line breaks left in the raw message -- confirm against a sample).
        cells = [td.get_text().replace('=\n', '')
                 for td in row.find_all('td')]
        for h, cell in zip(header, cells):
            data_frame[h].append(cell)

    number_of_sample = len(data_frame['SampleName'])
    # Keep only columns as long as SampleName so the DataFrame constructor
    # does not fail on ragged columns. A new dict is built instead of
    # popping while iterating the keys, which raises on Python 3.
    data_frame = {key: values for key, values in data_frame.items()
                  if len(values) == number_of_sample}

    df = pd.DataFrame(data_frame).assign(job_number=JOB_num)
    return df[['job_number', 'SampleName']]
def readEmail(email_file):
    """Parse one mail file if it is a GSAF job-approval message.

    Args:
        email_file: path of the mail file to read.

    Returns:
        The DataFrame produced by ``extractTable`` when the raw text
        contains both the approval phrase and 'gsaf'; otherwise ``None``.
    """
    # Use a context manager so the handle is closed immediately; the caller
    # walks a whole mailbox and would otherwise leak one descriptor per file.
    with open(email_file) as handle:
        email = handle.read()

    title_is_right = 'Submission has been approved and is now job' in email
    sender_is_right = 'gsaf' in email
    if not (title_is_right and sender_is_right):
        return None

    soup = BeautifulSoup(email, 'html.parser')
    return extractTable(soup)
def main():
    """Scan the Apple Mail store and write job/sample pairs to a TSV.

    Walks the hard-coded mail directory, passes every file whose name ends
    with 'emlx' to ``readEmail``, concatenates the resulting tables and
    writes them to ``job_number.tsv`` in the current directory.

    Returns:
        0 after the TSV has been written; 1 when no email produced a table
        (nothing is written in that case).
    """
    email_dir = '/Users/wckdouglas/Library/Mail'
    dfs = []
    count = 0
    for dpath, _dnames, fnames in os.walk(email_dir):
        for fname in fnames:
            if not fname.endswith('emlx'):
                continue
            count += 1
            df = readEmail(os.path.join(dpath, fname))
            # readEmail returns None for non-matching mail; skip those now
            # rather than filtering the whole list afterwards.
            if df is not None:
                dfs.append(df)
            # Progress report every 1000 candidate files.
            if count % 1000 == 0:
                print('Parsed: %i emails' % count)

    # pd.concat raises ValueError on an empty sequence, so bail out cleanly
    # when no approval email was found.
    if not dfs:
        print('No matching emails found; job_number.tsv not written')
        return 1

    df = pd.concat(dfs, axis=0).reset_index(drop=True)
    df.to_csv('job_number.tsv', index=False, sep='\t')
    print('Written job_number.tsv')
    return 0
# Script entry point: run the mailbox scan only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment