Created
September 30, 2016 15:46
-
-
Save wckdouglas/ec96a02b19cebc9b67f5b62d99569033 to your computer and use it in GitHub Desktop.
From the Apple Mail database, find all UTexas GSAF emails and extract the sequencing job number and sample names.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
| from HTMLParser import HTMLParseError | |
| import os | |
def extractTable(soup):
    """Build a (job_number, SampleName) table from one parsed email.

    Args:
        soup: Parsed email body. Only ``getText()``, ``find('table')`` and
            ``find_all('tr')`` are called on it; rows must support
            ``has_attr`` and ``find_all('td')``.

    Returns:
        pandas.DataFrame with exactly two columns, ``job_number`` (the same
        value on every row) and ``SampleName``.

    Raises:
        IndexError: fewer than two space-separated tokens start with 'JA'.
        AttributeError: ``soup.find('table')`` returned None.
        KeyError: the table has no 'SampleName' header.
    """
    # The job id is taken from the *second* token starting with 'JA'.
    # NOTE(review): presumably the first hit is the subject line -- confirm.
    job_tokens = [word for word in soup.getText().split(' ')
                  if word.startswith('JA')]
    # '\n>=20\n>' looks like quoted-reply / quoted-printable residue that can
    # trail the token in the raw message text.
    JOB_num = job_tokens[1].strip('.').replace('\n>=20\n>', '')

    table = soup.find('table')
    # Materialise the header names: they are iterated once to create the
    # columns and again for every row, so a one-shot iterator is not enough.
    header = [th.text for th in table.find_all('th')]
    data_frame = {h: [] for h in header}

    for row in soup.find_all('tr'):
        # Only rows without a ``class`` attribute are treated as data rows.
        if row.has_attr('class'):
            continue
        # '=\n' looks like a quoted-printable soft line break inside a cell.
        cells = [td.get_text().replace('=\n', '') for td in row.find_all('td')]
        for h, value in zip(header, cells):
            data_frame[h].append(value)

    number_of_sample = len(data_frame['SampleName'])
    # Keep only columns as long as SampleName; build a new dict rather than
    # popping from the one being iterated.
    complete = {h: values for h, values in data_frame.items()
                if len(values) == number_of_sample}

    df = pd.DataFrame(complete).assign(job_number=JOB_num)
    return df[['job_number', 'SampleName']]
def readEmail(email_file):
    """Parse one email file if it is a GSAF job-approval message.

    Args:
        email_file: Path of the email file to read.

    Returns:
        The DataFrame produced by ``extractTable`` when the raw text contains
        both 'Submission has been approved and is now job' and 'gsaf';
        otherwise None.
    """
    # Close the handle promptly: this is called once per file in the mailbox.
    with open(email_file) as handle:
        email = handle.read()

    title_is_right = 'Submission has been approved and is now job' in email
    sender_is_right = 'gsaf' in email
    if not (title_is_right and sender_is_right):
        return None

    soup = BeautifulSoup(email, 'html.parser')
    return extractTable(soup)
def main(email_dir='/Users/wckdouglas/Library/Mail', out_file='job_number.tsv'):
    """Scan a mail directory and write all job/sample pairs to a TSV file.

    Args:
        email_dir: Root directory walked recursively for files whose name
            ends with 'emlx'.
        out_file: Path of the tab-separated output file.

    Returns:
        0 after the table is written; 1 if no email produced a table (nothing
        is written in that case).
    """
    dfs = []
    count = 0
    for dpath, _dnames, fnames in os.walk(email_dir):
        for fname in fnames:
            if not fname.endswith('emlx'):
                continue
            count += 1
            df = readEmail(os.path.join(dpath, fname))
            # readEmail returns None for messages that are not job approvals.
            if df is not None:
                dfs.append(df)
            if count % 1000 == 0:
                print('Parsed: %i emails' % count)

    # pd.concat raises ValueError on an empty list; report and bail out.
    if not dfs:
        print('No matching emails found under %s' % email_dir)
        return 1

    df = pd.concat(dfs, axis=0).reset_index(drop=True)
    df.to_csv(out_file, index=False, sep='\t')
    print('Written %s' % out_file)
    return 0
# Run the scan only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment