wckdouglas · September 30, 2016 15:46
diff --git a/find_all_gsaf_job.py b/find_all_gsaf_job.py
 #!/usr/bin/env python

 from bs4 import BeautifulSoup
 import pandas as pd
 from HTMLParser import HTMLParseError
 import os

 def extractTable(soup):
    """Build a (job_number, SampleName) table from a parsed approval email.

    soup: parsed HTML document (a BeautifulSoup object in this script); it
        must provide getText(), find() and find_all().

    Returns a pandas DataFrame with one row per sample and exactly two
    columns, 'job_number' and 'SampleName'.

    Raises IndexError when the text holds fewer than two space-separated
    words starting with 'JA', AttributeError when the document has no
    <table>, and KeyError when the table has no 'SampleName' header.
    """
    # The job id is the SECOND space-delimited word starting with 'JA'.
    # NOTE(review): the stripped '\n>=20\n>' sequence looks like
    # quoted-printable / reply-quote residue -- confirm against a real email.
    ja_words = [word for word in soup.getText().split(' ') if word.startswith('JA')]
    JOB_num = ja_words[1].strip('.').replace('\n>=20\n>', '')

    table = soup.find('table')
    # Materialise the header names: a lazy map() would be exhausted by the
    # dict comprehension on Python 3, leaving zip() below with nothing to pair.
    header = [cell.text for cell in table.find_all('th')]
    data_frame = {name: [] for name in header}

    for row in soup.find_all('tr'):
        # Only rows WITHOUT a 'class' attribute are read as sample rows.
        if row.has_attr('class'):
            continue
        cells = [cell.get_text().replace('=\n', '') for cell in row.find_all('td')]
        for name, value in zip(header, cells):
            data_frame[name].append(value)

    number_of_sample = len(data_frame['SampleName'])
    # Keep only the columns as long as 'SampleName' so the DataFrame
    # constructor receives equal-length columns. A new dict is built because
    # popping keys while iterating the dict fails on Python 3.
    complete = {name: values for name, values in data_frame.items()
                if len(values) == number_of_sample}
    df = pd.DataFrame(complete).assign(job_number=JOB_num)
    return df[['job_number', 'SampleName']]

 def readEmail(email_file):
    """Return the sample table of a GSAF job-approval email, or None.

    email_file: path of the message file to read.

    A message qualifies when its raw text contains both
    'Submission has been approved and is now job' and 'gsaf'. A qualifying
    message is parsed with BeautifulSoup ('html.parser') and handed to
    extractTable(); any other file yields None.
    """
    # Close the handle deterministically: this function is called once per
    # mail file, so leaked descriptors would pile up over a large mailbox.
    with open(email_file) as handle:
        email = handle.read()

    title_is_right = 'Submission has been approved and is now job' in email
    sender_is_right = 'gsaf' in email
    if not (title_is_right and sender_is_right):
        return None

    soup = BeautifulSoup(email, 'html.parser')
    return extractTable(soup)
    
 def main(email_dir='/Users/wckdouglas/Library/Mail'):
    """Scan a mail directory for GSAF job emails and write job_number.tsv.

    email_dir: root directory searched recursively for files whose name
        ends in 'emlx'. Defaults to the path that used to be hard-coded.

    Returns 0 after writing the tab-separated file 'job_number.tsv' to the
    current working directory, or 1 when no file produced a table (nothing
    is written in that case).
    """
    dfs = []
    count = 0
    for dpath, _dnames, fnames in os.walk(email_dir):
        for fname in fnames:
            if not fname.endswith('emlx'):
                continue
            count += 1
            table = readEmail(os.path.join(dpath, fname))
            # readEmail() returns None for messages that are not job emails.
            if table is not None:
                dfs.append(table)
            if count % 1000 == 0:
                print('Parsed: %i emails' % count)

    if not dfs:
        # pd.concat() raises ValueError on an empty list; report instead.
        print('No GSAF job emails found under %s' % email_dir)
        return 1

    df = pd.concat(dfs, axis=0).reset_index(drop=True)
    df.to_csv('job_number.tsv', index=False, sep='\t')
    print('Written job_number.tsv')
    return 0
    
 if __name__ == '__main__':
    # main() returns an integer status; hand it to the interpreter as the
    # process exit code instead of discarding it.
    raise SystemExit(main())
	#!/usr/bin/env python

	from bs4 import BeautifulSoup
	import pandas as pd
	from HTMLParser import HTMLParseError
	import os

	def extractTable(soup):
	    """Build a (job_number, SampleName) table from a parsed approval email.

	    soup: parsed HTML document (a BeautifulSoup object in this script); it
	        must provide getText(), find() and find_all().

	    Returns a pandas DataFrame with one row per sample and exactly two
	    columns, 'job_number' and 'SampleName'.

	    Raises IndexError when the text holds fewer than two space-separated
	    words starting with 'JA', AttributeError when the document has no
	    <table>, and KeyError when the table has no 'SampleName' header.
	    """
	    # The job id is the SECOND space-delimited word starting with 'JA'.
	    # NOTE(review): the stripped '\n>=20\n>' sequence looks like
	    # quoted-printable / reply-quote residue -- confirm against a real email.
	    ja_words = [word for word in soup.getText().split(' ') if word.startswith('JA')]
	    JOB_num = ja_words[1].strip('.').replace('\n>=20\n>', '')

	    table = soup.find('table')
	    # Materialise the header names: a lazy map() would be exhausted by the
	    # dict comprehension on Python 3, leaving zip() below with nothing to pair.
	    header = [cell.text for cell in table.find_all('th')]
	    data_frame = {name: [] for name in header}

	    for row in soup.find_all('tr'):
	        # Only rows WITHOUT a 'class' attribute are read as sample rows.
	        if row.has_attr('class'):
	            continue
	        cells = [cell.get_text().replace('=\n', '') for cell in row.find_all('td')]
	        for name, value in zip(header, cells):
	            data_frame[name].append(value)

	    number_of_sample = len(data_frame['SampleName'])
	    # Keep only the columns as long as 'SampleName' so the DataFrame
	    # constructor receives equal-length columns. A new dict is built because
	    # popping keys while iterating the dict fails on Python 3.
	    complete = {name: values for name, values in data_frame.items()
	                if len(values) == number_of_sample}
	    df = pd.DataFrame(complete).assign(job_number=JOB_num)
	    return df[['job_number', 'SampleName']]

	def readEmail(email_file):
	    """Return the sample table of a GSAF job-approval email, or None.

	    email_file: path of the message file to read.

	    A message qualifies when its raw text contains both
	    'Submission has been approved and is now job' and 'gsaf'. A qualifying
	    message is parsed with BeautifulSoup ('html.parser') and handed to
	    extractTable(); any other file yields None.
	    """
	    # Close the handle deterministically: this function is called once per
	    # mail file, so leaked descriptors would pile up over a large mailbox.
	    with open(email_file) as handle:
	        email = handle.read()

	    title_is_right = 'Submission has been approved and is now job' in email
	    sender_is_right = 'gsaf' in email
	    if not (title_is_right and sender_is_right):
	        return None

	    soup = BeautifulSoup(email, 'html.parser')
	    return extractTable(soup)

	def main(email_dir='/Users/wckdouglas/Library/Mail'):
	    """Scan a mail directory for GSAF job emails and write job_number.tsv.

	    email_dir: root directory searched recursively for files whose name
	        ends in 'emlx'. Defaults to the path that used to be hard-coded.

	    Returns 0 after writing the tab-separated file 'job_number.tsv' to the
	    current working directory, or 1 when no file produced a table (nothing
	    is written in that case).
	    """
	    dfs = []
	    count = 0
	    for dpath, _dnames, fnames in os.walk(email_dir):
	        for fname in fnames:
	            if not fname.endswith('emlx'):
	                continue
	            count += 1
	            table = readEmail(os.path.join(dpath, fname))
	            # readEmail() returns None for messages that are not job emails.
	            if table is not None:
	                dfs.append(table)
	            if count % 1000 == 0:
	                print('Parsed: %i emails' % count)

	    if not dfs:
	        # pd.concat() raises ValueError on an empty list; report instead.
	        print('No GSAF job emails found under %s' % email_dir)
	        return 1

	    df = pd.concat(dfs, axis=0).reset_index(drop=True)
	    df.to_csv('job_number.tsv', index=False, sep='\t')
	    print('Written job_number.tsv')
	    return 0

	if __name__ == '__main__':
	    # main() returns an integer status; hand it to the interpreter as the
	    # process exit code instead of discarding it.
	    raise SystemExit(main())