tsl-lindsay · March 1, 2018 22:00
diff --git a/gistfile1.txt b/gistfile1.txt
 # This sample code will get data from excel files that are sent as gmail attachments by scraping the embedded "sheet1.xml" files, if the files are formatted the same way.

 # The data in each attachment (in this example, arrest records) will be saved a csv file in the "outpath" subdirectory. The csvs can later be combined using csvkit or another tool if so desired.

 # Code originally written in Python 2.7 (sorry!)

 import os
 import re
 import csv
 import imaplib
 import pyemail
 import zipfile
 from bs4 import BeautifulSoup

 # replace EMAIL_ACCOUNT and EMAIL_PASSWORD
 m = imaplib.IMAP4_SSL("imap.gmail.com")
 m.login('[email protected]', 'EMAIL_PASSWORD')
 m.select("SELECTION") # replace SELECTION with folder name to select emails in a specific folder
 resp, items = m.search(None, "ALL")

 # getting the mails id
 items = items[0].split()

 mainpath = 'email_attachments/'
 attachpath = os.path.join(mainpath + '/attachments/')
 outpath = os.path.join(mainpath + '/output/')

 def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)

 createFolder(attachpath)
 createFolder(outpath) # this is where the csvs will be saved

 for emailid in items[:2]: # for example, here are a couple of emails
 # fetch the mail. "`(RFC822)`" means "get everything"
    resp, data = m.fetch(emailid, "(RFC822)")
    
 # get the mail content
    email_body = data[0][1]
    
 # parse the mail content
    mail = pyemail.parse(email_body)
    mail['id'] = emailid
    
    for index, attachment in enumerate(mail.get("attachments")):
        # make a new attachment record for each attachment
        path = os.path.join('email_attachments', "%s-%s" % (emailid, index))
        out = open(path, 'w')
        zipped = attachment.read()
        out.write(zipped)
        out.close()
        
        # treat the excel file as a zip file and extract the contents
        t = zipfile.ZipFile(path, 'r')
        t.extractall(attachpath)
        
        os.rename(os.path.join(attachpath, 'sheet1.xml'), "%s-%s" % (emailid, index))
        
        # now, use BeautifulSoup to "scrape" the xml and write to a csv
        with open(os.path.join("%s-%s" % (emailid, index))) as infile:
            blob = infile.read()
        
        f = csv.writer(open(os.path.join(outpath, 'excel_file_%s-%s.csv' % (emailid, index)), 'w'))
        f.writerow(['booking_num', 'booking_date', 'arrestee_address'])
    
        soup = BeautifulSoup(blob, 'xml')
        r = soup.find_all('row')
    
        for row in r[3: ]:
            booking_num = row.get_text().split('\n')[1]
            booking_date = row.get_text().split('\n')[2]
            arrestee_address = re.sub('\s+',' ',row.get_text().split("\n")[19]) # I'm only pulling out 3 fields here since this is just an example
    
            f.writerow([booking_num, booking_date, arrestee_address])
	# This sample code will get data from excel files that are sent as gmail attachments by scraping the embedded "sheet1.xml" files, if the files are formatted the same way.

	# The data in each attachment (in this example, arrest records) will be saved a csv file in the "outpath" subdirectory. The csvs can later be combined using csvkit or another tool if so desired.

	# Code originally written in Python 2.7 (sorry!)

	import os
	import re
	import csv
	import imaplib
	import pyemail
	import zipfile
	from bs4 import BeautifulSoup

	# replace EMAIL_ACCOUNT and EMAIL_PASSWORD
	m = imaplib.IMAP4_SSL("imap.gmail.com")
	m.login('[email protected]', 'EMAIL_PASSWORD')
	m.select("SELECTION") # replace SELECTION with folder name to select emails in a specific folder
	resp, items = m.search(None, "ALL")

	# getting the mails id
	items = items[0].split()

	mainpath = 'email_attachments/'
	attachpath = os.path.join(mainpath + '/attachments/')
	outpath = os.path.join(mainpath + '/output/')

	def createFolder(directory):
	try:
	if not os.path.exists(directory):
	os.makedirs(directory)
	except OSError:
	print ('Error: Creating directory. ' + directory)

	createFolder(attachpath)
	createFolder(outpath) # this is where the csvs will be saved

	for emailid in items[:2]: # for example, here are a couple of emails
	# fetch the mail. "`(RFC822)`" means "get everything"
	resp, data = m.fetch(emailid, "(RFC822)")

	# get the mail content
	email_body = data[0][1]

	# parse the mail content
	mail = pyemail.parse(email_body)
	mail['id'] = emailid

	for index, attachment in enumerate(mail.get("attachments")):
	# make a new attachment record for each attachment
	path = os.path.join('email_attachments', "%s-%s" % (emailid, index))
	out = open(path, 'w')
	zipped = attachment.read()
	out.write(zipped)
	out.close()

	# treat the excel file as a zip file and extract the contents
	t = zipfile.ZipFile(path, 'r')
	t.extractall(attachpath)

	os.rename(os.path.join(attachpath, 'sheet1.xml'), "%s-%s" % (emailid, index))

	# now, use BeautifulSoup to "scrape" the xml and write to a csv
	with open(os.path.join("%s-%s" % (emailid, index))) as infile:
	blob = infile.read()

	f = csv.writer(open(os.path.join(outpath, 'excel_file_%s-%s.csv' % (emailid, index)), 'w'))
	f.writerow(['booking_num', 'booking_date', 'arrestee_address'])

	soup = BeautifulSoup(blob, 'xml')
	r = soup.find_all('row')

	for row in r[3: ]:
	booking_num = row.get_text().split('\n')[1]
	booking_date = row.get_text().split('\n')[2]
	arrestee_address = re.sub('\s+',' ',row.get_text().split("\n")[19]) # I'm only pulling out 3 fields here since this is just an example

	f.writerow([booking_num, booking_date, arrestee_address])