Created
March 1, 2018 22:00
-
-
Save tsl-lindsay/46fa475879b5f15e570e13c1a9598dd4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This sample code will get data from excel files that are sent as gmail attachments by scraping the embedded "sheet1.xml" files, if the files are formatted the same way. | |
# The data in each attachment (in this example, arrest records) will be saved a csv file in the "outpath" subdirectory. The csvs can later be combined using csvkit or another tool if so desired. | |
# Code originally written in Python 2.7 (sorry!) | |
import os | |
import re | |
import csv | |
import imaplib | |
import pyemail | |
import zipfile | |
from bs4 import BeautifulSoup | |
# replace EMAIL_ACCOUNT and EMAIL_PASSWORD | |
m = imaplib.IMAP4_SSL("imap.gmail.com") | |
m.login('[email protected]', 'EMAIL_PASSWORD') | |
m.select("SELECTION") # replace SELECTION with folder name to select emails in a specific folder | |
resp, items = m.search(None, "ALL") | |
# getting the mails id | |
items = items[0].split() | |
mainpath = 'email_attachments/' | |
attachpath = os.path.join(mainpath + '/attachments/') | |
outpath = os.path.join(mainpath + '/output/') | |
def createFolder(directory): | |
try: | |
if not os.path.exists(directory): | |
os.makedirs(directory) | |
except OSError: | |
print ('Error: Creating directory. ' + directory) | |
createFolder(attachpath) | |
createFolder(outpath) # this is where the csvs will be saved | |
for emailid in items[:2]: # for example, here are a couple of emails | |
# fetch the mail. "`(RFC822)`" means "get everything" | |
resp, data = m.fetch(emailid, "(RFC822)") | |
# get the mail content | |
email_body = data[0][1] | |
# parse the mail content | |
mail = pyemail.parse(email_body) | |
mail['id'] = emailid | |
for index, attachment in enumerate(mail.get("attachments")): | |
# make a new attachment record for each attachment | |
path = os.path.join('email_attachments', "%s-%s" % (emailid, index)) | |
out = open(path, 'w') | |
zipped = attachment.read() | |
out.write(zipped) | |
out.close() | |
# treat the excel file as a zip file and extract the contents | |
t = zipfile.ZipFile(path, 'r') | |
t.extractall(attachpath) | |
os.rename(os.path.join(attachpath, 'sheet1.xml'), "%s-%s" % (emailid, index)) | |
# now, use BeautifulSoup to "scrape" the xml and write to a csv | |
with open(os.path.join("%s-%s" % (emailid, index))) as infile: | |
blob = infile.read() | |
f = csv.writer(open(os.path.join(outpath, 'excel_file_%s-%s.csv' % (emailid, index)), 'w')) | |
f.writerow(['booking_num', 'booking_date', 'arrestee_address']) | |
soup = BeautifulSoup(blob, 'xml') | |
r = soup.find_all('row') | |
for row in r[3: ]: | |
booking_num = row.get_text().split('\n')[1] | |
booking_date = row.get_text().split('\n')[2] | |
arrestee_address = re.sub('\s+',' ',row.get_text().split("\n")[19]) # I'm only pulling out 3 fields here since this is just an example | |
f.writerow([booking_num, booking_date, arrestee_address]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment