Usage: `python leadScraper.py [cohortID1] [cohortID2] [etc...]`. The script scrapes TIYA lead information from each listed cohort and writes it to a CSV. Be sure to fill in the `opts` under `# configuration` at the top of the script.
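For example (the email, password, and cohort IDs below are placeholders, not real values), the filled-in configuration and a typical invocation would look something like this:

```python
# at the top of leadScraper.py -- placeholder credentials, not a real account
opts = {
    'email': 'admin@example.com',
    'password': 'hunter2',
    'csvFileName': 'leads.csv'
}

# then run it against one or more cohort IDs (101 and 102 are made-up examples):
#   python leadScraper.py 101 102
```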
import re
import sys
import csv
import mechanize
from bs4 import BeautifulSoup as BS

# configuration
opts = {
    'email': '',
    'password': '',
    'csvFileName': 'leads.csv'
}

# write source of currently visited page. useful for debugging
def writeResponse():
    with open('temp.txt', 'w') as tempFile:
        tempFile.write(br.response().read())

# parse command line args
try:
    cohortIds = sys.argv[1:]
    if len(cohortIds) < 1:
        raise Exception
    opts['cohortIds'] = cohortIds
except Exception:
    print 'usage: python leadScraper.py [space-separated cohort ids]'
    exit(1)

# useful regex patterns
leadP = re.compile(r'/admin/leads/(\d+)')
phoneP = re.compile(r'tel:(.+)')
emailP = re.compile(r'mailto:(.+)')
statusP = re.compile(r'(applicant|accepted):')

# browser spoofing
br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [
    ('User-agent', 'Firefox')
]

# log in
br.open('https://academy.theironyard.com')
br.select_form(nr=0)
br.form['user[email]'] = opts['email']
br.form['user[password]'] = opts['password']
br.submit()
print 'logged in...'

# pull phone, email, cohort, and status off the lead page currently loaded in the browser
def writeContactInfo(csvRow, i):
    print 'recording info for student %s' % i
    soup = BS(br.response().read())
    for a in soup.findAll('a', href=True):
        if re.search(phoneP, a['href']):
            csvRow['phone'] = re.search(phoneP, a['href']).group(1)
        if re.search(emailP, a['href']):
            csvRow['email'] = re.search(emailP, a['href']).group(1)
        if '/admin/cohorts/' in a['href']:
            csvRow['cohort'] = a.string
    for span in soup.findAll('span', attrs={"class": "label"}):
        if span.string and re.search(statusP, span.string):
            csvRow['status'] = span.string
    return csvRow

# scrape a single cohort
def scrapeCohort(cid):
    print 'scraping cohort with id %s' % cid
    # accumulate all the lead ids first. things happen in two loops because of an
    # odd problem with the br.links() generator when the browser leaves the page.
    br.open('https://academy.theironyard.com/admin/cohorts/%s' % cid)
    leadURLs = []
    for link in br.links():
        if re.search(leadP, link.url):
            leadURLs.append((link.text, 'https://academy.theironyard.com/admin/leads/%s' % re.search(leadP, link.url).group(1)))
    # visit all lead pages and collect the contact info, one row per lead.
    i = 1
    cohortRows = []
    for tpl in leadURLs:
        csvRow = {'name': tpl[0]}
        br.open(tpl[1])
        cohortRows.append(writeContactInfo(csvRow, i))
        i += 1
    return cohortRows

# iterate over all input cohorts and scrape each one
csvRows = []
for cid in opts['cohortIds']:
    csvRows += scrapeCohort(cid)

# write to csv
with open(opts['csvFileName'], 'w') as outfile:
    csvHandle = csv.DictWriter(outfile, fieldnames=['name', 'email', 'phone', 'status', 'cohort'])
    csvHandle.writeheader()
    for row in csvRows:
        csvHandle.writerow(row)
print 'lead data written to %s' % opts['csvFileName']