@magentanova
Last active August 8, 2016 16:20
Usage: `python leadScraper.py [cohortID1] [cohortID2] [etc...]`. Scrapes TIYA lead information from each given cohort and writes it to a CSV. Be sure to fill in the `opts` under `# configuration` at the top of the script.
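For example, `python leadScraper.py 101 102` would scrape cohorts 101 and 102 (the IDs here are placeholders; use the cohort IDs from your own TIYA admin URLs).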
import re
import sys
import csv
import time
import mechanize
from bs4 import BeautifulSoup as BS
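# assumes Python 2 with the mechanize and beautifulsoup4 packages installed
# (pip install mechanize beautifulsoup4)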
# configuration
opts = {
    'email': '',
    'password': '',
    'csvFileName': 'leads.csv'
}
# write the source of the currently visited page to a file. useful for debugging.
def writeResponse():
    with open('temp.txt', 'w') as tempFile:
        tempFile.write(br.response().read())
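# example: call writeResponse() right after a br.open() or br.submit() and
# open temp.txt to see exactly what the server sent back.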
# parse command-line args
try:
    cohortIds = sys.argv[1:]
    if len(cohortIds) < 1:
        raise Exception
    opts['cohortIds'] = cohortIds
except:
    print 'usage: python leadScraper.py [space-separated cohort ids]'
    exit(1)
# useful regex patterns
leadP = re.compile(r'/admin/leads/(\d+)')
phoneP = re.compile(r'tel:(.+)')
emailP = re.compile(r'mailto:(.+)')
# a group, not a character class, is needed here to match either whole word
statusP = re.compile(r'(applicant|accepted):')
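# for example, leadP captures '123' from an href like '/admin/leads/123',
# and phoneP captures everything after 'tel:' in a tel: link.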
# browser spoofing
br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [
    ('User-agent', 'Firefox')
]
# log in
br.open('https://academy.theironyard.com')
br.select_form(nr=0)
br.form['user[email]'] = opts['email']
br.form['user[password]'] = opts['password']
br.submit()
print 'logged in...'
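# note: mechanize will not raise if the login fails, so the message above is
# optimistic. if the scrape turns up nothing, call writeResponse() here and
# inspect temp.txt to confirm you actually got past the login form.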
# pull contact info out of the lead page currently loaded in the browser
def writeContactInfo(csvRow, i):
    print 'recording info for student %s' % i
    soup = BS(br.response().read(), 'html.parser')
    for a in soup.findAll('a'):
        href = a.get('href', '')
        if re.search(phoneP, href):
            csvRow['phone'] = re.search(phoneP, href).group(1)
        if re.search(emailP, href):
            csvRow['email'] = re.search(emailP, href).group(1)
        if '/admin/cohorts/' in href:
            csvRow['cohort'] = a.string
    for span in soup.findAll('span', attrs={"class": "label"}):
        if span.string and re.search(statusP, span.string):
            csvRow['status'] = span.string
    return csvRow
# scrape a single cohort
def scrapeCohort(cid):
    print 'scraping cohort with id %s' % cid
    # accumulate all the lead URLs first. this happens in two loops because the
    # br.links() generator misbehaves once the browser leaves the page.
    br.open('https://academy.theironyard.com/admin/cohorts/%s' % cid)
    leadURLs = []
    for link in br.links():
        match = re.search(leadP, link.url)
        if match:
            leadURLs.append((link.text, 'https://academy.theironyard.com/admin/leads/%s' % match.group(1)))
    # then visit each lead page and collect the contact info, one row per lead
    cohortRows = []
    for i, (name, url) in enumerate(leadURLs, 1):
        csvRow = {'name': name}
        br.open(url)
        cohortRows.append(writeContactInfo(csvRow, i))
    return cohortRows
# iterate over all input cohorts and scrape each one
csvRows = []
for cid in opts['cohortIds']:
    csvRows += scrapeCohort(cid)
# write to csv
with open(opts['csvFileName'], 'w') as outfile:
    csvHandle = csv.DictWriter(outfile, fieldnames=['name', 'email', 'phone', 'status', 'cohort'])
    csvHandle.writeheader()
    for row in csvRows:
        csvHandle.writerow(row)
print 'lead data written to %s' % opts['csvFileName']