Usage: `python leadScraper.py [cohortID1] [cohortID2] [etc...]`. The script scrapes TIYA lead information from each listed cohort and writes it to a CSV. Be sure to fill in the `opts` under `# configuration` at the top of the script.
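For example (the email, password, and cohort IDs below are placeholders, not real values), the filled-in configuration and a typical invocation would look something like this:

```python
# at the top of leadScraper.py -- placeholder credentials, not a real account
opts = {
    'email': 'admin@example.com',
    'password': 'hunter2',
    'csvFileName': 'leads.csv'
}

# then run it against one or more cohort IDs (101 and 102 are made-up examples):
#   python leadScraper.py 101 102
```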
import re
import sys
import csv
import mechanize
from bs4 import BeautifulSoup as BS

# configuration
opts = {
    'email': '',
    'password': '',
    'csvFileName': 'leads.csv'
}

# write source of currently visited page. useful for debugging
def writeResponse():
    with open('temp.txt', 'w') as tempFile:
        tempFile.write(br.response().read())

# parse command line args
try:
    cohortIds = sys.argv[1:]
    if len(cohortIds) < 1:
        raise Exception
    opts['cohortIds'] = cohortIds
except Exception:
    print 'usage: python leadScraper.py [space-separated cohort ids]'
    exit(1)

# useful regex patterns
leadP = re.compile(r'/admin/leads/(\d+)')
phoneP = re.compile(r'tel:(.+)')
emailP = re.compile(r'mailto:(.+)')
statusP = re.compile(r'(applicant|accepted):')

# browser spoofing
br = mechanize.Browser()
br.set_handle_robots(False)
br.addheaders = [
    ('User-agent', 'Firefox')
]

# log in
br.open('https://academy.theironyard.com')
br.select_form(nr=0)
br.form['user[email]'] = opts['email']
br.form['user[password]'] = opts['password']
br.submit()
print 'logged in...'

# pull phone, email, cohort, and status off the lead page currently loaded in the browser
def writeContactInfo(csvRow, i):
    print 'recording info for student %s' % i
    soup = BS(br.response().read())
    for a in soup.findAll('a', href=True):
        if re.search(phoneP, a['href']):
            csvRow['phone'] = re.search(phoneP, a['href']).group(1)
        if re.search(emailP, a['href']):
            csvRow['email'] = re.search(emailP, a['href']).group(1)
        if '/admin/cohorts/' in a['href']:
            csvRow['cohort'] = a.string
    for span in soup.findAll('span', attrs={"class": "label"}):
        if span.string and re.search(statusP, span.string):
            csvRow['status'] = span.string
    return csvRow

# scrape a single cohort
def scrapeCohort(cid):
    print 'scraping cohort with id %s' % cid
    # accumulate all the lead ids first. things happen in two loops because of an
    # odd problem with the br.links() generator when the browser leaves the page.
    br.open('https://academy.theironyard.com/admin/cohorts/%s' % cid)
    leadURLs = []
    for link in br.links():
        if re.search(leadP, link.url):
            leadURLs.append((link.text, 'https://academy.theironyard.com/admin/leads/%s' % re.search(leadP, link.url).group(1)))
    # visit all lead pages and collect the contact info, one row per lead.
    i = 1
    cohortRows = []
    for tpl in leadURLs:
        csvRow = {'name': tpl[0]}
        br.open(tpl[1])
        cohortRows.append(writeContactInfo(csvRow, i))
        i += 1
    return cohortRows

# iterate over all input cohorts and scrape each one
csvRows = []
for cid in opts['cohortIds']:
    csvRows += scrapeCohort(cid)

# write to csv
with open(opts['csvFileName'], 'w') as outfile:
    csvHandle = csv.DictWriter(outfile, fieldnames=['name', 'email', 'phone', 'status', 'cohort'])
    csvHandle.writeheader()
    for row in csvRows:
        csvHandle.writerow(row)
print 'lead data written to %s' % opts['csvFileName']