Skip to content

Instantly share code, notes, and snippets.

@higs4281
Last active December 21, 2015 19:59
Show Gist options
  • Save higs4281/6358296 to your computer and use it in GitHub Desktop.
Save higs4281/6358296 to your computer and use it in GitHub Desktop.
scraper
# scraper for joni james
# author: bill higgins, 8/27/2013
import csvkit
import requests
from bs4 import BeautifulSoup as bs
import datetime
# Wall-clock start; read again at the end of the script to report elapsed time.
starter = datetime.datetime.now()
outfile = "providers.csv"

# The listing URL and the per-provider detail URL share one site root.
base = "http://elcpinellas.net/"
list_base = base + "provider_searchresults.php?action=search&YearAvailable=2013"
detail_base = base + "directory_details.php?ProviderID="

# Fetch the search-results page. Providers are the <li> items of the first
# <ul> found inside the div whose id is "right-content".
soup = bs(requests.get(list_base).text)
content_div = soup.find('div', attrs={'id': 'right-content'})
entries = content_div.find('ul').findAll('li')

# One record (a plain dict) per provider, keyed by the ProviderID that appears
# in the query string of the entry's link.
pdict = {}
for entry in entries:
    link = entry.a
    provider_id = link['href'].split('ProviderID=')[1]
    record = {}
    pdict[provider_id] = record
    record['detail_page'] = detail_base + provider_id
    record['name'] = link.text
    # The address is whatever sits between the first and second literal
    # '<br>' in the entry's serialized markup.
    # NOTE(review): this and the city line below depend on how the parser
    # nests and serializes <br> -- confirm against the parser in use.
    record['address'] = str(entry).split('<br>')[1]
    # The last <br> tag's text is expected to look like "<city>, FL <zip>".
    city_parts = entry.findAll('br')[-1].text.split(', FL ')
    record['city'] = city_parts[0]
    record['zip'] = city_parts[1]
    record['id'] = provider_id
# Second pass: visit each provider's detail page and copy the form values
# found in its first table onto the matching record in pdict.
#
# A parenthesised single-argument print behaves the same whether it is parsed
# as a print statement or as a call to the print() function.
print("picked up %s records from the list page; now getting details" % len(pdict))

# (label fragment, record key) pairs. A row's value is stored under every key
# whose fragment occurs in the row's label text, which mirrors the original
# chain of independent substring checks.
detail_fields = (
    ('Contact Name', 'contact'),
    ('Phone', 'phone'),
    ('Hours 540', 'hours 540'),
    ('Provider Type', 'provider_type'),
    ('Fall', 'Fall_readiness'),
    ('Summer', 'Summer_readiness'),
    ('Hours Begin', 'daily_begin_time'),
    ('Hours End', 'daily_end_time'),
    ('Program Begins', 'program_start_date'),
    ('Program Ends', 'program_end_date'),
    ('Weekly Hours', 'weekly_hours'),
)

for provider_id in pdict:
    record = pdict[provider_id]

    # Give every record every detail key up front, so a page that lacks a
    # field yields an empty value instead of a missing key later on.
    for _fragment, field in detail_fields:
        record.setdefault(field, '')

    psoup = bs(requests.get(record['detail_page']).text)
    tables = psoup.findAll('table')
    if not tables:
        # No table on the page: keep the empty defaults and move on.
        continue

    for row in tables[0].findAll('tr'):
        cells = row.findAll('td')
        if len(cells) < 2:
            # Rows without a label cell and a value cell carry nothing to read.
            continue
        form_input = cells[1].input
        if form_input is None:
            # The value is read from an <input> in the second cell; skip rows
            # that have none.
            continue
        label = cells[0].text
        value = form_input.get('value', '')
        for fragment, field in detail_fields:
            if fragment in label:
                record[field] = value
# Write one CSV row per provider record, then report the row count and the
# total elapsed time.
#
# Column order of the output file; every name is a key used on the records.
headers = [
    'name',
    'id',
    'city',
    'zip',
    'address',
    'detail_page',
    'contact',
    'phone',
    'hours 540',
    'provider_type',
    'Fall_readiness',
    'Summer_readiness',
    'daily_begin_time',
    'daily_end_time',
    'program_start_date',
    'program_end_date',
    'weekly_hours',
]

with open(outfile, 'w') as f:
    wr = csvkit.CSVKitWriter(f)
    wr.writerow(headers)
    for record in pdict.values():
        # A record that never received a given field is written with an empty
        # cell rather than aborting the whole export with a KeyError.
        wr.writerow([record.get(column, '') for column in headers])

elapsed = datetime.datetime.now() - starter
# A parenthesised single-argument print works as a statement and as a call.
print("exported %s rows to %s; it took %s" % (len(pdict), outfile, elapsed))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment