higs4281 · December 21, 2015 19:59
diff --git a/gistfile1.txt b/gistfile1.txt
 # scraper for joni james
 # author: bill higgins, 8/27/2013

 import csvkit
 import requests
 from bs4 import BeautifulSoup as bs
 import datetime

 # Wall-clock start; the final status line reports the elapsed time from here.
 starter = datetime.datetime.now()

 outfile = "providers.csv"

 base = "http://elcpinellas.net/"
 list_base = "%sprovider_searchresults.php?action=search&YearAvailable=2013" % base
 detail_base = "%sdirectory_details.php?ProviderID=" % base

 # Fetch the search-results page once. Raise on an HTTP error status so a
 # failed request surfaces as such instead of as an AttributeError on the
 # missing #right-content div below.
 list_response = requests.get(list_base)
 list_response.raise_for_status()
 # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
 # the '<br>' handling below presumably depends on that choice -- confirm
 # before pinning a parser.
 soup = bs(list_response.text)
 # Each provider is one <li> of the first <ul> inside the #right-content div.
 entries = soup.find('div', attrs={'id': 'right-content'}).find('ul').findAll('li')

 # pdict maps ProviderID (string taken from the link) -> dict of its fields.
 pdict = {}
 for entry in entries:
     # The ID is whatever follows 'ProviderID=' in the entry's link target.
     ID = entry.a['href'].split('ProviderID=')[1]
     pdict[ID] = {}
     pd = pdict[ID]
     pd['detail_page'] = "%s%s" % (detail_base, ID)
     pd['name'] = entry.a.text
     # Address: the markup between the first and second literal '<br>'.
     pd['address'] = str(entry).split('<br>')[1]
     # City and zip: text of the last <br> tag found, split on ', FL '.
     cityline = entry.findAll('br')[-1].text.split(', FL ')
     pd['city'] = cityline[0]
     pd['zip'] = cityline[1]
     pd['id'] = ID

 # Parenthesised single-argument form prints the same text on Python 2 and 3.
 print("picked up %s records from the list page; now getting details" % len(pdict))

 # (label substring, record key) pairs for the rows of the detail table.
 # Kept as an ordered list and tested exhaustively for every row, because the
 # labels are matched by substring and a row may match more than one of them.
 detail_fields = [
     ('Contact Name', 'contact'),
     ('Phone', 'phone'),
     ('Hours 540', 'hours 540'),
     ('Provider Type', 'provider_type'),
     ('Fall', 'Fall_readiness'),
     ('Summer', 'Summer_readiness'),
     ('Hours Begin', 'daily_begin_time'),
     ('Hours End', 'daily_end_time'),
     ('Program Begins', 'program_start_date'),
     ('Program Ends', 'program_end_date'),
     ('Weekly Hours', 'weekly_hours'),
 ]

 # Visit every provider's detail page and copy the matching form values
 # into that provider's record in pdict.
 for provider_id in pdict:
     provider = pdict[provider_id]
     psoup = bs(requests.get(provider['detail_page']).text)
     # Only the first <table> on the page is read.
     tables = psoup.findAll('table')
     bigt = tables[0]
     for row in bigt.findAll('tr'):
         cells = row.findAll('td')
         # A field row needs a label cell and a value cell; rows with fewer
         # <td> cells would otherwise raise IndexError.
         if len(cells) < 2:
             continue
         label = cells[0].text
         for marker, field in detail_fields:
             if marker in label:
                 # The value is the 'value' attribute of the cell's <input>.
                 provider[field] = cells[1].input['value']

 # Fixed column order for the CSV; these are also the keys read from every
 # provider record when its row is written.
 headers = [
     'name',
     'id',
     'city',
     'zip',
     'address',
     'detail_page',
     'contact',
     'phone',
     'hours 540',
     'provider_type',
     'Fall_readiness',
     'Summer_readiness',
     'daily_begin_time',
     'daily_end_time',
     'program_start_date',
     'program_end_date',
     'weekly_hours',
 ]

 # Write one header row, then one row per provider in pdict.
 with open(outfile, 'w') as f:
     wr = csvkit.CSVKitWriter(f)
     wr.writerow(headers)
     for provider in pdict.values():
         # A record only has the detail keys whose rows were found on its
         # page; write a blank cell for the rest instead of aborting the
         # whole export with a KeyError.
         wr.writerow([provider.get(column, '') for column in headers])

 elapsed = datetime.datetime.now() - starter
 # Parenthesised single-argument form prints the same text on Python 2 and 3.
 print("exported %s rows to %s; it took %s" % (len(pdict), outfile, elapsed))
	# scraper for joni james
	# author: bill higgins, 8/27/2013

	import csvkit
	import requests
	from bs4 import BeautifulSoup as bs
	import datetime

	# Wall-clock start; the final status line reports the elapsed time from here.
	starter = datetime.datetime.now()

	outfile = "providers.csv"

	base = "http://elcpinellas.net/"
	list_base = "%sprovider_searchresults.php?action=search&YearAvailable=2013" % base
	detail_base = "%sdirectory_details.php?ProviderID=" % base

	# Fetch the search-results page once. Raise on an HTTP error status so a
	# failed request surfaces as such instead of as an AttributeError on the
	# missing #right-content div below.
	list_response = requests.get(list_base)
	list_response.raise_for_status()
	# NOTE(review): no parser is named, so bs4 uses whichever one is installed;
	# the '<br>' handling below presumably depends on that choice -- confirm
	# before pinning a parser.
	soup = bs(list_response.text)
	# Each provider is one <li> of the first <ul> inside the #right-content div.
	entries = soup.find('div', attrs={'id': 'right-content'}).find('ul').findAll('li')

	# pdict maps ProviderID (string taken from the link) -> dict of its fields.
	pdict = {}
	for entry in entries:
	    # The ID is whatever follows 'ProviderID=' in the entry's link target.
	    ID = entry.a['href'].split('ProviderID=')[1]
	    pdict[ID] = {}
	    pd = pdict[ID]
	    pd['detail_page'] = "%s%s" % (detail_base, ID)
	    pd['name'] = entry.a.text
	    # Address: the markup between the first and second literal '<br>'.
	    pd['address'] = str(entry).split('<br>')[1]
	    # City and zip: text of the last <br> tag found, split on ', FL '.
	    cityline = entry.findAll('br')[-1].text.split(', FL ')
	    pd['city'] = cityline[0]
	    pd['zip'] = cityline[1]
	    pd['id'] = ID

	# Parenthesised single-argument form prints the same text on Python 2 and 3.
	print("picked up %s records from the list page; now getting details" % len(pdict))

	# (label substring, record key) pairs for the rows of the detail table.
	# Kept as an ordered list and tested exhaustively for every row, because the
	# labels are matched by substring and a row may match more than one of them.
	detail_fields = [
	    ('Contact Name', 'contact'),
	    ('Phone', 'phone'),
	    ('Hours 540', 'hours 540'),
	    ('Provider Type', 'provider_type'),
	    ('Fall', 'Fall_readiness'),
	    ('Summer', 'Summer_readiness'),
	    ('Hours Begin', 'daily_begin_time'),
	    ('Hours End', 'daily_end_time'),
	    ('Program Begins', 'program_start_date'),
	    ('Program Ends', 'program_end_date'),
	    ('Weekly Hours', 'weekly_hours'),
	]

	# Visit every provider's detail page and copy the matching form values
	# into that provider's record in pdict.
	for provider_id in pdict:
	    provider = pdict[provider_id]
	    psoup = bs(requests.get(provider['detail_page']).text)
	    # Only the first <table> on the page is read.
	    tables = psoup.findAll('table')
	    bigt = tables[0]
	    for row in bigt.findAll('tr'):
	        cells = row.findAll('td')
	        # A field row needs a label cell and a value cell; rows with fewer
	        # <td> cells would otherwise raise IndexError.
	        if len(cells) < 2:
	            continue
	        label = cells[0].text
	        for marker, field in detail_fields:
	            if marker in label:
	                # The value is the 'value' attribute of the cell's <input>.
	                provider[field] = cells[1].input['value']

	# Fixed column order for the CSV; these are also the keys read from every
	# provider record when its row is written.
	headers = [
	    'name',
	    'id',
	    'city',
	    'zip',
	    'address',
	    'detail_page',
	    'contact',
	    'phone',
	    'hours 540',
	    'provider_type',
	    'Fall_readiness',
	    'Summer_readiness',
	    'daily_begin_time',
	    'daily_end_time',
	    'program_start_date',
	    'program_end_date',
	    'weekly_hours',
	]

	# Write one header row, then one row per provider in pdict.
	with open(outfile, 'w') as f:
	    wr = csvkit.CSVKitWriter(f)
	    wr.writerow(headers)
	    for provider in pdict.values():
	        # A record only has the detail keys whose rows were found on its
	        # page; write a blank cell for the rest instead of aborting the
	        # whole export with a KeyError.
	        wr.writerow([provider.get(column, '') for column in headers])

	elapsed = datetime.datetime.now() - starter
	# Parenthesised single-argument form prints the same text on Python 2 and 3.
	print("exported %s rows to %s; it took %s" % (len(pdict), outfile, elapsed))
No results found