Skip to content

Instantly share code, notes, and snippets.

@macloo
Created March 31, 2019 22:22
Show Gist options
  • Select an option

  • Save macloo/79d2b1649254184a4eb611eec09f04e3 to your computer and use it in GitHub Desktop.

Select an option

Save macloo/79d2b1649254184a4eb611eec09f04e3 to your computer and use it in GitHub Desktop.
Collect the details for each Florida representative by scraping two pages per member
import requests
from bs4 import BeautifulSoup
import time
# Root of the Florida House site; joined with the relative detail/contact paths below.
base_url = "https://www.myfloridahouse.gov"
# using a small list for testing
# Each entry is a relative URL to one representative's detail page
# (MemberId identifies the rep; LegislativeTermId=88 pins the term).
small_list = [
"/Sections/Representatives/details.aspx?MemberId=4684&LegislativeTermId=88",
"/Sections/Representatives/details.aspx?MemberId=4624&LegislativeTermId=88",
"/Sections/Representatives/details.aspx?MemberId=4658&LegislativeTermId=88",
"/Sections/Representatives/details.aspx?MemberId=4686&LegislativeTermId=88"
]
# Browser-like request headers so the site serves the normal HTML page
# instead of blocking what looks like a bot.
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive'}
def clean(text):
    """Collapse all whitespace runs in *text* to single spaces.

    Accepts any object (it is stringified first) and returns a str with
    leading/trailing whitespace removed and internal runs of spaces,
    tabs, and newlines squeezed to one space.
    """
    # NOTE: parameter renamed from `input`, which shadowed the builtin;
    # all in-file callers pass positionally.
    return " ".join(str(text).split())
# this is the main function, loops through all reps
def get_all_reps():
    """Fetch the detail page for every rep URL in ``small_list`` and
    print name, district, address, and phone for each.

    Side effects: performs HTTP GETs against ``base_url``, prints to
    stdout, and sleeps 1 second between reps to be polite to the server.
    """
    # One Session for the whole run so keep-alive / connection pooling
    # actually helps (the original built a new Session per iteration).
    session = requests.Session()
    for rep in small_list:
        url = base_url + rep
        req = session.get(url, headers=hdr)
        bs = BeautifulSoup(req.text, "html5lib")
        # get name, district, contact_url - in a list
        main_list = get_main_details(bs)
        # get address, phone - in a list
        contact_list = get_contact(main_list[2])
        # print the rep to see if it worked - testing only
        # not writing the csv yet
        print(main_list[0], main_list[1], contact_list[0],
              contact_list[1])
        # print a blank line
        print()
        # pause for 1 second each loop
        time.sleep(1)
# stuff from one rep's main page
def get_main_details(bs):
    """Extract [name, district, contact_url] from a rep's parsed detail page.

    *bs* is a BeautifulSoup document of the detail page. ``name`` and
    ``district`` fall back to the string "None" if the expected tags are
    missing; a missing sidenav/contact link still raises (the caller
    cannot proceed without contact_url).
    """
    # get contact page URL from the side navigation
    contact = bs.find('div', {'id': 'sidenav'})
    contact_url = contact.find('a').attrs['href']
    # get name and district; narrow except clauses replace the original
    # bare ``except:`` — AttributeError is what find() returning None raises.
    try:
        name = clean(bs.find('h1').get_text().strip())
    except AttributeError:
        name = "None"
    try:
        district = clean(bs.find('h2').find_next('h2').get_text().strip())
    except AttributeError:
        district = "None"
    return [name, district, contact_url]
# stuff from one rep's contact page
def get_contact(contact_url):
    """Fetch a rep's contact page and return [address, phone].

    *contact_url* is the relative path scraped from the detail page.
    Both fields come back as the string 'None' when the expected
    ``<address>`` markup is absent.
    """
    page = requests.Session().get(base_url + contact_url, headers=hdr)
    soup = BeautifulSoup(page.text, "html5lib")
    # Address is the first <li> inside the <ul> under <address>;
    # the phone number sits in the <li> that follows it.
    try:
        first_item = soup.find('address').find('ul').find('li')
        address = clean(first_item.get_text().strip())
        phone = clean(first_item.find_next('li').get_text())
    except AttributeError:
        # Any link in the chain returned None — page layout differs.
        address = 'None'
        phone = 'None'
    return [address, phone]
# run the main function, but only when executed as a script —
# the guard keeps imports of this module from firing network requests
if __name__ == "__main__":
    get_all_reps()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment