Created
March 31, 2019 22:22
-
-
Save macloo/79d2b1649254184a4eb611eec09f04e3 to your computer and use it in GitHub Desktop.
Collect details for each Florida House representative by scraping two pages per member: the main detail page and the contact page.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from bs4 import BeautifulSoup | |
| import time | |
# Root of the Florida House site; every path below is appended to this.
base_url = "https://www.myfloridahouse.gov"
# Small hard-coded list of member detail pages, for testing only;
# a full run would iterate the complete member list instead.
small_list = [
    "/Sections/Representatives/details.aspx?MemberId=4684&LegislativeTermId=88",
    "/Sections/Representatives/details.aspx?MemberId=4624&LegislativeTermId=88",
    "/Sections/Representatives/details.aspx?MemberId=4658&LegislativeTermId=88",
    "/Sections/Representatives/details.aspx?MemberId=4686&LegislativeTermId=88"
]
# Browser-like request headers sent with every request so the site
# serves the scraper the same HTML a desktop browser would get.
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
# get rid of extra spaces
def clean(text):
    """Collapse all runs of whitespace in *text* into single spaces.

    Any object is accepted: it is passed through str() first, so
    non-string input is stringified before cleaning. Leading and
    trailing whitespace is removed as a side effect of split().
    """
    # NOTE: parameter renamed from `input`, which shadowed the builtin.
    return " ".join(str(text).split())
# this is the main function, loops through all reps
def get_all_reps():
    """Scrape every rep in small_list and print the collected fields.

    For each rep: fetch the main detail page, pull name/district and
    the contact-page URL, then fetch the contact page for address and
    phone. Output is print-only for now — CSV writing is not done yet.
    """
    # One Session reused for every request: keeps the connection alive
    # instead of opening a fresh one per rep (the original re-created
    # the Session inside the loop).
    session = requests.Session()
    for rep in small_list:
        url = base_url + rep
        req = session.get(url, headers=hdr)
        bs = BeautifulSoup(req.text, "html5lib")
        # [name, district, contact_url]
        main_list = get_main_details(bs)
        # [address, phone]
        contact_list = get_contact(main_list[2])
        # print the rep to see if it worked - testing only,
        # not writing the csv yet
        print(main_list[0], main_list[1], contact_list[0],
              contact_list[1])
        # print a blank line between reps
        print()
        # be polite to the server: pause 1 second each loop
        time.sleep(1)
# stuff from one rep's main page
def get_main_details(bs):
    """Extract [name, district, contact_url] from a rep's main page.

    bs -- BeautifulSoup of the member detail page.
    name and district fall back to the string "None" (kept as a string
    for the eventual CSV output) when the expected tags are missing.
    """
    # get contact page URL: first link in the side navigation
    contact = bs.find('div', {'id': 'sidenav'})
    contact_url = contact.find('a').attrs['href']
    # get name and district; catch only AttributeError (a missing tag
    # returns None) instead of the original bare except:, which also
    # swallowed KeyboardInterrupt and real bugs
    try:
        name = clean(bs.find('h1').get_text().strip())
    except AttributeError:
        name = "None"
    try:
        # district sits in the second h2 on the page
        district = clean(bs.find('h2').find_next('h2').get_text().strip())
    except AttributeError:
        district = "None"
    return [name, district, contact_url]
# stuff from one rep's contact page
def get_contact(contact_url):
    """Fetch one rep's contact page and return [address, phone].

    contact_url -- site-relative path from get_main_details().
    Both values fall back to the string 'None' when the expected
    markup (an <address> with a <ul> of <li> items) is missing.
    """
    page = requests.Session().get(base_url + contact_url, headers=hdr)
    soup = BeautifulSoup(page.text, "html5lib")
    try:
        # address is the first <li>, phone is the one right after it
        first_item = soup.find('address').find('ul').find('li')
        address = clean(first_item.get_text().strip())
        phone = clean(first_item.find_next('li').get_text())
    except AttributeError:
        address = 'None'
        phone = 'None'
    return [address, phone]
# Run the scraper only when executed as a script, so importing this
# module (e.g. to reuse clean() or get_contact()) has no side effects.
if __name__ == "__main__":
    get_all_reps()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment