Created
March 31, 2019 22:22
-
-
Save macloo/79d2b1649254184a4eb611eec09f04e3 to your computer and use it in GitHub Desktop.
Collect details for each Florida House representative by scraping two pages per member: the main detail page and the contact page.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from bs4 import BeautifulSoup | |
| import time | |
# Root of the Florida House site; every path below is appended to this.
base_url = "https://www.myfloridahouse.gov"
# Small hard-coded list of member detail pages, for testing only;
# a full run would iterate the complete member list instead.
small_list = [
    "/Sections/Representatives/details.aspx?MemberId=4684&LegislativeTermId=88",
    "/Sections/Representatives/details.aspx?MemberId=4624&LegislativeTermId=88",
    "/Sections/Representatives/details.aspx?MemberId=4658&LegislativeTermId=88",
    "/Sections/Representatives/details.aspx?MemberId=4686&LegislativeTermId=88"
]
# Browser-like request headers sent with every request so the site
# serves the scraper the same HTML a desktop browser would get.
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
# get rid of extra spaces
def clean(text):
    """Collapse all runs of whitespace in *text* into single spaces.

    Any object is accepted: it is passed through str() first, so
    non-string input is stringified before cleaning. Leading and
    trailing whitespace is removed as a side effect of split().
    """
    # NOTE: parameter renamed from `input`, which shadowed the builtin.
    return " ".join(str(text).split())
# this is the main function, loops through all reps
def get_all_reps():
    """Scrape every rep in small_list and print the collected fields.

    For each rep: fetch the main detail page, pull name/district and
    the contact-page URL, then fetch the contact page for address and
    phone. Output is print-only for now — CSV writing is not done yet.
    """
    # One Session reused for every request: keeps the connection alive
    # instead of opening a fresh one per rep (the original re-created
    # the Session inside the loop).
    session = requests.Session()
    for rep in small_list:
        url = base_url + rep
        req = session.get(url, headers=hdr)
        bs = BeautifulSoup(req.text, "html5lib")
        # [name, district, contact_url]
        main_list = get_main_details(bs)
        # [address, phone]
        contact_list = get_contact(main_list[2])
        # print the rep to see if it worked - testing only,
        # not writing the csv yet
        print(main_list[0], main_list[1], contact_list[0],
              contact_list[1])
        # print a blank line between reps
        print()
        # be polite to the server: pause 1 second each loop
        time.sleep(1)
# stuff from one rep's main page
def get_main_details(bs):
    """Extract [name, district, contact_url] from a rep's main page.

    bs -- BeautifulSoup of the member detail page.
    name and district fall back to the string "None" (kept as a string
    for the eventual CSV output) when the expected tags are missing.
    """
    # get contact page URL: first link in the side navigation
    contact = bs.find('div', {'id': 'sidenav'})
    contact_url = contact.find('a').attrs['href']
    # get name and district; catch only AttributeError (a missing tag
    # returns None) instead of the original bare except:, which also
    # swallowed KeyboardInterrupt and real bugs
    try:
        name = clean(bs.find('h1').get_text().strip())
    except AttributeError:
        name = "None"
    try:
        # district sits in the second h2 on the page
        district = clean(bs.find('h2').find_next('h2').get_text().strip())
    except AttributeError:
        district = "None"
    return [name, district, contact_url]
# stuff from one rep's contact page
def get_contact(contact_url):
    """Fetch one rep's contact page and return [address, phone].

    contact_url -- site-relative path from get_main_details().
    Both values fall back to the string 'None' when the expected
    markup (an <address> with a <ul> of <li> items) is missing.
    """
    page = requests.Session().get(base_url + contact_url, headers=hdr)
    soup = BeautifulSoup(page.text, "html5lib")
    try:
        # address is the first <li>, phone is the one right after it
        first_item = soup.find('address').find('ul').find('li')
        address = clean(first_item.get_text().strip())
        phone = clean(first_item.find_next('li').get_text())
    except AttributeError:
        address = 'None'
        phone = 'None'
    return [address, phone]
# Run the scraper only when executed as a script, so importing this
# module (e.g. to reuse clean() or get_contact()) has no side effects.
if __name__ == "__main__":
    get_all_reps()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment