
@dannguyen
Last active August 23, 2022 05:21
Simple scraper of the ASPX search form for U.S. Congress House financial disclosure results

The following script, given someone's last name, prints a CSV of financial disclosure PDFs (the first 20, for simplicity's sake) as found on the House Financial Disclosure Reports site. It's meant to be a proof-of-concept of how to scrape ASPX (and other "stateful") websites using plain old requests -- without too much inconvenience -- rather than resorting to something heavy like the Selenium webdriver.

The search page can be found here: http://clerk.house.gov/public_disc/financial-search.aspx

Here's a screenshot of what it does when you search via web browser:

screenshot of disclosure search for "king"

I've attached an example of what the form data payload looks like to this gist: sample-form-data-post-response-txt

Sample usage of script

You can download the script attached to this gist and run it like this -- the first argument is presumed to be the "last name":

$  python house-public-disc-simple-search.py 'king'

The output looks like this:

Searching for last name of: `king` ...
name,office,filing_year,filing,url
"KING, HON.PETER T.",NY03,2007,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2008/8135888.pdf
"KING, HON.PETER T.",NY03,2007,FD Amendment,http://clerk.house.gov/public_disc/financial-pdfs/2008/8138301.pdf
"KING, HON.PETER T.",NY03,2008,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2009/8140250.pdf
"KING, HON.PETER T.",NY03,2009,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2010/8147290.pdf
"KING, HON.PETER T.",NY03,2010,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2011/8202610.pdf
"KING, HON.PETER T.",NY03,2011,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2012/8205535.pdf
"KING, HON.PETER T.",NY02,2012,FD Amendment,http://clerk.house.gov/public_disc/financial-pdfs/2013/8212617.pdf
"KING, HON.PETER T.",NY02,2012,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2013/8212125.pdf
"King, Mr.Peter T.",NY02,2013,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2013/10001239.pdf
"King, Hon.Peter T.",NY02,2014,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2014/10007122.pdf
"King, Hon.Peter T.",NY02,2015,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2015/10012142.pdf
"King, Hon.Peter T.",NY02,2016,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2016/10016910.pdf
"King, Hon.Peter T.",NY02,2017,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2017/10022302.pdf
"KING, HON.STEVE",IA05,2007,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2008/8135889.pdf
"KING, HON.STEVE",IA05,2008,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2009/8140601.pdf
"KING, HON.STEVE",IA05,2009,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2010/8147289.pdf
"KING, HON.STEVE",IA05,2010,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2011/8202611.pdf
"KING, HON.STEVE",IA05,2011,FD Original,http://clerk.house.gov/public_disc/financial-pdfs/2012/8206497.pdf
"KING, HON.STEVE",IA05,2011,Extension,http://clerk.house.gov/public_disc/financial-pdfs/2012/8206900.pdf
"KING, HON.STEVE",IA05,2011,Extension,http://clerk.house.gov/public_disc/financial-pdfs/2012/8206933.pdf

About the script

(I've put most of the technical details as a clump of comments in the actual script below)

The most annoying part of this is having to do the web search the old-fashioned way -- i.e. by pointing-and-clicking in your browser -- and inspecting the traffic via the Network panel when you submit the form.

(Tip: it's recommended to disable JavaScript and the cache for more predictable results)

[screenshot: devtools Network panel while submitting the search form]

In the Network panel, click on the first request (to financial-search.aspx) to see the details, such as the headers:

[screenshot: request headers for the POST to financial-search.aspx]

Scroll down to the Form Data panel to see all the actual form parameters sent in the POST request. With some trial and error, you'll see what the website requires at a minimum to trigger a successful response:

[screenshot: Form Data panel showing the POST parameters]
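To make that payload-building step concrete without hitting the live site, here's a self-contained sketch of the same technique the script uses -- harvesting a form's current input values into a dict, dropping the Clear button, and filling in the last name. The HTML snippet is a truncated stand-in for the real page, which carries many more hidden fields (e.g. `__VIEWSTATE`, `__EVENTVALIDATION`):

```python
from bs4 import BeautifulSoup

# A static stand-in for the live search page's <form>; the real page
# contains many more hidden inputs that get harvested the same way.
HTML = '''
<form id="aspnetForm" method="post" action="./financial-search.aspx">
  <input type="hidden" name="__VIEWSTATEGENERATOR" value="43F3FF2D">
  <input type="text" name="ctl00$cphMain$txbLast_nm" value="">
  <input type="submit" name="ctl00$cphMain$btnClear" value="Clear">
  <input type="submit" name="ctl00$cphMain$btnSearch" value="Search">
</form>
'''

def build_payload(html, last_name):
    soup = BeautifulSoup(html, 'html.parser')
    form = soup.select_one('form#aspnetForm')
    # harvest every input/select's current value as the default payload
    params = {i['name']: i.get('value') or '' for i in form.select('input, select')}
    # the Clear button must NOT be submitted, or the search resets
    params.pop('ctl00$cphMain$btnClear', None)
    params['ctl00$cphMain$txbLast_nm'] = last_name
    return params

payload = build_payload(HTML, 'king')
```

The resulting dict is exactly what gets handed to `requests.post(SEARCH_URL, data=payload)` in the script below.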

"""
house-public-disc-simple-search.py
gist: https://gist.github.com/dannguyen/994bfe5a4a1e9ba6c73f21046e31e86c
Simple scraper of the ASPX search form for U.S. Congress House financial disclosure results
---------------
The following script, given someone's last name, prints a CSV of
financial disclosure PDFs (the first 20, for simplicity's sake) as
found on the House Financial Disclosure Reports:
http://clerk.house.gov/public_disc/financial-search.aspx
Command-line usage:
$ house-public-disc-simple-search.py king
Background
----------
This is an example of how to use plain ol' requests and html parsing
to scrape an ASPX site that does **not** preserve state
via cookies or something AJAXy.
We can deal with these (basically) stateless sites by first performing a
get request, and getting the HTML form. The state values are directly
encoded into the HTML, e.g.
<form name="aspnetForm" method="post" action="./financial-search.aspx" id="aspnetForm">
<input type="hidden" name="ctl00_cphMain_ToolkitScriptManager1_HiddenField" id="ctl00_cphMain_ToolkitScriptManager1_HiddenField" value="">
<input id="ctl00_cphMain_tabCont_ClientState" name="ctl00_cphMain_tabCont_ClientState" type="hidden" value='{"ActiveTabIndex":0,"TabState":[true,true,true]}'/>,
<input id="__VIEWSTATEGENERATOR" name="__VIEWSTATEGENERATOR" type="hidden" value="43F3FF2D"/>,
...
</form>
We can use BeautifulSoup/lxml to scrape these hardcoded values and fill populate a
dict, which we then send via requests.post
The only trick is that you need to manually visit the site and do a query
the old-fashioned way, with the devtools Network panel open. This allows you
to see the payload sent by the POST request. In the case of the House Financial Disclosure
Reports site, we have to make sure:
- to send a last name value (e.g. 'King') for the `ctl00$cphMain$txbLast_nm` param
- to *not* submit a value for `ctl00$cphMain$btnClear` param
"""
# http://docs.python-requests.org/en/master/user/advanced/
from bs4 import BeautifulSoup
from csv import DictWriter
import re
import requests
from sys import argv, stdout, stderr
from urllib.parse import urljoin

SEARCH_URL = 'http://clerk.house.gov/public_disc/financial-search.aspx'
TABLE_FIELDNAMES = ['name', 'office', 'filing_year', 'filing']


def _clean_text(txt):
    return re.sub(r'\s+', ' ', txt).strip()


def _search_by_lastname(last_name):
    get_resp = requests.get(SEARCH_URL)
    soup = BeautifulSoup(get_resp.text, 'lxml')
    form = soup.select_one('form#aspnetForm')
    # get all the input/select elements
    inputs = form.select('input, select')
    # create a dict that contains the default values for the input fields, as
    # they currently exist on the form. This is just a quickie way of populating
    # all the annoying fields like '__VIEWSTATE' and '__EVENTVALIDATION'
    myparams = {i['name']: i.get('value') or '' for i in inputs}
    # We *don't* want to submit the value for the Clear button
    myparams.pop('ctl00$cphMain$btnClear')
    # finally, we manually set the last name field
    myparams['ctl00$cphMain$txbLast_nm'] = last_name
    # submit the post request
    post_resp = requests.post(SEARCH_URL, data=myparams)
    return post_resp


def _parse_results(resp):
    # scrape the first table of results (1 through 20)
    soup = BeautifulSoup(resp.text, 'lxml')
    rows = soup.select('#search_results tr')[1:-1]
    results = []
    for row in rows:
        cells = row.select('td')
        d = {name: _clean_text(cells[idx].text)
             for idx, name in enumerate(TABLE_FIELDNAMES)}
        d['url'] = urljoin(SEARCH_URL, row.select('a')[0]['href'])
        results.append(d)
    return results


def scrape_by_last_name(last_name):
    resp = _search_by_lastname(last_name)
    data = _parse_results(resp)
    return data


if __name__ == '__main__':
    # assume first argument is the "last name"
    last_name = argv[1]
    stderr.write("Searching for last name of: `{}` ...\n".format(last_name))
    # output to stdout in CSV form
    outs = DictWriter(stdout, fieldnames=TABLE_FIELDNAMES + ['url'])
    outs.writeheader()
    outs.writerows(scrape_by_last_name(last_name))
sample-form-data-post-response-txt

ctl00_cphMain_ToolkitScriptManager1_HiddenField: ;;AjaxControlToolkit, Version=3.5.40412.0, Culture=neutral, PublicKeyToken=28f01b0e84b6d53e:en-US:1547e793-5b7e-48fe-8490-03a375b13a33:de1feab2:f9cec9bc:a0b0f951:a67c2700
ctl00_cphMain_tabCont_ClientState: {"ActiveTabIndex":0,"TabState":[true,true]}
__EVENTTARGET:
__EVENTARGUMENT:
__VIEWSTATE: NWh3sI46WtmQT7PcIfjKWkyKp30kiLIjCvA1GfKwwi6cRhc+3Op+6Yo+idaDWgk5rd1GUT8ZvS/RxP9iQs1C/YvLInRFlZQX1bSlFfaW3HkWKpfI4DAYtVht4EXQDDcGrYbrVhLfteWjWR7a/QR+sdJHC4Ls98DRT/OtsyAsfRQYqrmH9tRKdl3m1XSksS/HswO3YgN75WbDRyuKGn9nMjeRaQ8Xgzm182Vwc9xdqHQSWLZ/V/eMbHBDG7G6E6DDcGZ7DPsna3sZipz4aLHZcpwl9WGzy6i5P/2H6bm7TpEE3m5KIeWl/elx46G3FS00WL5bIeMcIjDjtjSeEpr0543QPKjozpd1e3cKL2Mtm9L2dPmctDo6TYY+8OppWI7D01KHn+Jd87+5So0Gc7Lm++GpLqz13Ffr7s5lJDZDQ5rE2gB+wPdHDbUXaHpm/BYfTaU+FjzDGNGABhuYQrp9P7tSGJR+wLOcDUoq+v736LUHeFGZF2GNz6ZY/omvaaMk4+dGka29RToOXtbrR532pE+dcbgiJlM9qzmAP1wLlnQ7OtpWHoJwsRqoqjhHxXkAgshykqbPgHKO0GPOnSs2Mcd6l1kWwrbPdSuBZtmoGkN0pbS9v9s9l0iAnNlNKOm4RezU+V8ryxzhrKo/JOMW5oQBoMMscu8sAzAXq8gVVbpx6FzVLIb4C5qTj1WiwlGp7K2u8G1L4VgdoKoOWwHaifO9/7aDJ435AIvZ0CR0DM5TIZQ0xnU53KaIJ6ZIYBoBdyNRRkLZ4fw3ciTe2LmWwKFfoVDh+0e8btlXtQ+lf4pVZ3f/
__VIEWSTATEGENERATOR: 43F3FF2D
__SCROLLPOSITIONX: 0
__SCROLLPOSITIONY: 0
__VIEWSTATEENCRYPTED:
__EVENTVALIDATION: VYqMycNqnGPNu8K4XomrNa1ma55Xo+1P6YtUnwDXDUuESZZHEoQxsI2i80UOog+bSlKGuEpg/UkrXZnZA30YvkNygn5D6MXk++WeJAQZ2isX5pqtQ2J5cjf7aILqkiG8mNgyGSA8pibD4fsEzxn/6QfEwjQTwdl2afjLQk1SEACQf3++iYcmTxUpsyYByA/tpKDMv6WhMpCADMOC6kDXO/Knyvphl2EaxyLC77JBJ+35PAxa9wnMe28zPKMRpE4FxQ94M+O7zPtXyGONPZOAr/OMT0xT3LuufsYmyuDn22mcN4pQxPsPw8pJg9JdQoR+rRFSxjlrdYRIfJ8vcot+csjqB6fZ+DYBTJKvKNb7MmCRrI4PdNhdZVQZnyCa3W/ZZWbvDKuOGUs6Fvem58PAUH2w09zGf7D3GGd4YAA3t+rq8XMYkRViH61o9+2i8jr5tKDNsZvs+KHd0JEEO3uoJFeQscTWnOKQyeA2mFqZSI069yajcYU78YDKiplFLOn3JX99mtGbk7FUYaH0zqYdOJqHTxmDcf3m5d8uR/+XEMURV798QMLL3V42WvB2K3QaOYpqpQmc8GvWATLTfxsWYyxo+amrPtHcBpY3JGm+4TAhFkM+0ukqXKsq0AnlAGO/3OPYfJvWZvHB9VqFNewa+32wGXiGgFva+yCYiQHGKUFDIA2r3/MTUUxx8tIorDcjnmXU5pZYSVv03+gmtMHoErpxR4DMFcZon4U8LDg+JUHDmmPAykwYMZMdCSTwM5+7CxY17IGezTXmZg63lUmP0aS/dt3b4K3PmFDhJAyAz7PibSrx5tiUN2kM/qfWK1kc2+p2wnNtPnKp/Un8lDq19N4Vw3NpGRqB6B3Fv6EdnOt3CTi46jfR//u3GGnduTEESa47Dh7cF4pdmoryZbjvKp3nh2O0dwm6FGjJV6vch/uZ+QASA14uhnePFGsKeNbqSLz2/6Q/me2gj/f1FcpORbqxULEa/TN34PhjsaFeSwwUyOm6ez/rKETTVWoRdUSBTR8QUSrppCyvapSXgRRwZwH/PaWJ6fHb23mtQFUgjt/5c169M+Fy06AJMCkPeBh+E4V4BG0AAOXKptt3hZ+//8fzaEKF9F96DVHgu2RSrQ6i7HI7o7rJ/SHg4djCnsV6iYEEi1+o8z7P3hpxWmoY6iE8pb6Zmbrqp7xRC6SWuvRK6tSVdP+QuoUyIBG3O65rNEJsb2bps0dsMrA+oKa/4gidb+2N68obIH5OFU1hIphzy0xkkLx80VRiOVHRl1Y4yVvUlSVVj53xGohH7+LXvSEGgGC1uwMZ8xQKRKjvqBfiQC9Q4TLIs5ZYUCAWt/KtN7kNdYKT+MDGVihLYF6kcVBKGdPGmabXV/NhHQ1fCdemvrGuUrUABKjFx6mAplpOAnHqhDgQdc2RnlFcJfOWIofl4N4jBFVqo3bcrjV2d6HPyWjsNDWcmMAu2Xre+EBZ49y7g20Jn4ShctbU727UY/EloGvTYuBamqunuww//9HUDuFtu91hntE2NWjKZUpWmt1WYIfJEmOBNQMXrjRJBl6YSU5Oe03YhVKmOF3MyZTxfgAtxyVzp+Uo8aBVOl+Z50p6doSGbX7EUnmH0ciAR0xI89cTD0Eu4S64fBxPIgLMBqJhfr7a+yNVhrfdCs7/YTW+xHddqTqB3G5xQqkk66NFvdYLnVP46yf7E5Dtf4/GJc9jMZMzcRud6dWRx3sgs1hNZkpW+qsub2IRMS7odHwGJtI=
ctl00$txbSearchBox:
ctl00$cphMain$txbLast_nm: king
ctl00$cphMain$txbFiling_year:
ctl00$cphMain$ddlState:
ctl00$cphMain$txbDistrict_cd:
ctl00$cphMain$btnSearch: Search