Skip to content

Instantly share code, notes, and snippets.

@AlJohri
Last active October 10, 2017 05:27
Show Gist options
  • Select an option

  • Save AlJohri/80b5c4a55ddfc04eb2a24a413a0b79cd to your computer and use it in GitHub Desktop.

Select an option

Save AlJohri/80b5c4a55ddfc04eb2a24a413a0b79cd to your computer and use it in GitHub Desktop.
import re
import datetime
import requests
import lxml.html
def strip_within(s):
    """Collapse each run of consecutive space characters in ``s`` to one space.

    Only the literal space character is affected; tabs and newlines are left
    untouched. Leading and trailing runs are reduced to a single space, not
    removed.
    """
    space_run = re.compile(' +')
    collapsed = space_run.sub(' ', s)
    return collapsed
def get_nomination(s, congress, number):
    """Fetch one nomination page from congress.gov and parse it into a dict.

    Args:
        s: Session-like object whose ``get(url)`` returns a response exposing
            ``content`` and ``raise_for_status()`` (the script passes a
            ``requests.Session``).
        congress: Congress number placed in the URL path (e.g. ``115``).
        number: Nomination (PN) number placed in the URL path (e.g. ``54``).

    Returns:
        A dict holding the title fields ``number``, ``nominee``,
        ``nominator`` and ``congress``, plus ``overview`` (dict keyed by the
        normalized overview headings) and ``actions`` (list of dicts with an
        ISO ``date`` and an ``action`` dict of ``text`` and ``links``).

    Raises:
        Exception: If the page heading does not match the expected
            ``PN<number> — <nominee> — <nominator>`` / ``<N>th Congress``
            patterns.
        IndexError: If an expected element (heading, date cell, action cell)
            is missing from the page.
        ValueError: If an action date is not in ``MM/DD/YYYY`` form.

        Whatever ``response.raise_for_status()`` raises for a 4xx/5xx reply
        (``requests.exceptions.HTTPError`` when ``s`` is a requests session)
        also propagates.
    """
    url = f"https://www.congress.gov/nomination/{congress}/{number}"
    response = s.get(url)
    # Fail fast on an HTTP error instead of trying to parse an error page,
    # which would otherwise surface as an unrelated IndexError further down.
    response.raise_for_status()
    doc = lxml.html.fromstring(response.content)

    def parse_title(doc):
        """Pull number, nominee, nominator and congress out of the <h1>."""
        # ``.text`` is only the heading text that precedes the child <span>.
        title = strip_within(doc.cssselect("#content h1")[0].text.strip())
        match1 = re.match(r"^PN(?P<number>\d+) — (?P<nominee>.*) — (?P<nominator>.*)$", title)
        if not match1:
            raise Exception("unable to parse nominee from title")
        group1 = match1.groupdict()
        group1['number'] = int(group1['number'])

        match2 = re.match(r"(?P<congress>\d+)\w{2} Congress", doc.cssselect("#content h1 span")[0].text)
        if not match2:
            raise Exception("unable to parse congress from title")
        group2 = match2.groupdict()
        group2['congress'] = int(group2['congress'])

        return {**group1, **group2}

    def parse_overview(doc):
        """Map each ``.overview h2`` heading to the content that follows it."""

        def convert_header(header):
            # 'Committee' holds a list, so it gets a plural key; every other
            # heading becomes lower_snake_case.
            if header == 'Committee':
                return 'committees'
            return header.lower().replace(' ', '_')

        def parse_element(header, element):
            # The value lives in the sibling element right after the <h2>.
            if header == 'Committee':
                # text_content() instead of .text: .text is None when the
                # <li> wraps its label in a child tag (e.g. a link), which
                # would make .strip() raise AttributeError.
                return [x.text_content().strip()
                        for x in element.getnext().cssselect('li')]
            return strip_within(element.getnext().text_content().strip())

        headers = {x.text: x for x in doc.cssselect('.overview h2')}
        return {
            convert_header(header): parse_element(header, element)
            for header, element in headers.items()
        }

    def parse_actions(doc):
        """Return one dict per row of the actions table, in page order."""
        actions = []
        for row in doc.cssselect("table.item_table > tbody > tr"):
            # Strip before parsing: strptime rejects surrounding whitespace.
            date_text = row.cssselect('td.date')[0].text_content().strip()
            date = datetime.datetime.strptime(date_text, '%m/%d/%Y')
            action = {
                'date': date.strftime('%Y-%m-%d'),
                'action': {
                    'text': row.cssselect('td.actions')[0].text_content().strip(),
                    # text_content() keeps link text that sits inside nested
                    # tags and never returns None, matching how 'text' above
                    # is extracted.
                    'links': [{"href": a.get('href'), "text": a.text_content().strip()}
                              for a in row.cssselect('td.actions a')]
                },
            }
            actions.append(action)
        return actions

    return {
        **parse_title(doc),
        "overview": parse_overview(doc),
        "actions": parse_actions(doc)
    }
def get_nominations(congress, session=None):
    """Request the congress.gov search page of nominations for one Congress.

    Args:
        congress: Congress number; it is converted to a string for the query.
        session: Optional object with a requests-style
            ``get(url, params=...)`` method. When omitted, the module-level
            ``s`` (created by the ``__main__`` block) is used, so existing
            ``get_nominations(115)`` calls keep working.

    Returns:
        Whatever ``get`` returns (a ``requests.Response`` when a
        ``requests.Session`` is used). The response is not parsed here.
    """
    # Local import: the file only imports json inside the __main__ block.
    import json

    if session is None:
        # Fall back to the script-level session the original relied on.
        session = s
    query = {"source": "nominations", "congress": str(congress)}
    # Serialize the query as JSON and let the session URL-encode it.
    # Interpolating the dict into an f-string would send Python's repr
    # (single-quoted, unencoded) instead of a JSON document.
    # NOTE(review): assumes the search endpoint expects JSON in ``q`` — confirm.
    return session.get(
        "https://www.congress.gov/search",
        params={"q": json.dumps(query, separators=(",", ":"))},
    )
if __name__ == "__main__":
    import json

    # Script entry point: fetch nomination PN54 of the 115th Congress and
    # print the parsed result as indented JSON.
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"

    # ``s`` stays a module-level name because get_nominations() reads it.
    s = requests.Session()
    # Assignment replaces the session's header mapping with this single entry.
    s.headers = {"User-Agent": user_agent}

    # response = get_nominations(115)
    nomination = get_nomination(s, 115, 54)
    print(json.dumps(nomination, indent=4))
{
"number": 54,
"nominee": "Mick Mulvaney",
"nominator": "Executive Office of the President",
"congress": 115,
"overview": {
"description": "Mick Mulvaney, of South Carolina, to be Director of the Office of Management and Budget, vice Shaun L. S. Donovan, resigned.",
"nominees": "One nomination, beginning with Mick Mulvaney and ending with Mick Mulvaney",
"position": "To be Director of the Office of Management and Budget",
"organization": "Executive Office of the President",
"latest_action": "02/16/2017 - Confirmed by the Senate by Yea-Nay Vote. 51 - 49. Record Vote Number: 68.",
"date_received_from_president": "01/30/2017",
"committees": [
"Senate Budget",
"Senate Homeland Security and Governmental Affairs"
]
},
"actions": [
{
"date": "2017-02-16",
"action": {
"text": "Confirmed by the Senate by Yea-Nay Vote. 51 - 49. Record Vote Number: 68.",
"links": [
{
"href": "http://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vote=00068",
"text": "Record Vote Number: 68"
}
]
}
},
{
"date": "2017-02-16",
"action": {
"text": "Considered by Senate.",
"links": []
}
},
{
"date": "2017-02-15",
"action": {
"text": "By unanimous consent agreement, debate and vote 2/16/2017.",
"links": []
}
},
{
"date": "2017-02-15",
"action": {
"text": "Considered by Senate.",
"links": []
}
},
{
"date": "2017-02-15",
"action": {
"text": "Cloture invoked in Senate by Yea-Nay Vote. 52 - 48. Record Vote Number: 67.",
"links": [
{
"href": "http://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vote=00067",
"text": "Record Vote Number: 67"
}
]
}
},
{
"date": "2017-02-14",
"action": {
"text": "By unanimous consent agreement, debate and vote 02/15/2017.",
"links": []
}
},
{
"date": "2017-02-13",
"action": {
"text": "By unanimous consent agreement, mandatory quorum under Rule XXII waived.",
"links": []
}
},
{
"date": "2017-02-13",
"action": {
"text": "Cloture motion presented in Senate.",
"links": []
}
},
{
"date": "2017-02-13",
"action": {
"text": "Motion to proceed to executive session to consideration of nomination agreed to in Senate by Voice Vote.",
"links": []
}
},
{
"date": "2017-02-02",
"action": {
"text": "Placed on Senate Executive Calendar. Calendar No. 16. Subject to nominee's commitment to respond to requests to appear and testify before any duly constituted committee of the Senate.",
"links": []
}
},
{
"date": "2017-02-02",
"action": {
"text": "Reported by Senator Enzi, Committee on the Budget, without printed report.",
"links": []
}
},
{
"date": "2017-02-02",
"action": {
"text": "Reported by Senator Johnson, Committee on Homeland Security and Governmental Affairs, without recommendation, and without printed report.",
"links": []
}
},
{
"date": "2017-02-02",
"action": {
"text": "Committee on the Budget. Ordered to be reported favorably.",
"links": []
}
},
{
"date": "2017-02-02",
"action": {
"text": "Committee on Homeland Security and Governmental Affairs. Ordered to be reported without recommendation.",
"links": []
}
},
{
"date": "2017-01-30",
"action": {
"text": "Received in the Senate and referred jointly to the Committee on the Budget; Homeland Security and Governmental Affairs pursuant to S. Res. 445 of 10/09/2004.",
"links": []
}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment