Last active
October 10, 2017 05:27
-
-
Save AlJohri/80b5c4a55ddfc04eb2a24a413a0b79cd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import re | |
| import datetime | |
| import requests | |
| import lxml.html | |
def strip_within(s: str) -> str:
    """Collapse every run of consecutive spaces in *s* into one space.

    Only the space character is matched: tabs and newlines are left
    untouched, and leading/trailing runs are reduced to a single space
    rather than removed (callers strip the ends themselves).
    """
    return re.sub(" +", " ", s)
def get_nomination(s, congress, number):
    """Fetch one congress.gov nomination page and parse it into a dict.

    Args:
        s: Session-like object; ``s.get(url)`` must return a response that
            offers ``raise_for_status()`` and ``content`` (the script entry
            point passes a ``requests.Session``).
        congress: Congress number placed in the URL path (e.g. ``115``).
        number: Nomination (PN) number placed in the URL path (e.g. ``54``).

    Returns:
        A dict with the keys ``number`` (int), ``nominee``, ``nominator`` and
        ``congress`` (int) taken from the page heading, ``overview`` (a dict
        keyed by the lower-cased, underscore-joined overview headings, with
        ``committees`` as a list) and ``actions`` (a list of
        ``{"date": "YYYY-MM-DD", "action": {"text": ..., "links": [...]}}``).

    Raises:
        ValueError: If the heading does not match the expected
            ``PN<number> — <nominee> — <nominator>`` / ``<N>th Congress``
            layout, or an action date is not in ``MM/DD/YYYY`` form.
        Exception: Whatever ``raise_for_status()`` raises for an HTTP error
            status (``requests.HTTPError`` for a requests session).
    """
    url = f"https://www.congress.gov/nomination/{congress}/{number}"
    response = s.get(url)
    # Surface HTTP failures directly; otherwise an error page gets parsed and
    # the first selector lookup dies with an uninformative IndexError.
    response.raise_for_status()
    doc = lxml.html.fromstring(response.content)

    def parse_title(doc):
        """Extract number/nominee/nominator and congress from the <h1>."""
        heading = doc.cssselect("#content h1")[0]
        # .text is only the text before the first child element (the <span>).
        title = strip_within((heading.text or "").strip())
        match1 = re.match(r"^PN(?P<number>\d+) — (?P<nominee>.*) — (?P<nominator>.*)$", title)
        if not match1:
            raise ValueError("unable to parse nominee from title")
        group1 = match1.groupdict()
        group1['number'] = int(group1['number'])

        span_text = doc.cssselect("#content h1 span")[0].text or ""
        match2 = re.match(r"(?P<congress>\d+)\w{2} Congress", span_text)
        if not match2:
            raise ValueError("unable to parse congress from title")
        group2 = match2.groupdict()
        group2['congress'] = int(group2['congress'])

        return {**group1, **group2}

    def parse_overview(doc):
        """Map each overview <h2> heading to the content of its next sibling."""

        def convert_header(header):
            # The committee section holds a list, so its key is pluralised.
            if header == 'Committee':
                return 'committees'
            return header.lower().replace(' ', '_')

        def parse_element(header, element):
            sibling = element.getnext()
            if header == 'Committee':
                # An <li> whose text lives in a child element has .text None.
                return [(x.text or "").strip() for x in sibling.cssselect('li')]
            return strip_within(sibling.text_content().strip())

        headers = {x.text: x for x in doc.cssselect('.overview h2')}
        return {
            convert_header(header): parse_element(header, element)
            for header, element in headers.items()
        }

    def parse_actions(doc):
        """Return one entry per row of the actions table, in page order."""
        actions = []
        for row in doc.cssselect("table.item_table > tbody > tr"):
            # Strip first: strptime rejects trailing whitespace in the input.
            date_text = (row.cssselect('td.date')[0].text or "").strip()
            date = datetime.datetime.strptime(date_text, '%m/%d/%Y')
            actions.append({
                'date': date.strftime('%Y-%m-%d'),
                'action': {
                    'text': row.cssselect('td.actions')[0].text_content().strip(),
                    # text_content() also covers links whose label is wrapped
                    # in a child element, where .text would be None.
                    'links': [
                        {"href": a.get('href'), "text": a.text_content().strip()}
                        for a in row.cssselect('td.actions a')
                    ],
                },
            })
        return actions

    return {
        **parse_title(doc),
        "overview": parse_overview(doc),
        "actions": parse_actions(doc),
    }
def get_nominations(congress, session=None):
    """Request the congress.gov search listing of nominations for a congress.

    Args:
        congress: Congress number to filter on; it is sent as a string.
        session: Optional session-like object with a
            ``get(url, params=...)`` method. When omitted, the module-level
            ``s`` created by the script entry point is used, so a call made
            without it (and without that global) raises ``NameError``.

    Returns:
        Whatever ``session.get`` returns (the unparsed search response).
    """
    import json  # local import: the file only imports json in its __main__ block

    if session is None:
        session = s  # session created when the file is run as a script

    query = {"source": "nominations", "congress": str(congress)}
    # Send the query as compact JSON and let the session URL-encode it;
    # formatting the dict into the URL would embed Python's repr
    # (single quotes, spaces) unescaped.
    return session.get(
        "https://www.congress.gov/search",
        params={"q": json.dumps(query, separators=(",", ":"))},
    )
if __name__ == "__main__":
    import json

    # Identify as a desktop Chrome browser. Assigning (rather than updating)
    # makes this the session's only default header.
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
    s = requests.Session()
    s.headers = {"User-Agent": user_agent}

    # Search listing, currently not exercised:
    # response = get_nominations(115)

    # Fetch nomination PN54 of the 115th Congress and dump it as JSON.
    nomination = get_nomination(s, 115, 54)
    print(json.dumps(nomination, indent=4))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "number": 54, | |
| "nominee": "Mick Mulvaney", | |
| "nominator": "Executive Office of the President", | |
| "congress": 115, | |
| "overview": { | |
| "description": "Mick Mulvaney, of South Carolina, to be Director of the Office of Management and Budget, vice Shaun L. S. Donovan, resigned.", | |
| "nominees": "One nomination, beginning with Mick Mulvaney and ending with Mick Mulvaney", | |
| "position": "To be Director of the Office of Management and Budget", | |
| "organization": "Executive Office of the President", | |
| "latest_action": "02/16/2017 - Confirmed by the Senate by Yea-Nay Vote. 51 - 49. Record Vote Number: 68.", | |
| "date_received_from_president": "01/30/2017", | |
| "committees": [ | |
| "Senate Budget", | |
| "Senate Homeland Security and Governmental Affairs" | |
| ] | |
| }, | |
| "actions": [ | |
| { | |
| "date": "2017-02-16", | |
| "action": { | |
| "text": "Confirmed by the Senate by Yea-Nay Vote. 51 - 49. Record Vote Number: 68.", | |
| "links": [ | |
| { | |
| "href": "http://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vote=00068", | |
| "text": "Record Vote Number: 68" | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-16", | |
| "action": { | |
| "text": "Considered by Senate.", | |
| "links": [] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-15", | |
| "action": { | |
| "text": "By unanimous consent agreement, debate and vote 2/16/2017.", | |
| "links": [] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-15", | |
| "action": { | |
| "text": "Considered by Senate.", | |
| "links": [] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-15", | |
| "action": { | |
| "text": "Cloture invoked in Senate by Yea-Nay Vote. 52 - 48. Record Vote Number: 67.", | |
| "links": [ | |
| { | |
| "href": "http://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vote=00067", | |
| "text": "Record Vote Number: 67" | |
| } | |
| ] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-14", | |
| "action": { | |
| "text": "By unanimous consent agreement, debate and vote 02/15/2017.", | |
| "links": [] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-13", | |
| "action": { | |
| "text": "By unanimous consent agreement, mandatory quorum under Rule XXII waived.", | |
| "links": [] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-13", | |
| "action": { | |
| "text": "Cloture motion presented in Senate.", | |
| "links": [] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-13", | |
| "action": { | |
| "text": "Motion to proceed to executive session to consideration of nomination agreed to in Senate by Voice Vote.", | |
| "links": [] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-02", | |
| "action": { | |
| "text": "Placed on Senate Executive Calendar. Calendar No. 16. Subject to nominee's commitment to respond to requests to appear and testify before any duly constituted committee of the Senate.", | |
| "links": [] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-02", | |
| "action": { | |
| "text": "Reported by Senator Enzi, Committee on the Budget, without printed report.", | |
| "links": [] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-02", | |
| "action": { | |
| "text": "Reported by Senator Johnson, Committee on Homeland Security and Governmental Affairs, without recommendation, and without printed report.", | |
| "links": [] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-02", | |
| "action": { | |
| "text": "Committee on the Budget. Ordered to be reported favorably.", | |
| "links": [] | |
| } | |
| }, | |
| { | |
| "date": "2017-02-02", | |
| "action": { | |
| "text": "Committee on Homeland Security and Governmental Affairs. Ordered to be reported without recommendation.", | |
| "links": [] | |
| } | |
| }, | |
| { | |
| "date": "2017-01-30", | |
| "action": { | |
| "text": "Received in the Senate and referred jointly to the Committee on the Budget; Homeland Security and Governmental Affairs pursuant to S. Res. 445 of 10/09/2004.", | |
| "links": [] | |
| } | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment