MN 2014 Budget Refactor - http://pbpython.com/web-scraping-mn-budget.html
# Parse 2014 MN Capital budget - https://www.revisor.mn.gov/laws/?year=2014&type=0&doctype=Chapter&id=294
# Store the summary in a DataFrame for eventual manipulation
from __future__ import print_function
import os.path
from collections import defaultdict

import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt

filename = "MNBudget-2014.html"
url = "https://www.revisor.mn.gov/laws/?year=2014&type=0&doctype=Chapter&id=294"


def convert_num(val):
    """
    Convert the string number value to a float
     - Remove all extra whitespace
     - Remove commas
     - If wrapped in (), it is a negative number
    """
    val = val.strip().replace(",", "").replace("(", "-").replace(")", "")
    return float(val)
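# For illustration, given the conversion rules above, convert_num behaves like
# this (input strings are made-up examples, not values from the bill):
#   convert_num("  1,234 ")  ->  1234.0
#   convert_num("(5,678)")   -> -5678.0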
# As we work through the process, it is easier to download the page once and
# work with the saved copy instead of hitting the server on every run.
# Just delete the cache file to force a new download.
def get_data(url, cachefile):
    if os.path.isfile(cachefile):
        print("Loading the data via the file.")
        with open(cachefile, 'r') as f:
            c = f.read()
    else:
        print("Fetching the data via the URL.")
        result = requests.get(url)
        c = result.text
        with open(cachefile, 'w') as f:
            f.write(c)
    soup = BeautifulSoup(c, "html.parser")
    return soup
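# Illustrative use (interactive session; url and filename are the module-level
# values defined above):
#   soup = get_data(url, filename)  # first call fetches the page and writes the cache
#   soup = get_data(url, filename)  # later calls read MNBudget-2014.html instead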
def process_data(soup):
    # Init the variables
    # Use a defaultdict with an empty list because it eases the DataFrame creation
    expense_lines = defaultdict(list)
    funding_lines = defaultdict(list)
    funding = False
    # After looking at the data, we can see that the summary has a div id we can use
    summary = soup.find("div", {"class": "bill_section", "id": "laws.1.1.0"})
    # Get all the tables in the summary
    tables = summary.find_all('table')
    # The first table is just header info and is not useful
    # The second table contains everything we need (the list is 0-indexed)
    data_table = tables[1]
    # Go through each row of the table and pull out our data
    for row in data_table.find_all("tr"):
        cells = row.find_all("td")
        # Ignore rows that don't have 3 cells of data; they are just spacing
        if len(cells) == 3:
            line = (cells[0].text.strip(), convert_num(cells[2].text))
            # Once we get to the TOTAL line, the funding lines start
            if line[0] == "TOTAL":
                funding = True
                # We don't capture the total because we can calculate it ourselves
                continue
            if funding:
                funding_lines[line[0]].append(line[1])
            else:
                expense_lines[line[0]].append(line[1])
    return funding_lines, expense_lines
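# The returned dicts map each line item to a one-element list of amounts. Their
# shape looks roughly like this (names and numbers are placeholders, not figures
# from the bill):
#   expense_lines -> {"AGENCY A": [1000000.0], "AGENCY B": [250000.0], ...}
#   funding_lines -> {"FUND X": [1250000.0], ...}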
def graph_data_to_file(data, filename, title, plot_kwargs=None):
    # Avoid a mutable default argument
    if plot_kwargs is None:
        plot_kwargs = {}
    # Create the DataFrame using from_dict
    data_df = pd.DataFrame.from_dict(data, orient='index')
    # Label our column
    data_df.rename(columns={0: 'Amount'}, inplace=True)
    data_df = data_df.sort_values('Amount')
    # Set some nicer defaults for plots (replaces the old
    # pd.options.display.mpl_style = 'default' setting, which pandas has removed)
    plt.style.use('ggplot')
    data_df.plot(kind='barh', title=title, **plot_kwargs)
    plt.savefig(filename)
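# Sketch of the intermediate DataFrame for the placeholder expense dict above:
# from_dict(orient='index') turns each key into a row label and the one-item
# list into column 0, which is then renamed to 'Amount' and sorted ascending:
#
#                Amount
#   AGENCY B    250000.0
#   AGENCY A   1000000.0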
def main():
    data = get_data(url, filename)
    funding, expenses = process_data(data)
    graph_data_to_file(funding, "MN-2014-Funding.png", "2014 MN Capital Budget Funding")
    graph_data_to_file(expenses, "MN-2014-Expense.png", "2014 MN Capital Budget Spending",
                       {"figsize": [7, 13]})


if __name__ == "__main__":
    main()
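# Typical run (the script filename is whatever you saved this gist as):
#   $ python mn-budget-2014.py
#   Fetching the data via the URL.
# On later runs the cached MNBudget-2014.html is read instead ("Loading the
# data via the file.") and the charts are written to MN-2014-Funding.png and
# MN-2014-Expense.png. Delete the cache file to force a fresh download.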