Created
June 22, 2016 21:24
-
-
Save avishekrk/1484ca9892a0dd4eef02a50b48eb20b0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
""" | |
Summary Stats for Water Work Orders | |
Description | |
----------- | |
Python Script for producing figures and tables | |
for summary stats of Water Work Orders. | |
Figures are stored in **./figure** directory | |
and tables are stored in **./tables** directory. | |
Requires being able to connect to database. | |
Usage | |
----- | |
python SummaryStats_WaterWork_Orders.py | |
""" | |
import matplotlib | |
matplotlib.use('Agg') #doesn't display plots | |
import matplotlib.pyplot as plt | |
import numpy as np | |
import pandas as pd | |
import psycopg2 | |
import seaborn as sns | |
def connect_to_syracuse_db(): | |
""" | |
Connects to syracuse database | |
Output | |
------ | |
conn: object | |
Database connection. | |
""" | |
sensitive = '/mnt/data/syracuse/credentials' | |
line = open(sensitive,'r').readlines() | |
creds = line[0].strip('\n').split(':') | |
try: | |
conn = psycopg2.connect( | |
"dbname={0} user={1} host={2} password={3}".format(creds[3],creds[2],creds[0], creds[4])) | |
return conn | |
except 'ConnectionError': | |
print "Cannot Connect to DB" | |
def plot_job_category(data,numshow=10): | |
""" | |
makes a barplot of the top job categories (default 10) | |
the first and largest item in plotted in syracuse orange | |
the rest are in syracuse grey | |
Input | |
----- | |
data: DataFrame | |
DataFrame containing work orders and specifically job_category field | |
numshow: int | |
the number of work orders to display (default 10) | |
Output | |
------ | |
figure: file (png) | |
file is saved as figure_file_prefix+_hist_workorders.png | |
table: file (csv) | |
table is saved as table_file_prefix+_number_of_breaks_year.csv | |
""" | |
field = 'job_category' | |
syracuse_orange = '#D44500' | |
syracuse_grey = '#3E3D3C' | |
colors=[syracuse_orange] + [syracuse_grey]*(numshow-1) | |
xcols=data.groupby(field).size().sort_values(ascending=False)[:numshow].index.values | |
xticks=map(lambda x: x.split('/')[0],xcols) | |
plt.figure(figsize=(32,18),dpi=500) | |
sns.set_style("white") | |
sns.set_context("poster", font_scale=2.50, rc={"lines.linewidth": 1.25, "lines.markersize":8}) | |
data.groupby(field).size().sort_values(ascending=False)[:numshow].plot(kind='bar', rot=60,color=colors) | |
plt.xlabel('') | |
plt.xticks(range(numshow),xticks,ha='right') | |
plt.ylabel('Number of Work Orders') | |
plt.tight_layout() | |
sns.despine() | |
plt.savefig(figure_file_prefix+'_hist_workorders.png', transparent=True) | |
data.groupby(field).size().sort_values(ascending=False)[:numshow].to_csv( | |
table_file_prefix+'_hist_workorders.csv') | |
def plot_by_year(df,years=[2005,2015]): | |
""" | |
Makes a plot of the number of main breaks/year | |
Input | |
----- | |
df: DataFrame | |
DataFrame containing only work orders for 'Main Break/Leak' | |
years: ls | |
range of years to make plot from default: 2005-2015 | |
Output | |
------ | |
figure: file (png) | |
figure is saved as figure_file_prefix+_number_breaks_year.png | |
table: file (csv) | |
table is saved as table_file_prefix+number_of_breaks_year.csv | |
""" | |
sns.set_style("whitegrid") | |
plt.figure(figsize=(32,12),dpi=300) | |
sns.set_context("poster", font_scale=2.75, rc={"lines.linewidth": 2.25, "lines.markersize":20}) | |
syracuse_orange = '#D44500' | |
df.groupby('year').size().plot(marker='o',linestyle='--', color=syracuse_orange) | |
plt.ylabel('Number of Main Breaks') | |
plt.xlim(years) | |
sns.despine() | |
plt.savefig(figure_file_prefix+'_number_of_breaks_year.png', transparent=True) | |
df.groupby('year').size().to_csv(table_file_prefix+'_number_of_breaks_year.csv') | |
def number_of_break_month_avgyear(df): | |
""" | |
Plots a barplot of the number of main breaks every month divided by the number of years. | |
Input | |
----- | |
df: DataFrame | |
DataFrame containing only work orders for 'Main Break/Leak' | |
Output | |
------ | |
figure: file (png) | |
figure is saved as figure_file_prefix+_number_break_a_month_avg_year.png | |
table: file (csv) | |
table is saved as table_file_prefix+number_of_break_a_month_avg_year.csv | |
""" | |
numyears = len( df['year'].unique() ) | |
sns.set_style("whitegrid") | |
plt.figure(figsize=(32,12),dpi=300) | |
sns.set_context("poster", font_scale=2.75, rc={"lines.linewidth": 2.25, "lines.markersize":20}) | |
syracuse_orange = '#D44500' | |
(df.groupby('month').size()/numyears).plot(marker='o',linestyle='--', color=syracuse_orange) | |
plt.ylim(0,35) | |
plt.xticks(range(1,13), | |
['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']) | |
plt.ylabel('Number of Breaks a Month/Year') | |
sns.despine() | |
plt.savefig(figure_file_prefix+'_number_break_a_month_avg_year.png', transparent=True) | |
(df.groupby('month').size()/numyears).to_csv(table_file_prefix+'_number_break_a_month_avg_year.csv') | |
def plot_main_break_timeseries(df,years=[2005,2015]): | |
""" | |
Plot a timeseries of the number of main breaks between a range of years default 2005-2015 | |
Input | |
----- | |
df: DataFrame | |
DataFrame containing only work orders for 'Main Break/Leak' | |
Output | |
------ | |
figure: file (png) | |
figure is saved as figure_file_prefix+_timeseries_mainbreaks_2005_2015.png | |
table: file (csv) | |
table is saved as table_file_prefix+_timeseries_mainbreaks_2005_2015.csv | |
""" | |
sns.set_style("whitegrid") | |
plt.figure(figsize=(42, 24),dpi=300) | |
sns.set_context("poster", font_scale=4.00, rc={"lines.linewidth": 1.25,"lines.markersize":20}) | |
syracuse_orange = '#D44500' | |
df.groupby(['year','month']).size().ix[years[0]:years[1]].plot(marker='o', color=syracuse_orange) | |
plt.xlabel('') | |
plt.ylabel('Number of Main Breaks') | |
sns.despine() | |
plt.savefig(figure_file_prefix+'_timeseries_mainbreaks_2005_2015.png', transparent=True) | |
df.groupby(['year','month']).size().ix[years[0]:years[1]].to_csv( | |
table_file_prefix+'_timeseries_mainbreaks_2005_2015.csv') | |
conn = connect_to_syracuse_db() | |
ls_dbs = ['work_orders_old', 'water_workorders'] | |
for dbname in ls_dbs: | |
print dbname | |
figdir='./figures/' | |
tabledir='./tables/' | |
table_file_prefix=tabledir+dbname | |
figure_file_prefix=figdir+dbname | |
query = "SELECT * from {0}".format(dbname) | |
data=pd.read_sql_query(query, conn) | |
data['timestamp']=pd.DatetimeIndex( data['date'] ) | |
data['year'] = pd.DatetimeIndex( data['date'] ).year | |
data['month'] = pd.DatetimeIndex( data['date'] ).month | |
plot_job_category(data) | |
df_watermain_break = data.query('job_category=="Main Break/Leak"') | |
plot_by_year(df_watermain_break) | |
number_of_break_month_avgyear(df_watermain_break) | |
plot_main_break_timeseries(df_watermain_break) | |
#create table of water main breaks with Lat Long | |
query = "SELECT * from {0}".format(ls_dbs[1]) | |
data_new_workorders=pd.read_sql_query(query, conn) | |
cols = ['date','location', 'job_category', 'latitude', 'longitude'] | |
df_new_main_breaks = data_new_workorders[cols].query('job_category == "Main Break/Leak"') | |
df_new_main_breaks.to_csv(table_file_prefix+'_new_watermain_breaks.csv',index=False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment