Skip to content

Instantly share code, notes, and snippets.

@avishekrk
Created June 22, 2016 21:24
Show Gist options
  • Save avishekrk/1484ca9892a0dd4eef02a50b48eb20b0 to your computer and use it in GitHub Desktop.
Save avishekrk/1484ca9892a0dd4eef02a50b48eb20b0 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
"""
Summary Stats for Water Work Orders
Description
-----------
Python Script for producing figures and tables
for summary stats of Water Work Orders.
Figures are stored in **./figure** directory
and tables are stored in **./tables** directory.
Requires being able to connect to database.
Usage
-----
python SummaryStats_WaterWork_Orders.py
"""
import matplotlib
matplotlib.use('Agg') #doesn't display plots
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
def connect_to_syracuse_db():
"""
Connects to syracuse database
Output
------
conn: object
Database connection.
"""
sensitive = '/mnt/data/syracuse/credentials'
line = open(sensitive,'r').readlines()
creds = line[0].strip('\n').split(':')
try:
conn = psycopg2.connect(
"dbname={0} user={1} host={2} password={3}".format(creds[3],creds[2],creds[0], creds[4]))
return conn
except 'ConnectionError':
print "Cannot Connect to DB"
def plot_job_category(data,numshow=10):
"""
makes a barplot of the top job categories (default 10)
the first and largest item in plotted in syracuse orange
the rest are in syracuse grey
Input
-----
data: DataFrame
DataFrame containing work orders and specifically job_category field
numshow: int
the number of work orders to display (default 10)
Output
------
figure: file (png)
file is saved as figure_file_prefix+_hist_workorders.png
table: file (csv)
table is saved as table_file_prefix+_number_of_breaks_year.csv
"""
field = 'job_category'
syracuse_orange = '#D44500'
syracuse_grey = '#3E3D3C'
colors=[syracuse_orange] + [syracuse_grey]*(numshow-1)
xcols=data.groupby(field).size().sort_values(ascending=False)[:numshow].index.values
xticks=map(lambda x: x.split('/')[0],xcols)
plt.figure(figsize=(32,18),dpi=500)
sns.set_style("white")
sns.set_context("poster", font_scale=2.50, rc={"lines.linewidth": 1.25, "lines.markersize":8})
data.groupby(field).size().sort_values(ascending=False)[:numshow].plot(kind='bar', rot=60,color=colors)
plt.xlabel('')
plt.xticks(range(numshow),xticks,ha='right')
plt.ylabel('Number of Work Orders')
plt.tight_layout()
sns.despine()
plt.savefig(figure_file_prefix+'_hist_workorders.png', transparent=True)
data.groupby(field).size().sort_values(ascending=False)[:numshow].to_csv(
table_file_prefix+'_hist_workorders.csv')
def plot_by_year(df,years=[2005,2015]):
"""
Makes a plot of the number of main breaks/year
Input
-----
df: DataFrame
DataFrame containing only work orders for 'Main Break/Leak'
years: ls
range of years to make plot from default: 2005-2015
Output
------
figure: file (png)
figure is saved as figure_file_prefix+_number_breaks_year.png
table: file (csv)
table is saved as table_file_prefix+number_of_breaks_year.csv
"""
sns.set_style("whitegrid")
plt.figure(figsize=(32,12),dpi=300)
sns.set_context("poster", font_scale=2.75, rc={"lines.linewidth": 2.25, "lines.markersize":20})
syracuse_orange = '#D44500'
df.groupby('year').size().plot(marker='o',linestyle='--', color=syracuse_orange)
plt.ylabel('Number of Main Breaks')
plt.xlim(years)
sns.despine()
plt.savefig(figure_file_prefix+'_number_of_breaks_year.png', transparent=True)
df.groupby('year').size().to_csv(table_file_prefix+'_number_of_breaks_year.csv')
def number_of_break_month_avgyear(df):
"""
Plots a barplot of the number of main breaks every month divided by the number of years.
Input
-----
df: DataFrame
DataFrame containing only work orders for 'Main Break/Leak'
Output
------
figure: file (png)
figure is saved as figure_file_prefix+_number_break_a_month_avg_year.png
table: file (csv)
table is saved as table_file_prefix+number_of_break_a_month_avg_year.csv
"""
numyears = len( df['year'].unique() )
sns.set_style("whitegrid")
plt.figure(figsize=(32,12),dpi=300)
sns.set_context("poster", font_scale=2.75, rc={"lines.linewidth": 2.25, "lines.markersize":20})
syracuse_orange = '#D44500'
(df.groupby('month').size()/numyears).plot(marker='o',linestyle='--', color=syracuse_orange)
plt.ylim(0,35)
plt.xticks(range(1,13),
['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
plt.ylabel('Number of Breaks a Month/Year')
sns.despine()
plt.savefig(figure_file_prefix+'_number_break_a_month_avg_year.png', transparent=True)
(df.groupby('month').size()/numyears).to_csv(table_file_prefix+'_number_break_a_month_avg_year.csv')
def plot_main_break_timeseries(df,years=[2005,2015]):
"""
Plot a timeseries of the number of main breaks between a range of years default 2005-2015
Input
-----
df: DataFrame
DataFrame containing only work orders for 'Main Break/Leak'
Output
------
figure: file (png)
figure is saved as figure_file_prefix+_timeseries_mainbreaks_2005_2015.png
table: file (csv)
table is saved as table_file_prefix+_timeseries_mainbreaks_2005_2015.csv
"""
sns.set_style("whitegrid")
plt.figure(figsize=(42, 24),dpi=300)
sns.set_context("poster", font_scale=4.00, rc={"lines.linewidth": 1.25,"lines.markersize":20})
syracuse_orange = '#D44500'
df.groupby(['year','month']).size().ix[years[0]:years[1]].plot(marker='o', color=syracuse_orange)
plt.xlabel('')
plt.ylabel('Number of Main Breaks')
sns.despine()
plt.savefig(figure_file_prefix+'_timeseries_mainbreaks_2005_2015.png', transparent=True)
df.groupby(['year','month']).size().ix[years[0]:years[1]].to_csv(
table_file_prefix+'_timeseries_mainbreaks_2005_2015.csv')
conn = connect_to_syracuse_db()
ls_dbs = ['work_orders_old', 'water_workorders']
for dbname in ls_dbs:
print dbname
figdir='./figures/'
tabledir='./tables/'
table_file_prefix=tabledir+dbname
figure_file_prefix=figdir+dbname
query = "SELECT * from {0}".format(dbname)
data=pd.read_sql_query(query, conn)
data['timestamp']=pd.DatetimeIndex( data['date'] )
data['year'] = pd.DatetimeIndex( data['date'] ).year
data['month'] = pd.DatetimeIndex( data['date'] ).month
plot_job_category(data)
df_watermain_break = data.query('job_category=="Main Break/Leak"')
plot_by_year(df_watermain_break)
number_of_break_month_avgyear(df_watermain_break)
plot_main_break_timeseries(df_watermain_break)
#create table of water main breaks with Lat Long
query = "SELECT * from {0}".format(ls_dbs[1])
data_new_workorders=pd.read_sql_query(query, conn)
cols = ['date','location', 'job_category', 'latitude', 'longitude']
df_new_main_breaks = data_new_workorders[cols].query('job_category == "Main Break/Leak"')
df_new_main_breaks.to_csv(table_file_prefix+'_new_watermain_breaks.csv',index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment