avishekrk · June 22, 2016 21:24
diff --git a/PlotSummaryStats.py b/PlotSummaryStats.py
 #!/usr/bin/env python
 # coding: utf-8
 """
 Summary Stats for Water Work Orders
 Description
 -----------
 Python Script for producing figures and tables 
 for summary stats of Water Work Orders. 
 Figures are stored in **./figure** directory 
 and tables are stored in **./tables** directory. 
 Requires being able to connect to database. 
 Usage
 -----
 python SummaryStats_WaterWork_Orders.py 
 """

 import matplotlib
 matplotlib.use('Agg') #doesn't display plots 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import psycopg2
 import seaborn as sns



 def connect_to_syracuse_db():
    """
    Connects to syracuse database
    
    Output
    ------
    conn: object
       Database connection. 
    """
    sensitive = '/mnt/data/syracuse/credentials'
    line = open(sensitive,'r').readlines()
    creds = line[0].strip('\n').split(':')
    try:
        conn = psycopg2.connect(
            "dbname={0} user={1} host={2} password={3}".format(creds[3],creds[2],creds[0], creds[4]))
        return conn
    except 'ConnectionError':
        print "Cannot Connect to DB"
      



 def plot_job_category(data,numshow=10):
    """
    makes a barplot of the top job categories (default 10)
    the first and largest item in plotted in syracuse orange
    the rest are in syracuse grey
    
    Input 
    -----
    data: DataFrame
        DataFrame containing work orders and specifically job_category field
    numshow: int
        the number of work orders to display (default 10)
    
    Output
    ------
    figure: file (png)
        file is saved as figure_file_prefix+_hist_workorders.png
    table: file (csv)
        table is saved as table_file_prefix+_number_of_breaks_year.csv
    """
     
    field = 'job_category'
    syracuse_orange = '#D44500'
    syracuse_grey = '#3E3D3C'
    colors=[syracuse_orange] + [syracuse_grey]*(numshow-1)
    
    xcols=data.groupby(field).size().sort_values(ascending=False)[:numshow].index.values
    xticks=map(lambda x: x.split('/')[0],xcols)
    plt.figure(figsize=(32,18),dpi=500)
    sns.set_style("white")
    sns.set_context("poster", font_scale=2.50, rc={"lines.linewidth": 1.25, "lines.markersize":8})
    data.groupby(field).size().sort_values(ascending=False)[:numshow].plot(kind='bar', rot=60,color=colors)
    plt.xlabel('')
    plt.xticks(range(numshow),xticks,ha='right')
    plt.ylabel('Number of Work Orders')
    plt.tight_layout()
    sns.despine()
    plt.savefig(figure_file_prefix+'_hist_workorders.png', transparent=True)
    
    data.groupby(field).size().sort_values(ascending=False)[:numshow].to_csv(
        table_file_prefix+'_hist_workorders.csv')



 def plot_by_year(df,years=[2005,2015]):
    """
    Makes a plot of the number of main breaks/year
    
    Input
    -----
    df: DataFrame
       DataFrame containing only work orders for 'Main Break/Leak'
    years: ls
       range of years to make plot from default: 2005-2015
       
    Output
    ------
    figure: file (png)
        figure is saved as figure_file_prefix+_number_breaks_year.png
    table: file (csv)
        table is saved as table_file_prefix+number_of_breaks_year.csv
    """
    sns.set_style("whitegrid")
    plt.figure(figsize=(32,12),dpi=300)
    sns.set_context("poster", font_scale=2.75, rc={"lines.linewidth": 2.25, "lines.markersize":20})
    syracuse_orange = '#D44500'
    df.groupby('year').size().plot(marker='o',linestyle='--', color=syracuse_orange)
    plt.ylabel('Number of Main Breaks')
    plt.xlim(years)
    sns.despine()
    plt.savefig(figure_file_prefix+'_number_of_breaks_year.png', transparent=True)
    df.groupby('year').size().to_csv(table_file_prefix+'_number_of_breaks_year.csv')



 def number_of_break_month_avgyear(df):
    """
    Plots a barplot of the number of main breaks every month divided by the number of years.
    
    Input
    -----
    df: DataFrame
        DataFrame containing only work orders for 'Main Break/Leak'
    
    Output
    ------
    figure: file (png)
        figure is saved as figure_file_prefix+_number_break_a_month_avg_year.png
    table: file (csv)
        table is saved as table_file_prefix+number_of_break_a_month_avg_year.csv
    """
    
    numyears = len( df['year'].unique() ) 
    sns.set_style("whitegrid")
    plt.figure(figsize=(32,12),dpi=300)
    sns.set_context("poster", font_scale=2.75, rc={"lines.linewidth": 2.25, "lines.markersize":20})
    syracuse_orange = '#D44500'
    (df.groupby('month').size()/numyears).plot(marker='o',linestyle='--', color=syracuse_orange)
    plt.ylim(0,35)
    plt.xticks(range(1,13),
               ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
    plt.ylabel('Number of Breaks a Month/Year')
    sns.despine()
    plt.savefig(figure_file_prefix+'_number_break_a_month_avg_year.png', transparent=True)
    (df.groupby('month').size()/numyears).to_csv(table_file_prefix+'_number_break_a_month_avg_year.csv')



 def plot_main_break_timeseries(df,years=[2005,2015]):
    """
    Plot a timeseries of the number of main breaks between a range of years default 2005-2015
    
    Input
    -----
     df: DataFrame
        DataFrame containing only work orders for 'Main Break/Leak'
    
    Output
    ------
    figure: file (png)
        figure is saved as figure_file_prefix+_timeseries_mainbreaks_2005_2015.png
    table: file (csv)
        table is saved as table_file_prefix+_timeseries_mainbreaks_2005_2015.csv
    """
    
    sns.set_style("whitegrid")
    plt.figure(figsize=(42, 24),dpi=300)
    sns.set_context("poster", font_scale=4.00, rc={"lines.linewidth": 1.25,"lines.markersize":20})
    syracuse_orange = '#D44500'
    df.groupby(['year','month']).size().ix[years[0]:years[1]].plot(marker='o', color=syracuse_orange)
    plt.xlabel('')
    plt.ylabel('Number of Main Breaks')
    sns.despine()
    plt.savefig(figure_file_prefix+'_timeseries_mainbreaks_2005_2015.png', transparent=True)
    df.groupby(['year','month']).size().ix[years[0]:years[1]].to_csv(
        table_file_prefix+'_timeseries_mainbreaks_2005_2015.csv')



 conn = connect_to_syracuse_db()

 ls_dbs = ['work_orders_old', 'water_workorders'] 
 for dbname in ls_dbs:
    print dbname
    figdir='./figures/'
    tabledir='./tables/'
    table_file_prefix=tabledir+dbname
    figure_file_prefix=figdir+dbname
    
    query = "SELECT * from {0}".format(dbname)
    data=pd.read_sql_query(query, conn)
    
    data['timestamp']=pd.DatetimeIndex( data['date'] )
    data['year'] = pd.DatetimeIndex( data['date'] ).year
    data['month'] = pd.DatetimeIndex( data['date'] ).month

    plot_job_category(data)
    df_watermain_break = data.query('job_category=="Main Break/Leak"')
    plot_by_year(df_watermain_break)
    number_of_break_month_avgyear(df_watermain_break)
    plot_main_break_timeseries(df_watermain_break)


 #create table of water main breaks with Lat Long
 query = "SELECT * from {0}".format(ls_dbs[1])
 data_new_workorders=pd.read_sql_query(query, conn)
 cols = ['date','location', 'job_category', 'latitude', 'longitude']
 df_new_main_breaks = data_new_workorders[cols].query('job_category == "Main Break/Leak"')
 df_new_main_breaks.to_csv(table_file_prefix+'_new_watermain_breaks.csv',index=False)
	#!/usr/bin/env python
	# coding: utf-8
	"""
	Summary Stats for Water Work Orders
	Description
	-----------
	Python Script for producing figures and tables
	for summary stats of Water Work Orders.
	Figures are stored in ./figure directory
	and tables are stored in ./tables directory.
	Requires being able to connect to database.
	Usage
	-----
	python SummaryStats_WaterWork_Orders.py
	"""

	import matplotlib
	matplotlib.use('Agg') #doesn't display plots
	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import psycopg2
	import seaborn as sns



	def connect_to_syracuse_db():
	"""
	Connects to syracuse database

	Output
	------
	conn: object
	Database connection.
	"""
	sensitive = '/mnt/data/syracuse/credentials'
	line = open(sensitive,'r').readlines()
	creds = line[0].strip('\n').split(':')
	try:
	conn = psycopg2.connect(
	"dbname={0} user={1} host={2} password={3}".format(creds[3],creds[2],creds[0], creds[4]))
	return conn
	except 'ConnectionError':
	print "Cannot Connect to DB"




	def plot_job_category(data,numshow=10):
	"""
	makes a barplot of the top job categories (default 10)
	the first and largest item in plotted in syracuse orange
	the rest are in syracuse grey

	Input
	-----
	data: DataFrame
	DataFrame containing work orders and specifically job_category field
	numshow: int
	the number of work orders to display (default 10)

	Output
	------
	figure: file (png)
	file is saved as figure_file_prefix+_hist_workorders.png
	table: file (csv)
	table is saved as table_file_prefix+_number_of_breaks_year.csv
	"""

	field = 'job_category'
	syracuse_orange = '#D44500'
	syracuse_grey = '#3E3D3C'
	colors=[syracuse_orange] + [syracuse_grey]*(numshow-1)

	xcols=data.groupby(field).size().sort_values(ascending=False)[:numshow].index.values
	xticks=map(lambda x: x.split('/')[0],xcols)
	plt.figure(figsize=(32,18),dpi=500)
	sns.set_style("white")
	sns.set_context("poster", font_scale=2.50, rc={"lines.linewidth": 1.25, "lines.markersize":8})
	data.groupby(field).size().sort_values(ascending=False)[:numshow].plot(kind='bar', rot=60,color=colors)
	plt.xlabel('')
	plt.xticks(range(numshow),xticks,ha='right')
	plt.ylabel('Number of Work Orders')
	plt.tight_layout()
	sns.despine()
	plt.savefig(figure_file_prefix+'_hist_workorders.png', transparent=True)

	data.groupby(field).size().sort_values(ascending=False)[:numshow].to_csv(
	table_file_prefix+'_hist_workorders.csv')



	def plot_by_year(df,years=[2005,2015]):
	"""
	Makes a plot of the number of main breaks/year

	Input
	-----
	df: DataFrame
	DataFrame containing only work orders for 'Main Break/Leak'
	years: ls
	range of years to make plot from default: 2005-2015

	Output
	------
	figure: file (png)
	figure is saved as figure_file_prefix+_number_breaks_year.png
	table: file (csv)
	table is saved as table_file_prefix+number_of_breaks_year.csv
	"""
	sns.set_style("whitegrid")
	plt.figure(figsize=(32,12),dpi=300)
	sns.set_context("poster", font_scale=2.75, rc={"lines.linewidth": 2.25, "lines.markersize":20})
	syracuse_orange = '#D44500'
	df.groupby('year').size().plot(marker='o',linestyle='--', color=syracuse_orange)
	plt.ylabel('Number of Main Breaks')
	plt.xlim(years)
	sns.despine()
	plt.savefig(figure_file_prefix+'_number_of_breaks_year.png', transparent=True)
	df.groupby('year').size().to_csv(table_file_prefix+'_number_of_breaks_year.csv')



	def number_of_break_month_avgyear(df):
	"""
	Plots a barplot of the number of main breaks every month divided by the number of years.

	Input
	-----
	df: DataFrame
	DataFrame containing only work orders for 'Main Break/Leak'

	Output
	------
	figure: file (png)
	figure is saved as figure_file_prefix+_number_break_a_month_avg_year.png
	table: file (csv)
	table is saved as table_file_prefix+number_of_break_a_month_avg_year.csv
	"""

	numyears = len( df['year'].unique() )
	sns.set_style("whitegrid")
	plt.figure(figsize=(32,12),dpi=300)
	sns.set_context("poster", font_scale=2.75, rc={"lines.linewidth": 2.25, "lines.markersize":20})
	syracuse_orange = '#D44500'
	(df.groupby('month').size()/numyears).plot(marker='o',linestyle='--', color=syracuse_orange)
	plt.ylim(0,35)
	plt.xticks(range(1,13),
	['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'])
	plt.ylabel('Number of Breaks a Month/Year')
	sns.despine()
	plt.savefig(figure_file_prefix+'_number_break_a_month_avg_year.png', transparent=True)
	(df.groupby('month').size()/numyears).to_csv(table_file_prefix+'_number_break_a_month_avg_year.csv')



	def plot_main_break_timeseries(df,years=[2005,2015]):
	"""
	Plot a timeseries of the number of main breaks between a range of years default 2005-2015

	Input
	-----
	df: DataFrame
	DataFrame containing only work orders for 'Main Break/Leak'

	Output
	------
	figure: file (png)
	figure is saved as figure_file_prefix+_timeseries_mainbreaks_2005_2015.png
	table: file (csv)
	table is saved as table_file_prefix+_timeseries_mainbreaks_2005_2015.csv
	"""

	sns.set_style("whitegrid")
	plt.figure(figsize=(42, 24),dpi=300)
	sns.set_context("poster", font_scale=4.00, rc={"lines.linewidth": 1.25,"lines.markersize":20})
	syracuse_orange = '#D44500'
	df.groupby(['year','month']).size().ix[years[0]:years[1]].plot(marker='o', color=syracuse_orange)
	plt.xlabel('')
	plt.ylabel('Number of Main Breaks')
	sns.despine()
	plt.savefig(figure_file_prefix+'_timeseries_mainbreaks_2005_2015.png', transparent=True)
	df.groupby(['year','month']).size().ix[years[0]:years[1]].to_csv(
	table_file_prefix+'_timeseries_mainbreaks_2005_2015.csv')



	conn = connect_to_syracuse_db()

	ls_dbs = ['work_orders_old', 'water_workorders']
	for dbname in ls_dbs:
	print dbname
	figdir='./figures/'
	tabledir='./tables/'
	table_file_prefix=tabledir+dbname
	figure_file_prefix=figdir+dbname

	query = "SELECT * from {0}".format(dbname)
	data=pd.read_sql_query(query, conn)

	data['timestamp']=pd.DatetimeIndex( data['date'] )
	data['year'] = pd.DatetimeIndex( data['date'] ).year
	data['month'] = pd.DatetimeIndex( data['date'] ).month

	plot_job_category(data)
	df_watermain_break = data.query('job_category=="Main Break/Leak"')
	plot_by_year(df_watermain_break)
	number_of_break_month_avgyear(df_watermain_break)
	plot_main_break_timeseries(df_watermain_break)


	#create table of water main breaks with Lat Long
	query = "SELECT * from {0}".format(ls_dbs[1])
	data_new_workorders=pd.read_sql_query(query, conn)
	cols = ['date','location', 'job_category', 'latitude', 'longitude']
	df_new_main_breaks = data_new_workorders[cols].query('job_category == "Main Break/Leak"')
	df_new_main_breaks.to_csv(table_file_prefix+'_new_watermain_breaks.csv',index=False)