Skip to content

Instantly share code, notes, and snippets.

View kperry2215's full-sized avatar

Kirsten Perry kperry2215

View GitHub Profile
def grid_search_rf(parameter_grid, train_features, train_labels):
    """
    Perform Grid Search on the random forest classifier model, in order to
    optimize model parameters.

    Arguments:
        parameter_grid: grid parameters to test against to determine optimal
            parameters
        train_features: Numpy array, containing training set features
        train_labels: Numpy array, containing training set labels

    NOTE(review): only the classifier construction is visible in this
    excerpt; the grid search itself is presumably carried out in the lines
    that follow -- confirm against the full source.
    """
    # Create a random forest classifier model. No arguments are passed, so
    # the estimator is built with its default settings.
    rf = RandomForestClassifier()
# Partition the features and labels into training and testing sets.
# test_size=0.25 and random_state=5 are passed through unchanged; the fixed
# random_state is what keeps the partition repeatable from run to run.
(
    train_features,
    test_features,
    train_labels,
    test_labels,
) = train_test_split(features, labels, test_size=0.25, random_state=5)
# Subset the dataframe to only include the label and features that we're
# going to use in the model. The explicit .copy() makes the subset an
# independent frame, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
peak_demand_hour_model = peak_demand_hour_df[
    ['Peak_Demand_Hour', 'Day_Of_Week', 'Week', 'Month']
].copy()
# Convert the Week and Peak_Demand_Hour variables into categorical string
# variables (from numeric). Plain column assignment replaces each column
# outright instead of writing strings into the existing column through .loc.
peak_demand_hour_model['Week'] = peak_demand_hour_model['Week'].apply(str)
peak_demand_hour_model['Peak_Demand_Hour'] = (
    'Hour ' + peak_demand_hour_model['Peak_Demand_Hour'].apply(str)
)
def generate_histogram_of_aggregated_counts(df,
                                            peak_demand_hour_column,
                                            group_by_column):
    """
    Generate a histogram of peak demand hour counts, grouped by a column.

    Arguments:
        df: Pandas dataframe
        peak_demand_hour_column: String. Name of the column for peak demand
            hour
        group_by_column: String. Name of column to group by

    NOTE(review): the function body is not visible in this excerpt; this
    docstring only restates the documented contract -- confirm against the
    full source.
    """
# Tally how many times each value of Peak_Demand_Hour occurs, then draw the
# tallies as a bar chart with labelled axes
hour_counts = peak_demand_hour_df['Peak_Demand_Hour'].value_counts()
ax = hour_counts.plot(
    kind='bar',
    title='Peak Demand Hour by Number of Occurrences',
)
ax.set_xlabel("Demand Hour (0-23 hour)")
ax.set_ylabel("Number of Occurrences")
# Day of the month of each reading, taken from its Date_Time value
electricity_demand_df['Day_Of_Month'] = (
    electricity_demand_df['Date_Time'].dt.day
)
# Month of each reading, translated from the month number to the matching
# calendar.month_abbr entry
electricity_demand_df['Month'] = (
    electricity_demand_df['Date_Time']
    .dt.month
    .apply(lambda month_number: calendar.month_abbr[month_number])
)
# Year of each reading
electricity_demand_df['Year'] = (
    electricity_demand_df['Date_Time'].dt.year
)
#Calculate the hour with max demand for each date in the data set
electricity_demand_df['Peak_Demand_Hour_MWh_For_Day']=electricity_demand_df.groupby(['Day_Of_Month',
'Month',
def plot_data(df, x_variable, y_variable, title):
    """
    Plot the x- and y- variables against each other, where the variables are
    columns in a pandas dataframe.

    Arguments:
        df: Pandas dataframe.
        x_variable: String. Name of x-variable column
        y_variable: String. Name of y-variable column
        title: String. Desired title name

    NOTE(review): only the figure/axes creation is visible in this excerpt;
    the plotting calls presumably follow -- confirm against the full source.
    """
    # Create the figure and axes objects that the plot will be drawn on
    fig, ax = plt.subplots()
import eia
import pandas as pd
def retrieve_time_series(api, series_ID):
    """
    Return the time series dataframe, based on API and unique Series ID.

    Arguments:
        api: API that we're connected to
        series_ID: String. Name of the series that we want to pull from the
            EIA API

    NOTE(review): the retrieval code is not visible in this excerpt; this
    docstring only restates the documented contract -- confirm against the
    full source.
    """
    #Retrieve Data By Series ID
import eia
import pandas as pd
def retrieve_time_series(api, series_ID):
    """
    Return the time series dataframe, based on API and unique Series ID.

    Arguments:
        api: API that we're connected to
        series_ID: String. Name of the series that we want to pull from the
            EIA API

    NOTE(review): the retrieval code is not visible in this excerpt; this
    docstring only restates the documented contract -- confirm against the
    full source.
    """
    #Retrieve Data By Series ID
def calculate_model_accuracy_metrics(actual, predicted):
"""
Output model accuracy metrics, comparing predicted values
to actual values.
Arguments:
actual: list. Time series of actual values.
predicted: list. Time series of predicted values
Outputs:
Forecast bias metrics, mean absolute error, mean squared error,
and root mean squared error in the console