Skip to content

Instantly share code, notes, and snippets.

View kperry2215's full-sized avatar

Kirsten Perry kperry2215

View GitHub Profile
def plot_data(df, x_variable, y_variable, title):
    """
    Plot the x- and y- variables against each other, where the variables are columns in
    a pandas dataframe
    df: Pandas dataframe.
    x_variable: String. Name of x-variable column
    y_variable: String. Name of y-variable column
    title: String. Desired title name
    """
    # Create a fresh figure/axes pair for the plot.
    fig, ax = plt.subplots()
    # NOTE(review): snippet truncated in this capture — the code that actually
    # plots x_variable vs y_variable (and applies `title`) is not visible here.
# --- Script fragment: derive calendar features from the 'Date_Time' column ---
#Pull the day of month for each reading
electricity_demand_df['Day_Of_Month']=electricity_demand_df['Date_Time'].dt.day
#Pull the month of the year, as a three-letter abbreviation (e.g. 'Jan')
electricity_demand_df['Month']=electricity_demand_df['Date_Time'].dt.month.apply(lambda x: calendar.month_abbr[x])
#Pull the year
electricity_demand_df['Year']=electricity_demand_df['Date_Time'].dt.year
#Calculate the hour with max demand for each date in the data set
# NOTE(review): the groupby() call below is cut off mid-argument-list in this
# capture — the remaining group keys, the aggregation, and the closing
# parentheses are not visible here.
electricity_demand_df['Peak_Demand_Hour_MWh_For_Day']=electricity_demand_df.groupby(['Day_Of_Month',
                                                                                     'Month',
# Bar chart of how often each hour of the day was the daily peak-demand hour.
hour_counts = peak_demand_hour_df['Peak_Demand_Hour'].value_counts()
ax = hour_counts.plot(kind='bar',
                      title='Peak Demand Hour by Number of Occurrences')
# Label the chart's axes.
ax.set_xlabel("Demand Hour (0-23 hour)")
ax.set_ylabel("Number of Occurrences")
def generate_histogram_of_aggregated_counts(df,
                                            peak_demand_hour_column,
                                            group_by_column):
    """
    Generate a histogram of peak demand hour counts, grouped by a column
    Arguments:
    df: Pandas dataframe
    peak_demand_hour_column: String. Name of the column for peak demand hour
    group_by_column: String. Name of column to group by
    """
    # NOTE(review): snippet truncated in this capture — the function body
    # (the grouping and plotting this docstring describes) is not visible here.
# --- Script fragment: build the modeling frame from peak_demand_hour_df ---
# Subset the dataframe to only include the features and labels that we're going
# to use in the model. The explicit .copy() ensures the .loc assignments below
# modify this new frame (not a view of peak_demand_hour_df), which avoids
# pandas' SettingWithCopyWarning / chained-assignment ambiguity.
peak_demand_hour_model = peak_demand_hour_df[['Peak_Demand_Hour',
                                              'Day_Of_Week',
                                              'Week',
                                              'Month']].copy()
# Convert the Week and Peak_Demand_Hour columns into categorical string
# variables (from numeric); the peak hour is prefixed with 'Hour ' so it reads
# as a label rather than a number.
peak_demand_hour_model.loc[:, 'Week'] = peak_demand_hour_model['Week'].apply(str)
peak_demand_hour_model.loc[:, 'Peak_Demand_Hour'] = 'Hour ' + peak_demand_hour_model['Peak_Demand_Hour'].apply(str)
# Hold out 25% of the data for testing; the fixed random_state makes the split
# reproducible across runs. `features` and `labels` are built earlier in the script.
split_result = train_test_split(features,
                                labels,
                                test_size=0.25,
                                random_state=5)
train_features, test_features, train_labels, test_labels = split_result
def grid_search_rf(parameter_grid, train_features, train_labels):
    """
    Perform Grid Search on the random forest classifier model, in order to optimize model
    parameters
    parameter_grid: grid parameters to test against to determine optimal parameters
    train_features: Numpy array, containing training set features
    train_labels: Numpy array, containing training set labels
    """
    # NOTE(review): snippet truncated in this capture — the grid-search
    # construction/fit this docstring describes is not visible here.
# --- Script fragment: train the final random forest and evaluate on the test set ---
# Create a random forest classifier model
rf = RandomForestClassifier()
# Plug in optimized model parameters into final RF model
# NOTE(review): this immediately rebinds `rf`, so the default-parameter model
# constructed on the line above is never used.
rf = RandomForestClassifier(n_estimators=1100,
                            max_depth=100,
                            random_state = 1500)
# Fit the model on the training data
rf.fit(train_features, train_labels)
# Use the forest's predict method on the test data and print a confusion matrix.
# NOTE(review): the call below is cut off mid-argument in this capture — the
# rest of the `labels` list and the closing parentheses are not visible here.
print(confusion_matrix(test_labels,
                       rf.predict(test_features),
                       labels=['Hour 8', 'Hour 9', 'Hour 10',
# Rank the fitted forest's features by importance, most important first,
# and print the resulting table.
importance_frame = pd.DataFrame(rf.feature_importances_,
                                index=feature_list,
                                columns=['importance'])
feature_importances = importance_frame.sort_values('importance', ascending=False)
print(feature_importances)
def retrieve_time_series(api, series_ID):
    """
    Return the time series dataframe, based on API and unique Series ID
    api: API that we're connected to
    series_ID: string. Name of the series that we want to pull from the EIA API
    """
    # Retrieve Data By Series ID
    series_search = api.data_by_series(series=series_ID)
    # Create a pandas dataframe from the retrieved time series
    df = pd.DataFrame(series_search)
    # NOTE(review): snippet truncated in this capture — the docstring says a
    # dataframe is returned, but no `return df` is visible here.