This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def plot_data(df, x_variable, y_variable, title): | |
""" | |
Plot the x- and y- variables against each other, where the variables are columns in | |
a pandas dataframe | |
df: Pandas dataframe. | |
x_variable: String. Name of x-variable column | |
y_variable: String. Name of y-variable column | |
title: String. Desired title name | |
""" | |
fig, ax = plt.subplots() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Derive calendar columns from each reading's Date_Time timestamp
reading_times = electricity_demand_df['Date_Time'].dt
# Day of the month
electricity_demand_df['Day_Of_Month'] = reading_times.day
# Month, as the abbreviated name looked up in calendar.month_abbr
electricity_demand_df['Month'] = reading_times.month.apply(
    lambda month_number: calendar.month_abbr[month_number]
)
# Year
electricity_demand_df['Year'] = reading_times.year
#Calculate the hour with max demand for each date in the data set | |
electricity_demand_df['Peak_Demand_Hour_MWh_For_Day']=electricity_demand_df.groupby(['Day_Of_Month', | |
'Month', |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tally how many times each Peak_Demand_Hour value occurs and draw the tallies as a bar chart
ax = (
    peak_demand_hour_df['Peak_Demand_Hour']
    .value_counts()
    .plot.bar(title='Peak Demand Hour by Number of Occurrences')
)
# Label both axes
ax.set_xlabel("Demand Hour (0-23 hour)")
ax.set_ylabel("Number of Occurrences")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def generate_histogram_of_aggregated_counts(df, | |
peak_demand_hour_column, | |
group_by_column): | |
""" | |
Generate a histogram of peak demand hour counts, grouped by a column | |
Arguments: | |
df: Pandas dataframe | |
peak_demand_hour_column: String. Name of the column for peak demand hour | |
group_by_column: String. Name of column to group by | |
""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Subset the dataframe to only the label (Peak_Demand_Hour) and the feature columns
# used in the model. The explicit .copy() makes the subset an independent dataframe,
# so the column assignments below modify it directly rather than writing into a slice
# of peak_demand_hour_df (which triggers pandas' SettingWithCopyWarning).
peak_demand_hour_model = peak_demand_hour_df[['Peak_Demand_Hour',
                                              'Day_Of_Week',
                                              'Week',
                                              'Month']].copy()
# Convert the Week and Peak_Demand_Hour variables into categorical string variables
# (from numeric). Whole-column assignment replaces the column, so the dtype change
# from numeric to string is explicit instead of an in-place write of strings into a
# numeric column.
peak_demand_hour_model['Week'] = peak_demand_hour_model['Week'].astype(str)
peak_demand_hour_model['Peak_Demand_Hour'] = (
    'Hour ' + peak_demand_hour_model['Peak_Demand_Hour'].astype(str)
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Partition features and labels into training and testing sets: 25% of the rows are
# held out for testing, and the fixed random_state keeps the split reproducible
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=5,
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def grid_search_rf(parameter_grid, train_features, train_labels): | |
""" | |
Perform Grid Search on the random forest classifier model, in order to optimize model | |
parameters | |
parameter_grid: grid parameters to test against to determine optimal parameters | |
train_features: Numpy array, containing training set features | |
train_labels: Numpy array, containing training set labels | |
""" | |
# Create a random forest classifier model | |
rf = RandomForestClassifier() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the final random forest classifier with the optimized parameter values
rf = RandomForestClassifier(
    n_estimators=1100,
    max_depth=100,
    random_state=1500,
)
# Train it on the training split
rf.fit(train_features, train_labels)
# Use the forest's predict method on the test data | |
print(confusion_matrix(test_labels, | |
rf.predict(test_features), | |
labels=['Hour 8', 'Hour 9', 'Hour 10', |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tabulate the fitted model's feature importances, one row per name in feature_list
importance_table = pd.DataFrame(rf.feature_importances_,
                                index=feature_list,
                                columns=['importance'])
# Rank the features from most to least important and display the result
feature_importances = importance_table.sort_values(by='importance', ascending=False)
print(feature_importances)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def retrieve_time_series(api, series_ID): | |
""" | |
Return the time series dataframe, based on API and unique Series ID | |
api: API that we're connected to | |
series_ID: string. Name of the series that we want to pull from the EIA API | |
""" | |
#Retrieve Data By Series ID | |
series_search = api.data_by_series(series=series_ID) | |
##Create a pandas dataframe from the retrieved time series | |
df = pd.DataFrame(series_search) |