Skip to content

Instantly share code, notes, and snippets.

View kperry2215's full-sized avatar

Kirsten Perry kperry2215

View GitHub Profile
def plot_data(df, x_variable, y_variable, title):
    """
    Plot the x- and y- variables against each other, where the variables are columns in
    a pandas dataframe
    df: Pandas dataframe.
    x_variable: String. Name of x-variable column
    y_variable: String. Name of y-variable column
    title: String. Desired title name
    """
    # Create a fresh figure/axes pair for the plot.
    fig, ax = plt.subplots()
    # NOTE(review): snippet truncated in this capture — the code that actually
    # plots x_variable vs y_variable (and applies `title`) is not visible here.
# --- Script fragment: derive calendar features from the 'Date_Time' column ---
#Pull the day of month for each reading
electricity_demand_df['Day_Of_Month']=electricity_demand_df['Date_Time'].dt.day
#Pull the month of the year, as a three-letter abbreviation (e.g. 'Jan')
electricity_demand_df['Month']=electricity_demand_df['Date_Time'].dt.month.apply(lambda x: calendar.month_abbr[x])
#Pull the year
electricity_demand_df['Year']=electricity_demand_df['Date_Time'].dt.year
#Calculate the hour with max demand for each date in the data set
# NOTE(review): the groupby() call below is cut off mid-argument-list in this
# capture — the remaining group keys, the aggregation, and the closing
# parentheses are not visible here.
electricity_demand_df['Peak_Demand_Hour_MWh_For_Day']=electricity_demand_df.groupby(['Day_Of_Month',
                                                                                     'Month',
# Bar chart of how often each hour of the day was the daily peak-demand hour.
hour_counts = peak_demand_hour_df['Peak_Demand_Hour'].value_counts()
ax = hour_counts.plot(kind='bar',
                      title='Peak Demand Hour by Number of Occurrences')
# Label the chart's axes.
ax.set_xlabel("Demand Hour (0-23 hour)")
ax.set_ylabel("Number of Occurrences")
def generate_histogram_of_aggregated_counts(df,
                                            peak_demand_hour_column,
                                            group_by_column):
    """
    Generate a histogram of peak demand hour counts, grouped by a column
    Arguments:
    df: Pandas dataframe
    peak_demand_hour_column: String. Name of the column for peak demand hour
    group_by_column: String. Name of column to group by
    """
    # NOTE(review): snippet truncated in this capture — the function body
    # (the grouping and plotting this docstring describes) is not visible here.
# --- Script fragment: build the modeling frame from peak_demand_hour_df ---
# Subset the dataframe to only include the features and labels that we're going
# to use in the model. The explicit .copy() ensures the .loc assignments below
# modify this new frame (not a view of peak_demand_hour_df), which avoids
# pandas' SettingWithCopyWarning / chained-assignment ambiguity.
peak_demand_hour_model = peak_demand_hour_df[['Peak_Demand_Hour',
                                              'Day_Of_Week',
                                              'Week',
                                              'Month']].copy()
# Convert the Week and Peak_Demand_Hour columns into categorical string
# variables (from numeric); the peak hour is prefixed with 'Hour ' so it reads
# as a label rather than a number.
peak_demand_hour_model.loc[:, 'Week'] = peak_demand_hour_model['Week'].apply(str)
peak_demand_hour_model.loc[:, 'Peak_Demand_Hour'] = 'Hour ' + peak_demand_hour_model['Peak_Demand_Hour'].apply(str)
# Hold out 25% of the data for testing; the fixed random_state makes the split
# reproducible across runs. `features` and `labels` are built earlier in the script.
split_result = train_test_split(features,
                                labels,
                                test_size=0.25,
                                random_state=5)
train_features, test_features, train_labels, test_labels = split_result
def grid_search_rf(parameter_grid, train_features, train_labels):
    """
    Perform Grid Search on the random forest classifier model, in order to optimize model
    parameters
    parameter_grid: grid parameters to test against to determine optimal parameters
    train_features: Numpy array, containing training set features
    train_labels: Numpy array, containing training set labels
    """
    # NOTE(review): snippet truncated in this capture — the grid-search
    # construction/fit this docstring describes is not visible here.
# --- Script fragment: train the final random forest and evaluate on the test set ---
# Create a random forest classifier model
rf = RandomForestClassifier()
# Plug in optimized model parameters into final RF model
# NOTE(review): this immediately rebinds `rf`, so the default-parameter model
# constructed on the line above is never used.
rf = RandomForestClassifier(n_estimators=1100,
                            max_depth=100,
                            random_state = 1500)
# Fit the model on the training data
rf.fit(train_features, train_labels)
# Use the forest's predict method on the test data and print a confusion matrix.
# NOTE(review): the call below is cut off mid-argument in this capture — the
# rest of the `labels` list and the closing parentheses are not visible here.
print(confusion_matrix(test_labels,
                       rf.predict(test_features),
                       labels=['Hour 8', 'Hour 9', 'Hour 10',
# Rank the fitted forest's features by importance, most important first,
# and print the resulting table.
importance_frame = pd.DataFrame(rf.feature_importances_,
                                index=feature_list,
                                columns=['importance'])
feature_importances = importance_frame.sort_values('importance', ascending=False)
print(feature_importances)
def retrieve_time_series(api, series_ID):
    """
    Return the time series dataframe, based on API and unique Series ID
    api: API that we're connected to
    series_ID: string. Name of the series that we want to pull from the EIA API
    """
    # Retrieve Data By Series ID
    series_search = api.data_by_series(series=series_ID)
    # Create a pandas dataframe from the retrieved time series
    df = pd.DataFrame(series_search)
    # NOTE(review): snippet truncated in this capture — the docstring says a
    # dataframe is returned, but no `return df` is visible here.