Kirsten Perry kperry2215

Data scientist, Solar @ National Renewable Energy Lab. Formerly data scientist @ BP

kperry2215 / alpha_vantage_intraday.py

Created January 13, 2020 03:13

	from alpha_vantage.timeseries import TimeSeries
	import pandas as pd
	import matplotlib.pyplot as plt

	alpha_vantage_api_key = "YOUR API KEY HERE"

	def pull_intraday_time_series_alpha_vantage(alpha_vantage_api_key, ticker_name, data_interval = '15min'):
	"""
	Pull intraday time series data by stock ticker name.
	Args:

kperry2215 / visualize_forecast.py

Created January 4, 2020 23:42

	def plot_results(mean_predicted_values, confidence_interval_predicted_values, time_series):
	"""
	This function plots actual time series data against SARIMA model-predicted values.
	We include the confidence interval for the predictions.
	Args:
	mean_predicted_values: Series of float values. The model-predicted values.
	confidence_interval_predicted_values: Pandas dataframe, containing the lower and
	upper confidence intervals.
	time_series: Series of float values. Actual time series values that we want to graph
	Outputs:

kperry2215 / fit_predictions.py

Created January 4, 2020 23:41

	def fit_predictions(model_fit, steps_out_to_predict, actual_values):
	"""
	This function predicts the SARIMA model out a certain designated number of steps,
	and compares the predictions to the actual values. The root mean squared error and
	the mean absolute error are calculated, comparing the predicted and actual values.
	The function returns the predicted values and their respective confidence intervals.
	Args:
	model_fit: SARIMA model.
	steps_out_to_predict: Int. Number of steps out to predict the time series.
	actual_values: Series of actual time series values.

kperry2215 / seasonal_arima_model.py

Created January 4, 2020 23:37

	def seasonal_arima_model(time_series, order, seasonal_order, trend):
	"""
	Generate a seasonal ARIMA model using a set of hyperparameters. Returns the model fit, and the
	associated model AIC and BIC values.
	"""
	try:
	model = sm_api.tsa.SARIMAX(time_series,
	order=order,
	seasonal_order=seasonal_order,
	trend = trend,

kperry2215 / train_test_split.py

Created January 4, 2020 23:34

	def time_series_train_test_split(time_series, train_split_fraction):
	    """
	    Split a time series chronologically into a training set and a test set.
	    The first `train_split_fraction` of the observations form the training set;
	    the remaining observations form the test set. The two sets do not overlap.
	    Args:
	    time_series: Sliceable, ordered sequence exposing `.shape` (for example a
	    pandas Series or a numpy array).
	    train_split_fraction: Float. Fraction of the observations assigned to the
	    training set.
	    Outputs:
	    train_set: The observations before the split index.
	    test_set: The observations from the split index onward.
	    """
	    split_index = int(round(time_series.shape[0] * train_split_fraction, 0))
	    train_set = time_series[:split_index]
	    # Slice from the split index to the end. The previous `[:-split_index]`
	    # returned the *first* n - split_index rows, which overlap the training set.
	    test_set = time_series[split_index:]
	    return train_set, test_set

	### EXECUTE IN MAIN FUNCTION ###

kperry2215 / hyperparameter_search_sarima.py

Created January 4, 2020 23:33

	def sarima_parameter_search(search_range, seasonal=None):
	    """
	    Get all of the parameter combinations for a SARIMA model.
	    Args:
	    search_range: Int. Exclusive upper bound for each of p, d and q; every value
	    in range(0, search_range) is tried for each of them.
	    seasonal: Iterable of seasonal period values to combine with (p, d, q).
	    Defaults to [12] when omitted.
	    Outputs:
	    pdq: List of (p, d, q) tuples.
	    seasonal_pdq_combinations: List of (p, d, q, seasonal, trend) tuples, where
	    trend is one of 'n', 'c', 't', 'ct'.
	    """
	    # Imported locally so the function does not depend on a module-level import.
	    import itertools

	    # Avoid a mutable default argument while keeping the original default value.
	    if seasonal is None:
	        seasonal = [12]
	    p = q = d = range(0, search_range)
	    trend = ['n', 'c', 't', 'ct']
	    pdq = list(itertools.product(p, d, q))
	    # itertools.product already yields 5-tuples, so no re-packing is needed.
	    seasonal_pdq_combinations = list(itertools.product(p, d, q, seasonal, trend))
	    # The original returned this name without ever defining it (NameError).
	    return pdq, seasonal_pdq_combinations

kperry2215 / time_series_decomp.py

Created January 4, 2020 23:31

	from statsmodels.tsa.seasonal import seasonal_decompose

	def decompose_time_series(series, frequency):
	"""
	Decompose a time series and plot it in the console
	Arguments:
	series: series. Time series that we want to decompose
	Outputs:
	Decomposition plot in the console
	"""

kperry2215 / pull_geothermal_data.py

Created January 4, 2020 23:29

	import eia
	import pandas as pd
	import matplotlib.pyplot as plt

	def retrieve_time_series(api, series_ID):
	"""
	Return the time series dataframe, based on API and unique Series ID
	"""
	#Retrieve Data By Series ID
	series_search = api.data_by_series(series=series_ID)

kperry2215 / mann_whitney_u_test.py

Created November 21, 2019 05:19

	def mann_whitney_u_test(distribution_1, distribution_2):
	"""
	Perform the Mann-Whitney U Test, comparing two different distributions.
	Args:
	distribution_1: List.
	distribution_2: List.
	Outputs:
	u_statistic: Float. U statistic for the test.
	p_value: Float.
	"""

kperry2215 / subset_distribution.py

Created November 21, 2019 05:18

	# Subset the data into the two salary categories.
	# NOTE: both category labels begin with a space (' <=50K', ' >50K'); the
	# comparison is an exact string match, so the leading space is required.
	# Presumably the raw 'salary' values carry that whitespace -- verify against
	# the loaded data.
	df_less_than_50k = df[df['salary'] == ' <=50K']
	df_greater_than_50k = df[df['salary'] == ' >50K']
	# Plot the histogram of the 'age' column for the <=$50K subset.
	# generate_distribution_histogram is defined elsewhere in the project.
	generate_distribution_histogram(df_less_than_50k, 'age',
	title = 'Age Distribution: US Population',
	x_axis_label = 'Age (years)',
	y_axis_label = 'Frequency',
	label_name = '<=$50K')
	#Plot the histogram for the distribution for data >$50K