Kirsten Perry kperry2215

Data scientist, Solar @ National Renewable Energy Lab. Formerly data scientist @ BP

kperry2215 / ruptures_code.py

Last active August 15, 2019 02:06

	def retrieve_time_series(api, series_ID):
	"""
	Return the time series dataframe, based on API and unique Series ID
	api: API that we're connected to
	series_ID: string. Name of the series that we want to pull from the EIA API
	"""
	#Retrieve Data By Series ID
	series_search = api.data_by_series(series=series_ID)
	##Create a pandas dataframe from the retrieved time series
	df = pd.DataFrame(series_search)

kperry2215 / ruptures.py

Last active August 15, 2019 02:12

	# Build the EIA API client from your personal API key.
	# NOTE: 'YOUR API KEY HERE' is a placeholder and must be replaced with a real key.
	api_key = 'YOUR API KEY HERE'
	api = eia.API(api_key)

	# Series ID used to pull the WTI oil price data.
	series_ID='PET.RWTC.D'
	# Retrieve the data for that series ID through the API client.
	series_search = api.data_by_series(series=series_ID)
	# Create a pandas dataframe from the retrieved time series.
	price_df = pd.DataFrame(series_search)

kperry2215 / plot_gasoline_prices.py

Created August 24, 2019 00:20

	import pandas as pd
	import matplotlib.pyplot as plt
	import eia

	def retrieve_time_series(api, series_ID):
	"""
	Return the time series dataframe, based on API and unique Series ID
	Arguments:
	api: API that we're connected to
	series_ID: string. Name of the series that we want to pull from the EIA API

kperry2215 / generate_anomalies.py

Created August 24, 2019 00:22

	# Hand-picked anomalous data points, added so the detection algorithm has
	# known outliers to find.
	# NOTE(review): keys are presumably positions in the series and values the
	# substituted readings -- confirm against the code that consumes this mapping.
	anomaly_dictionary = {
		80: 3.1,
		200: 3,
		333: 1,
		600: 2.6,
		710: 2.1,
		890: 2.3,
		1100: 1,
		1211: 2.6,
		1309: 2.3,
	}

kperry2215 / low_pass_filter.py

Created August 24, 2019 00:23

	def low_pass_filter_anomaly_detection(df,
	column_name,
	number_of_stdevs_away_from_mean):
	"""
	Implement a low-pass filter to detect anomalies in a time series, and save the filter outputs
	(True/False) to a new column in the dataframe.
	Arguments:
	df: Pandas dataframe
	column_name: string. Name of the column that we want to detect anomalies in
	number_of_stdevs_away_from_mean: float. Number of standard deviations away from

kperry2215 / isolation_forest.py

Created August 24, 2019 00:25

	def isolation_forest_anomaly_detection(df,
	column_name,
	outliers_fraction):
	"""
	In this definition, time series anomalies are detected using an Isolation Forest algorithm.
	Arguments:
	df: Pandas dataframe
	column_name: string. Name of the column that we want to detect anomalies in
	outliers_fraction: float. Percentage of outliers allowed in the sequence.
	Outputs:

kperry2215 / stl_decomposition.py

Created August 24, 2019 00:26

	def decompose_time_series(series, desired_frequency):
	"""
	Perform STL decomposition on the time series.
	Arguments:
	series: Pandas series. Time series sequence that we wish to decompose.
	desired_frequency: Integer. Time frequency of the series. If we want to detect
	a yearly trend, we'd set the value equal to 365.
	Outputs:
	Plot of time series STL decomposition.
	"""

kperry2215 / sesd_algorithm.py

Created August 24, 2019 00:28

	def sesd_anomaly_detection(dataframe,
	column_name,
	desired_frequency,
	max_anomalies,
	alpha_level):
	"""
	In this definition, time series anomalies are detected using the S-ESD algorithm.
	Arguments:
	dataframe: Pandas dataframe
	column_name: string. Name of the column that we want to detect anomalies in

kperry2215 / one_class_SVM.py

Created August 24, 2019 00:29

	def one_class_SVM_anomaly_detection(dataframe, columns_to_filter_by, outliers_fraction):
	"""
	In this definition, time series anomalies are detected
	using a One Class SVM algorithm.
	Arguments:
	dataframe: Pandas dataframe
	columns_to_filter_by: string, or list of strings. Name of the column(s) that
	we want to use in the One Class SVM to detect time series anomalies
	outliers_fraction: float. Percentage of outliers allowed in the sequence.
	Outputs:

kperry2215 / breast_cancer_data_set.py

Created October 23, 2019 01:34

	# Read in the cancer data set directly from the UCI repository URL.
	# header=None: no row of the file is treated as column names, so they are assigned below.
	df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data', header=None)
	# Declare the ten column names of the cancer data set
	# (presumably in the same order as the columns in the file -- confirm against the data set description).
	df.columns=["Class", "Age", "Menopause",
	"Tumor_Size", "Inv_Nodes",
	"Node_Caps", "Deg_Malig",
	"Breast", "Breast_quad",
	"Irradiat"]
	# Prepare to convert the categorical feature variables to numeric (using LabelEncoder).
	# The defaultdict creates a separate LabelEncoder the first time each key is looked up;
	# the encoding step itself is not part of this snippet.
	d = defaultdict(LabelEncoder)