Kirsten Perry kperry2215

Data scientist, Solar @ National Renewable Energy Lab. Formerly data scientist @ BP

kperry2215 / h2o_automl.py

Created October 23, 2019 01:36

	def run_h2o_automl(dataframe, variable_to_predict,
	max_number_models):
	"""
	This function initiates an h2o cluster, converts
	the dataframe to an h2o dataframe, and then runs
	the autoML function to generate a list of optimal
	predictor models. The best models are displayed via a
	scoreboard.
	Arguments:
	dataframe: Pandas dataframe.

kperry2215 / aml_performance.py

Created October 23, 2019 01:38

	#Score the leader model of aml against the test data and print the result
	#NOTE(review): aml and test are defined outside this snippet -- presumably the
	#AutoML object and the held-out test frame; confirm against the training code
	performance = aml.leader.model_performance(test)
	print(performance)

kperry2215 / run_tpot_automl.py

Created October 23, 2019 01:40

	def run_tpot_automl(dataframe,
	variable_to_predict,
	number_generations,
	file_to_export_pipeline_to = 'tpot_classifier_pipeline.py'):
	"""
	This function runs a TPOT classifier on the dataset, after splitting into
	a training and test set, and then oversampling the training set.
	Args:
	dataframe: pandas dataframe. Master dataframe containing the feature and target
	data

kperry2215 / tpot_automated_pipeline.py

Created October 23, 2019 01:41

	import numpy as np
	import pandas as pd
	from sklearn.ensemble import ExtraTreesClassifier
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import BernoulliNB
	from sklearn.pipeline import make_pipeline, make_union
	from sklearn.preprocessing import RobustScaler
	from tpot.builtins import StackingEstimator

	# NOTE: Make sure that the class is labeled 'target' in the data file

kperry2215 / tpot_results.py

Created October 23, 2019 01:42

	#Round the values in results to zero decimals
	#NOTE(review): presumably this turns the model output into class labels -- confirm
	results_rounded = np.round(results)
	#Build the confusion matrix of testing_target vs. the rounded results.
	#The return value is not assigned, so it is only shown when this runs as the
	#last expression of an interactive cell
	confusion_matrix(testing_target, results_rounded)

kperry2215 / adult_dataset.py

Created November 21, 2019 05:15

	#Read the adult data set directly from the UCI repository URL.
	#header = None: the first row of the file is read as data, not as column names
	df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)
	#Declare the column names of the data set
	#(15 names, assigned positionally to the columns read above)
	df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
	'marital-status', 'occupation', 'relationship',
	'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
	'native-country', 'salary']

kperry2215 / distribution_histogram.py

Created November 21, 2019 05:16

	import matplotlib.pyplot as plt

	def generate_distribution_histogram(dataframe,
	column_name,
	title, x_axis_label, y_axis_label,
	label_name,
	number_bins = 15):
	"""
	This function generates a histogram.
	Args:

kperry2215 / subset_distribution.py

Created November 21, 2019 05:18

	#Subset the data into salary categories
	#NOTE: the salary labels are matched including their leading space
	#(' <=50K' and ' >50K'); rows whose label differs in any character are dropped
	df_less_than_50k = df[df['salary'] == ' <=50K']
	df_greater_than_50k = df[df['salary'] == ' >50K']
	#Plot the histogram for the distribution for data <=$50K
	#(number_bins is not passed, so the function's default is used)
	generate_distribution_histogram(df_less_than_50k, 'age',
	title = 'Age Distribution: US Population',
	x_axis_label = 'Age (years)',
	y_axis_label = 'Frequency',
	label_name = '<=$50K')
	#Plot the histogram for the distribution for data >$50K

kperry2215 / mann_whitney_u_test.py

Created November 21, 2019 05:19

	def mann_whitney_u_test(distribution_1, distribution_2):
	"""
	Perform the Mann-Whitney U Test, comparing two different distributions.
	Args:
	distribution_1: List.
	distribution_2: List.
	Outputs:
	u_statistic: Float. U statistic for the test.
	p_value: Float.
	"""

kperry2215 / pull_geothermal_data.py

Created January 4, 2020 23:29

	import eia
	import pandas as pd
	import matplotlib.pyplot as plt

	def retrieve_time_series(api, series_ID):
	"""
	Return the time series dataframe, based on API and unique Series ID
	"""
	#Retrieve Data By Series ID
	series_search = api.data_by_series(series=series_ID)