Kirsten Perry kperry2215
import matplotlib.pyplot as plt
def generate_distribution_histogram(dataframe, column_name,
                                    title, x_axis_label, y_axis_label,
                                    label_name, number_bins=15):
    """
    This function generates a histogram of dataframe[column_name].
    Args:
        dataframe: pandas dataframe. Data containing the column to plot.
        column_name: string. Name of the column to build the histogram from.
        title, x_axis_label, y_axis_label, label_name: strings. Plot and legend labels.
        number_bins: int. Number of histogram bins. Defaults to 15.
    """
    plt.hist(dataframe[column_name], bins=number_bins, label=label_name)
    plt.title(title)
    plt.xlabel(x_axis_label)
    plt.ylabel(y_axis_label)
    plt.legend()
    plt.show()
import pandas as pd

# Read in the UCI adult census data set (the raw file has no header row)
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None)
# Declare the column names of the data set
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
              'marital-status', 'occupation', 'relationship',
              'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
              'native-country', 'salary']
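As a usage sketch, the histogram helper above could plot the age distribution of this census data; the title, axis labels, and bin count below are illustrative choices, not from the original gist.

generate_distribution_histogram(df, 'age',
                                title='Age Distribution: Adult Census Data',
                                x_axis_label='Age (years)',
                                y_axis_label='Frequency',
                                label_name='Age',
                                number_bins=20)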
import numpy as np
from sklearn.metrics import confusion_matrix
# Round predicted class probabilities to labels; 'results' and 'testing_target' come from an earlier prediction step (not shown)
results_rounded = np.round(results)
confusion_matrix(testing_target, results_rounded)
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import RobustScaler
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
def run_tpot_automl(dataframe,
                    variable_to_predict,
                    number_generations,
                    file_to_export_pipeline_to='tpot_classifier_pipeline.py'):
    """
    This function runs a TPOT classifier on the dataset, after splitting into
    a training and test set, and then oversampling the training set.
    Args:
        dataframe: pandas dataframe. Master dataframe containing the feature and
            target data.
        variable_to_predict: string. Name of the target column to predict.
        number_generations: int. Number of generations TPOT evolves pipelines for.
        file_to_export_pipeline_to: string. Path of the Python file that the best
            pipeline is exported to.
    """
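The body of run_tpot_automl is not shown above. A minimal sketch of how it could be implemented, assuming TPOT's TPOTClassifier API and imbalanced-learn's RandomOverSampler for the oversampling step; the sketch name, split ratio, random seeds, and verbosity level are illustrative assumptions.

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

def run_tpot_automl_sketch(dataframe, variable_to_predict, number_generations,
                           file_to_export_pipeline_to='tpot_classifier_pipeline.py'):
    # Separate the features from the target column
    X = dataframe.drop(columns=[variable_to_predict])
    y = dataframe[variable_to_predict]
    # Hold out a test set before any resampling
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=42)
    # Oversample only the training set so the test set stays untouched
    X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
    # Let TPOT search for a pipeline, score it on the test set, and export it
    tpot = TPOTClassifier(generations=number_generations, verbosity=2)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export(file_to_export_pipeline_to)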
# Evaluate the H2O AutoML leader model on a held-out test frame
performance = aml.leader.model_performance(test)
print(performance)

def run_h2o_automl(dataframe, variable_to_predict,
                   max_number_models):
    """
    This function initiates an h2o cluster, converts
    the dataframe to an h2o dataframe, and then runs
    the autoML function to generate a list of optimal
    predictor models. The best models are displayed via a
    leaderboard.
    Arguments:
        dataframe: Pandas dataframe.
        variable_to_predict: string. Name of the target column to predict.
        max_number_models: int. Maximum number of models for AutoML to train.
    """
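A hedged sketch of what the body of this function might look like with the standard h2o.automl API; the sketch name, split ratio, seeds, and the (aml, test) return value are assumptions made so the leader-model evaluation snippet above has inputs to work with.

import h2o
from h2o.automl import H2OAutoML

def run_h2o_automl_sketch(dataframe, variable_to_predict, max_number_models):
    h2o.init()
    # Convert the pandas dataframe to an H2OFrame
    hf = h2o.H2OFrame(dataframe)
    # Treat the target as categorical so AutoML runs a classification task
    hf[variable_to_predict] = hf[variable_to_predict].asfactor()
    train, test = hf.split_frame(ratios=[0.75], seed=42)
    aml = H2OAutoML(max_models=max_number_models, seed=42)
    aml.train(y=variable_to_predict, training_frame=train)
    # Display the leaderboard of the best models found
    print(aml.leaderboard)
    return aml, test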
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

# Read in the UCI breast cancer data set (the raw file has no header row)
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data', header=None)
# Declare the column names of the cancer data set
df.columns = ["Class", "Age", "Menopause",
              "Tumor_Size", "Inv_Nodes",
              "Node_Caps", "Deg_Malig",
              "Breast", "Breast_quad",
              "Irradiat"]
# Convert all of the categorical feature variables to numeric (use LabelEncoder)
d = defaultdict(LabelEncoder)
# Fit one LabelEncoder per column and encode the whole frame
df = df.apply(lambda x: d[x.name].fit_transform(x))
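A small usage note, assuming the encoded df above: the same dictionary of per-column encoders can map the numeric codes back to the original categories.

# Recover the original string categories from the encoded values
decoded = df.apply(lambda x: d[x.name].inverse_transform(x))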
def one_class_SVM_anomaly_detection(dataframe, columns_to_filter_by, outliers_fraction):
    """
    In this definition, time series anomalies are detected
    using a One Class SVM algorithm.
    Arguments:
        dataframe: Pandas dataframe
        columns_to_filter_by: string, or list of strings. Name of the column(s) that
            we want to use in the One Class SVM to detect time series anomalies
        outliers_fraction: float. Percentage of outliers allowed in the sequence.
    Outputs:
    """
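The rest of this gist is not shown above. A minimal sketch of the technique, assuming scikit-learn's OneClassSVM with nu set from outliers_fraction; the sketch name, kernel settings, and the 'anomaly' flag column are illustrative assumptions.

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

def one_class_svm_anomaly_sketch(dataframe, columns_to_filter_by, outliers_fraction):
    # Pull out the column(s) used for detection and scale them
    columns = [columns_to_filter_by] if isinstance(columns_to_filter_by, str) else list(columns_to_filter_by)
    scaled = StandardScaler().fit_transform(dataframe[columns])
    # nu upper-bounds the fraction of training points treated as outliers
    model = OneClassSVM(nu=outliers_fraction, kernel='rbf', gamma='scale')
    model.fit(scaled)
    # predict() returns -1 for anomalies and 1 for normal observations
    flagged = dataframe.copy()
    flagged['anomaly'] = model.predict(scaled)
    return flagged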
def sesd_anomaly_detection(dataframe,
                           column_name,
                           desired_frequency,
                           max_anomalies,
                           alpha_level):
    """
    In this definition, time series anomalies are detected using the S-ESD algorithm.
    Arguments:
        dataframe: Pandas dataframe
        column_name: string. Name of the column that we want to detect anomalies in
        desired_frequency: int. Seasonal frequency (periods per cycle) used in the
            seasonal decomposition step.
        max_anomalies: int. Maximum number of anomalies the ESD test may flag.
        alpha_level: float. Significance level for the generalized ESD test.
    """
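A rough sketch of the S-ESD idea (seasonal decomposition followed by a generalized ESD test on the residual), using statsmodels and scipy; the sketch name, the additive decomposition, and the return value are assumptions rather than the original gist's body.

import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.tsa.seasonal import seasonal_decompose

def sesd_anomaly_detection_sketch(dataframe, column_name, desired_frequency,
                                  max_anomalies, alpha_level):
    series = dataframe[column_name].astype(float)
    # Step 1: remove seasonality and trend, keeping the residual component
    decomposition = seasonal_decompose(series, period=desired_frequency,
                                       extrapolate_trend='freq')
    residual = decomposition.resid.dropna()
    n = len(residual)
    # Step 2: generalized ESD test on the residual
    candidates, num_outliers = [], 0
    for i in range(1, max_anomalies + 1):
        deviations = (residual - residual.mean()).abs()
        candidate = deviations.idxmax()
        r_stat = deviations.loc[candidate] / residual.std()
        # Critical value lambda_i for the i-th test
        p = 1 - alpha_level / (2 * (n - i + 1))
        t = stats.t.ppf(p, n - i - 1)
        lam = (n - i) * t / np.sqrt((n - i - 1 + t ** 2) * (n - i + 1))
        candidates.append(candidate)
        if r_stat > lam:
            num_outliers = i  # ESD keeps the largest i whose statistic exceeds lambda_i
        residual = residual.drop(candidate)
    # Return the rows of the original dataframe flagged as anomalous
    return dataframe.loc[candidates[:num_outliers]]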