Skip to content

Instantly share code, notes, and snippets.

View kperry2215's full-sized avatar

Kirsten Perry kperry2215

View GitHub Profile
def run_h2o_automl(dataframe, variable_to_predict,
max_number_models):
"""
This function initiates an h2o cluster, converts
the dataframe to an h2o dataframe, and then runs
the autoML function to generate a list of optimal
predictor models. The best models are displayed via a
scoreboard.
Arguments:
dataframe: Pandas dataframe.
performance = aml.leader.model_performance(test)
print(performance)
def run_tpot_automl(dataframe,
variable_to_predict,
number_generations,
file_to_export_pipeline_to = 'tpot_classifier_pipeline.py'):
"""
This function runs a TPOT classifier on the dataset, after splitting into
a training and test set, and then oversampling the training set.
Args:
dataframe: pandas dataframe. Master dataframe containing the feature and target
data
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import RobustScaler
from tpot.builtins import StackingEstimator
# NOTE: Make sure that the class is labeled 'target' in the data file
# Round every value in `results` with np.round (zero decimals by default).
# NOTE(review): `results` and `testing_target` are defined outside this
# excerpt - presumably model predictions and the true test labels; confirm.
results_rounded = np.round(results)
# Compare the targets against the rounded values. The return value is not
# assigned, so it is only visible where the expression result is echoed.
# NOTE(review): `confusion_matrix` is not imported in the visible lines - verify.
confusion_matrix(testing_target, results_rounded)
# Read the raw file; header=None stops pandas from using the first row as
# column names, so the names are assigned explicitly below.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
)
# Name the columns of the data set, in file order
df.columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'salary',
]
import matplotlib.pyplot as plt
def generate_distribution_histogram(dataframe,
column_name,
title, x_axis_label, y_axis_label,
label_name,
number_bins = 15):
"""
This function generates a histogram.
Args:
# Split the rows by salary category. The label strings being matched start
# with a space, so the comparison literals include it as well.
df_less_than_50k = df.loc[df['salary'] == ' <=50K']
df_greater_than_50k = df.loc[df['salary'] == ' >50K']

# Draw the age histogram for the <=$50K subset
generate_distribution_histogram(
    df_less_than_50k,
    'age',
    title='Age Distribution: US Population',
    x_axis_label='Age (years)',
    y_axis_label='Frequency',
    label_name='<=$50K',
)
#Plot the histogram for the distribution for data >$50K
def mann_whitney_u_test(distribution_1, distribution_2):
"""
Perform the Mann-Whitney U Test, comparing two different distributions.
Args:
distribution_1: List. First distribution to compare.
distribution_2: List. Second distribution to compare.
Outputs:
u_statistic: Float. U statistic for the test.
p_value: Float. P-value for the test.
"""
import eia
import pandas as pd
import matplotlib.pyplot as plt
def retrieve_time_series(api, series_ID):
"""
Return the time series dataframe, based on API and unique Series ID
"""
#Retrieve Data By Series ID
series_search = api.data_by_series(series=series_ID)