This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def run_h2o_automl(dataframe, variable_to_predict, | |
max_number_models): | |
""" | |
This function initiates an h2o cluster, converts | |
the dataframe to an h2o dataframe, and then runs | |
the autoML function to generate a list of optimal | |
predictor models. The best models are displayed via a | |
scoreboard. | |
Arguments: | |
dataframe: Pandas dataframe. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
performance = aml.leader.model_performance(test) | |
print(performance) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def run_tpot_automl(dataframe, | |
variable_to_predict, | |
number_generations, | |
file_to_export_pipeline_to = 'tpot_classifier_pipeline.py'): | |
""" | |
This function runs a TPOT classifier on the dataset, after splitting into | |
a training and test set, and then oversampling the training set. | |
Args: | |
dataframe: pandas dataframe. Master dataframe containing the feature and target | |
data |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn.ensemble import ExtraTreesClassifier | |
from sklearn.model_selection import train_test_split | |
from sklearn.naive_bayes import BernoulliNB | |
from sklearn.pipeline import make_pipeline, make_union | |
from sklearn.preprocessing import RobustScaler | |
from tpot.builtins import StackingEstimator | |
# NOTE: Make sure that the class is labeled 'target' in the data file |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
results_rounded = np.round(results) | |
confusion_matrix(testing_target, results_rounded) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Load the UCI Adult data set straight from the archive URL.
#header=None tells pandas that the first line is data, not column labels,
#so the column names are declared here and applied via `names`.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    header=None,
    names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
           'marital-status', 'occupation', 'relationship',
           'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
           'native-country', 'salary'],
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
def generate_distribution_histogram(dataframe, | |
column_name, | |
title, x_axis_label, y_axis_label, | |
label_name, | |
number_bins = 15): | |
""" | |
This function generates a histogram. | |
Args: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Subset the data into salary categories.
#NOTE: the label literals start with a space (' <=50K', ' >50K'); this
#presumably mirrors how the values appear in the loaded frame -- keep the
#space unless the load step is changed to strip it.
df_less_than_50k = df[df['salary'] == ' <=50K']
df_greater_than_50k = df[df['salary'] == ' >50K']

#Plot the histogram for the distribution for data <=$50K
generate_distribution_histogram(
    df_less_than_50k,
    'age',
    title='Age Distribution: US Population',
    x_axis_label='Age (years)',
    y_axis_label='Frequency',
    label_name='<=$50K',
)
#Plot the histogram for the distribution for data >$50K |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def mann_whitney_u_test(distribution_1, distribution_2): | |
""" | |
Perform the Mann-Whitney U Test, comparing two different distributions. | |
Args: | |
distribution_1: List. | |
distribution_2: List. | |
Outputs: | |
u_statistic: Float. U statistic for the test. | |
p_value: Float. | |
""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import eia | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
def retrieve_time_series(api, series_ID): | |
""" | |
Return the time series dataframe, based on API and unique Series ID | |
""" | |
#Retrieve Data By Series ID | |
series_search = api.data_by_series(series=series_ID) |