Kirsten Perry kperry2215
import matplotlib.pyplot as plt
def generate_distribution_histogram(dataframe, column_name,
                                    title, x_axis_label, y_axis_label,
                                    label_name, number_bins=15):
    """
    This function generates a histogram of dataframe[column_name].
    Args:
        dataframe: pandas dataframe. Data containing the column to plot.
        column_name: string. Name of the column to build the histogram from.
        title, x_axis_label, y_axis_label, label_name: strings. Plot and legend labels.
        number_bins: int. Number of histogram bins. Defaults to 15.
    """
    plt.hist(dataframe[column_name], bins=number_bins, label=label_name)
    plt.title(title)
    plt.xlabel(x_axis_label)
    plt.ylabel(y_axis_label)
    plt.legend()
    plt.show()
import pandas as pd

# Read in the UCI adult census data set (the raw file has no header row)
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None)
# Declare the column names of the data set
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
              'marital-status', 'occupation', 'relationship',
              'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week',
              'native-country', 'salary']
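As a usage sketch, the histogram helper above could plot the age distribution of this census data; the title, axis labels, and bin count below are illustrative choices, not from the original gist.

generate_distribution_histogram(df, 'age',
                                title='Age Distribution: Adult Census Data',
                                x_axis_label='Age (years)',
                                y_axis_label='Frequency',
                                label_name='Age',
                                number_bins=20)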
import numpy as np
from sklearn.metrics import confusion_matrix
# Round predicted class probabilities to labels; 'results' and 'testing_target' come from an earlier prediction step (not shown)
results_rounded = np.round(results)
confusion_matrix(testing_target, results_rounded)
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import RobustScaler
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
def run_tpot_automl(dataframe,
                    variable_to_predict,
                    number_generations,
                    file_to_export_pipeline_to='tpot_classifier_pipeline.py'):
    """
    This function runs a TPOT classifier on the dataset, after splitting into
    a training and test set, and then oversampling the training set.
    Args:
        dataframe: pandas dataframe. Master dataframe containing the feature and
            target data.
        variable_to_predict: string. Name of the target column to predict.
        number_generations: int. Number of generations TPOT evolves pipelines for.
        file_to_export_pipeline_to: string. Path of the Python file that the best
            pipeline is exported to.
    """
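The body of run_tpot_automl is not shown above. A minimal sketch of how it could be implemented, assuming TPOT's TPOTClassifier API and imbalanced-learn's RandomOverSampler for the oversampling step; the sketch name, split ratio, random seeds, and verbosity level are illustrative assumptions.

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

def run_tpot_automl_sketch(dataframe, variable_to_predict, number_generations,
                           file_to_export_pipeline_to='tpot_classifier_pipeline.py'):
    # Separate the features from the target column
    X = dataframe.drop(columns=[variable_to_predict])
    y = dataframe[variable_to_predict]
    # Hold out a test set before any resampling
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=42)
    # Oversample only the training set so the test set stays untouched
    X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
    # Let TPOT search for a pipeline, score it on the test set, and export it
    tpot = TPOTClassifier(generations=number_generations, verbosity=2)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export(file_to_export_pipeline_to)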
# Evaluate the H2O AutoML leader model on a held-out test frame
performance = aml.leader.model_performance(test)
print(performance)

def run_h2o_automl(dataframe, variable_to_predict,
                   max_number_models):
    """
    This function initiates an h2o cluster, converts
    the dataframe to an h2o dataframe, and then runs
    the autoML function to generate a list of optimal
    predictor models. The best models are displayed via a
    leaderboard.
    Arguments:
        dataframe: Pandas dataframe.
        variable_to_predict: string. Name of the target column to predict.
        max_number_models: int. Maximum number of models for AutoML to train.
    """
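A hedged sketch of what the body of this function might look like with the standard h2o.automl API; the sketch name, split ratio, seeds, and the (aml, test) return value are assumptions made so the leader-model evaluation snippet above has inputs to work with.

import h2o
from h2o.automl import H2OAutoML

def run_h2o_automl_sketch(dataframe, variable_to_predict, max_number_models):
    h2o.init()
    # Convert the pandas dataframe to an H2OFrame
    hf = h2o.H2OFrame(dataframe)
    # Treat the target as categorical so AutoML runs a classification task
    hf[variable_to_predict] = hf[variable_to_predict].asfactor()
    train, test = hf.split_frame(ratios=[0.75], seed=42)
    aml = H2OAutoML(max_models=max_number_models, seed=42)
    aml.train(y=variable_to_predict, training_frame=train)
    # Display the leaderboard of the best models found
    print(aml.leaderboard)
    return aml, test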
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

# Read in the UCI breast cancer data set (the raw file has no header row)
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data', header=None)
# Declare the column names of the cancer data set
df.columns = ["Class", "Age", "Menopause",
              "Tumor_Size", "Inv_Nodes",
              "Node_Caps", "Deg_Malig",
              "Breast", "Breast_quad",
              "Irradiat"]
# Convert all of the categorical feature variables to numeric (use LabelEncoder)
d = defaultdict(LabelEncoder)
# Fit one LabelEncoder per column and encode the whole frame
df = df.apply(lambda x: d[x.name].fit_transform(x))
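A small usage note, assuming the encoded df above: the same dictionary of per-column encoders can map the numeric codes back to the original categories.

# Recover the original string categories from the encoded values
decoded = df.apply(lambda x: d[x.name].inverse_transform(x))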
def one_class_SVM_anomaly_detection(dataframe, columns_to_filter_by, outliers_fraction):
    """
    In this definition, time series anomalies are detected
    using a One Class SVM algorithm.
    Arguments:
        dataframe: Pandas dataframe
        columns_to_filter_by: string, or list of strings. Name of the column(s) that
            we want to use in the One Class SVM to detect time series anomalies
        outliers_fraction: float. Percentage of outliers allowed in the sequence.
    Outputs:
    """
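The rest of this gist is not shown above. A minimal sketch of the technique, assuming scikit-learn's OneClassSVM with nu set from outliers_fraction; the sketch name, kernel settings, and the 'anomaly' flag column are illustrative assumptions.

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

def one_class_svm_anomaly_sketch(dataframe, columns_to_filter_by, outliers_fraction):
    # Pull out the column(s) used for detection and scale them
    columns = [columns_to_filter_by] if isinstance(columns_to_filter_by, str) else list(columns_to_filter_by)
    scaled = StandardScaler().fit_transform(dataframe[columns])
    # nu upper-bounds the fraction of training points treated as outliers
    model = OneClassSVM(nu=outliers_fraction, kernel='rbf', gamma='scale')
    model.fit(scaled)
    # predict() returns -1 for anomalies and 1 for normal observations
    flagged = dataframe.copy()
    flagged['anomaly'] = model.predict(scaled)
    return flagged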
def sesd_anomaly_detection(dataframe,
                           column_name,
                           desired_frequency,
                           max_anomalies,
                           alpha_level):
    """
    In this definition, time series anomalies are detected using the S-ESD algorithm.
    Arguments:
        dataframe: Pandas dataframe
        column_name: string. Name of the column that we want to detect anomalies in
        desired_frequency: int. Seasonal frequency (periods per cycle) used in the
            seasonal decomposition step.
        max_anomalies: int. Maximum number of anomalies the ESD test may flag.
        alpha_level: float. Significance level for the generalized ESD test.
    """
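A rough sketch of the S-ESD idea (seasonal decomposition followed by a generalized ESD test on the residual), using statsmodels and scipy; the sketch name, the additive decomposition, and the return value are assumptions rather than the original gist's body.

import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.tsa.seasonal import seasonal_decompose

def sesd_anomaly_detection_sketch(dataframe, column_name, desired_frequency,
                                  max_anomalies, alpha_level):
    series = dataframe[column_name].astype(float)
    # Step 1: remove seasonality and trend, keeping the residual component
    decomposition = seasonal_decompose(series, period=desired_frequency,
                                       extrapolate_trend='freq')
    residual = decomposition.resid.dropna()
    n = len(residual)
    # Step 2: generalized ESD test on the residual
    candidates, num_outliers = [], 0
    for i in range(1, max_anomalies + 1):
        deviations = (residual - residual.mean()).abs()
        candidate = deviations.idxmax()
        r_stat = deviations.loc[candidate] / residual.std()
        # Critical value lambda_i for the i-th test
        p = 1 - alpha_level / (2 * (n - i + 1))
        t = stats.t.ppf(p, n - i - 1)
        lam = (n - i) * t / np.sqrt((n - i - 1 + t ** 2) * (n - i + 1))
        candidates.append(candidate)
        if r_stat > lam:
            num_outliers = i  # ESD keeps the largest i whose statistic exceeds lambda_i
        residual = residual.drop(candidate)
    # Return the rows of the original dataframe flagged as anomalous
    return dataframe.loc[candidates[:num_outliers]]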