Created
July 7, 2021 15:09
-
-
Save jeremystan/58e637a3bff50949f36095eb0f4ff3b1 to your computer and use it in GitHub Desktop.
Decision Tree Root Cause Analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import graphviz
import pandas as pd
import numpy as np
import sklearn.impute
import sklearn.tree

# Root-cause analysis with a decision tree: train a classifier to tell a
# "good" data sample apart from a "bad" one, then inspect the fitted tree to
# see which features separate the two.

# Load the data from disk
df_good = pd.read_csv("~/Downloads/good_data_sample.csv")
df_bad = pd.read_csv("~/Downloads/bad_data_sample.csv")

# Stack both samples into a single frame: good rows first, then bad rows
df = pd.concat([df_good, df_bad])

# Create a feature matrix (get_dummies one-hot encodes the string/categorical
# columns and passes the remaining columns through)
X = pd.get_dummies(df)

# Drop features that have no observed values at all. SimpleImputer's default
# (mean) strategy silently discards such columns on transform, which would
# leave X_imputed with fewer columns than X.columns and break the
# feature_names passed to export_graphviz below.
X = X.dropna(axis="columns", how="all")

# Create a binary response variable: 0 for each good row, 1 for each bad row,
# in the same order the rows were concatenated above
zeros = np.repeat(0, len(df_good))
ones = np.repeat(1, len(df_bad))
Y = np.concatenate([zeros, ones])

# Impute missing feature values with their mean
imp = sklearn.impute.SimpleImputer()
imp.fit(X)
X_imputed = imp.transform(X)

# Fit a decision tree; the fixed seed keeps the fitted tree reproducible and
# min_samples_leaf=100 keeps every leaf backed by at least 100 rows
random_state = 2
model = sklearn.tree.DecisionTreeClassifier(
    random_state=random_state, min_samples_leaf=100)
model.fit(X_imputed, Y)

# Visualize the decision tree. class_names follows the label order (0 = good,
# 1 = bad). The bare expression on the last line renders the graph when run
# as the final statement of a notebook cell.
dot_data = sklearn.tree.export_graphviz(
    model, out_file=None, feature_names=list(X.columns),
    class_names=["good data", "bad data"],
    filled=True, rounded=True)
graphviz.Source(dot_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment