jeremystan · July 7, 2021 15:09
diff --git a/decision_tree_root_cause.py b/decision_tree_root_cause.py
 import graphviz
 import pandas as pd
 import numpy as np
 import sklearn.impute
 import sklearn.tree

 # Root-cause analysis: fit a decision tree that separates rows of a
 # known-good sample from rows of a known-bad sample, then render the tree so
 # the chosen splits show which features distinguish the bad rows.

 # Load the data from disk
 df_good = pd.read_csv("~/Downloads/good_data_sample.csv")
 df_bad = pd.read_csv("~/Downloads/bad_data_sample.csv")

 # Stack both samples into a single frame (good rows first, then bad rows)
 df = pd.concat([df_good, df_bad])

 # Create a feature matrix (get_dummies one-hot encodes categorical columns)
 X = pd.get_dummies(df)

 # Drop columns that have no observed value at all. SimpleImputer cannot
 # compute a mean for such a column and removes it from its output, which
 # would leave X_imputed with fewer columns than X and make the
 # feature_names passed to export_graphviz below the wrong length.
 X = X.dropna(axis="columns", how="all")

 # Create a response variable: 0 for every good row, 1 for every bad row,
 # in the same order the two frames were concatenated above
 zeros = np.repeat(0, len(df_good))
 ones = np.repeat(1, len(df_bad))
 Y = np.concatenate([zeros, ones])

 # Impute missing feature values with their mean (SimpleImputer's default)
 imp = sklearn.impute.SimpleImputer()
 X_imputed = imp.fit_transform(X)

 # Fit a decision tree; the fixed seed keeps the result reproducible and
 # min_samples_leaf requires every leaf to hold at least 100 rows
 random_state = 2
 model = sklearn.tree.DecisionTreeClassifier(
     random_state=random_state,
     min_samples_leaf=100,
 )
 model.fit(X_imputed, Y)

 # Visualize the decision tree; class_names follow the label order (0, 1)
 dot_data = sklearn.tree.export_graphviz(
     model,
     out_file=None,
     feature_names=list(X.columns),
     class_names=["good data", "bad data"],
     filled=True,
     rounded=True,
 )
 graphviz.Source(dot_data)
	import graphviz
	import pandas as pd
	import numpy as np
	import sklearn.impute
	import sklearn.tree

	# Root-cause analysis: fit a decision tree that separates rows of a
	# known-good sample from rows of a known-bad sample, then render the tree so
	# the chosen splits show which features distinguish the bad rows.

	# Load the data from disk
	df_good = pd.read_csv("~/Downloads/good_data_sample.csv")
	df_bad = pd.read_csv("~/Downloads/bad_data_sample.csv")

	# Stack both samples into a single frame (good rows first, then bad rows)
	df = pd.concat([df_good, df_bad])

	# Create a feature matrix (get_dummies one-hot encodes categorical columns)
	X = pd.get_dummies(df)

	# Drop columns that have no observed value at all. SimpleImputer cannot
	# compute a mean for such a column and removes it from its output, which
	# would leave X_imputed with fewer columns than X and make the
	# feature_names passed to export_graphviz below the wrong length.
	X = X.dropna(axis="columns", how="all")

	# Create a response variable: 0 for every good row, 1 for every bad row,
	# in the same order the two frames were concatenated above
	zeros = np.repeat(0, len(df_good))
	ones = np.repeat(1, len(df_bad))
	Y = np.concatenate([zeros, ones])

	# Impute missing feature values with their mean (SimpleImputer's default)
	imp = sklearn.impute.SimpleImputer()
	X_imputed = imp.fit_transform(X)

	# Fit a decision tree; the fixed seed keeps the result reproducible and
	# min_samples_leaf requires every leaf to hold at least 100 rows
	random_state = 2
	model = sklearn.tree.DecisionTreeClassifier(
	    random_state=random_state,
	    min_samples_leaf=100,
	)
	model.fit(X_imputed, Y)

	# Visualize the decision tree; class_names follow the label order (0, 1)
	dot_data = sklearn.tree.export_graphviz(
	    model,
	    out_file=None,
	    feature_names=list(X.columns),
	    class_names=["good data", "bad data"],
	    filled=True,
	    rounded=True,
	)
	graphviz.Source(dot_data)