(Python) Use SFrames to do some feature engineering. Transform categorical variables into binary variables. Write a function to compute the number of misclassified examples in an intermediate node. Write a function to find the best feature to split on. Build a binary decision tree from scratch. Make predictions using the decision tree. Evaluate the classification error of the decision tree on the test data.
#build decision trees where the data contain only binary features
import math
import pandas as pd
import numpy as np
#the dataset consists of data from LendingClub; the task is to predict whether a loan
#will be paid off in full or charged off and possibly go into default
import sframe
loans = sframe.SFrame('lending-club-data.gl/')
#target column 'safe_loans': +1 means a safe loan, -1 a risky loan
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')
#use a subset of features (categorical and numeric)
features = ['grade', 'emp_length', 'home_ownership', 'term']
target = 'safe_loans'
loans = loans[features+[target]]
#combat class imbalance: undersample the larger class until the class distribution is half and half
safe_loans_raw = loans[loans[target]==+1]
risky_loans_raw = loans[loans[target]==-1]
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
risky_loans = risky_loans_raw
safe_loans = safe_loans_raw.sample(percentage, seed=1)
loans_data = risky_loans.append(safe_loans)
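#sanity check (not in the original gist): after undersampling, the two classes
#should be roughly balanced
print 'fraction of safe loans : %.2f' % (len(safe_loans)/float(len(loans_data)))
print 'fraction of risky loans: %.2f' % (len(risky_loans)/float(len(loans_data)))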
#one-hot encoding: turn categorical variables into binary features
categorical_variables = []
for feat_name, feat_type in zip(loans_data.column_names(), loans_data.column_types()):
    if feat_type == str:
        categorical_variables.append(feat_name)
for feature in categorical_variables:
    loans_data_one_hot_encoded = loans_data[feature].apply(lambda x: {x:1})
    loans_data_unpacked = loans_data_one_hot_encoded.unpack(column_name_prefix=feature)
    for column in loans_data_unpacked.column_names():
        loans_data_unpacked[column] = loans_data_unpacked[column].fillna(0)
    loans_data.remove_column(feature)
    loans_data.add_columns(loans_data_unpacked)
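#illustration (toy data, not from the original gist): the {x: 1} + unpack idiom
#creates one binary column per category (e.g. 'grade.A', 'grade.B'), with missing
#entries that fillna(0) then turns into zeros
toy_categories = sframe.SArray(['A', 'B', 'A']).apply(lambda x: {x:1})
print toy_categories.unpack(column_name_prefix='grade')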
#training-test split
train_data, test_data = loans_data.random_split(0.8, seed=1)
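#sanity check (not in the original gist): random_split(0.8) should put roughly
#80% of the rows in the training set
print len(train_data), len(test_data)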
#implement binary decision trees from scratch
#compute the number of misclassified examples when predicting the majority class
def intermediate_node_num_mistakes(labels_in_node):
    if len(labels_in_node) == 0:
        return 0
    #the minority count is the number of mistakes a majority-class
    #prediction would make at this node
    num_pos = sum(labels_in_node==1)
    num_neg = sum(labels_in_node==-1)
    return min(num_pos, num_neg)
#test of the intermediate_node_num_mistakes() function
example_labels = sframe.SArray([-1, -1, 1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print 'Test passed'
else:
    print 'Test failed... try again'
example_labels = sframe.SArray([-1, -1, -1, 1, 1])
if intermediate_node_num_mistakes(example_labels) == 2:
    print 'Test passed'
else:
    print 'Test failed... try again'
#pick the best feature to split on: the one whose split gives the lowest
#classification error under majority-class prediction in each child
def best_splitting_feature(data, features, target):
    target_values = data[target]
    best_feature = None
    best_error = 10    #classification error is at most 1, so 10 acts as +infinity
    num_data_points = float(len(data))
    for feature in features:
        #binary features: the left child takes the 0 side, the right child the 1 side
        left_split = data[data[feature]==0]
        right_split = data[data[feature]==1]
        left_mistakes = intermediate_node_num_mistakes(left_split[target])
        right_mistakes = intermediate_node_num_mistakes(right_split[target])
        error = (left_mistakes+right_mistakes)/num_data_points
        if error < best_error:
            best_error = error
            best_feature = feature
    return best_feature
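#toy usage sketch (assumed data, not in the original gist): 'f1' separates the
#classes perfectly, so it should be chosen over 'f2'
toy_data = sframe.SFrame({'f1': [0, 0, 1, 1], 'f2': [0, 1, 0, 1],
                          'safe_loans': [-1, -1, 1, 1]})
print best_splitting_feature(toy_data, ['f1', 'f2'], 'safe_loans')    #prints f1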
#a function to create a leaf node that predicts the majority class of its examples
def create_leaf(target_values):
    leaf = {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf' : True}
    num_ones = len(target_values[target_values==1])
    num_minus_ones = len(target_values[target_values==-1])
    if num_ones > num_minus_ones:
        leaf['prediction'] = 1
    else:
        leaf['prediction'] = -1
    return leaf
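#quick check (assumed example, not in the original gist): a majority of +1
#labels yields prediction +1
print create_leaf(sframe.SArray([1, 1, -1]))['prediction']    #prints 1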
#build the tree recursively
def decision_tree_create(data, features, target, current_depth=0, max_depth=10):
    remaining_features = features[:]    #copy so the caller's list is not mutated
    target_values = data[target]
    print '-----------------------------------------------------'
    print 'Subtree, depth = %s (%s data points).' % (current_depth, len(target_values))
    #stopping condition 1: all points are from the same class
    if intermediate_node_num_mistakes(target_values) == 0:
        print 'Stopping condition 1 reached'
        return create_leaf(target_values)
    #stopping condition 2: no more features to split on
    if not remaining_features:
        print 'Stopping condition 2 reached'
        return create_leaf(target_values)
    #stopping condition 3: maximum depth reached
    if current_depth >= max_depth:
        print 'Reached maximum depth. Stopping for now'
        return create_leaf(target_values)
    splitting_feature = best_splitting_feature(data, remaining_features, target)
    left_split = data[data[splitting_feature]==0]
    right_split = data[data[splitting_feature]==1]
    remaining_features.remove(splitting_feature)
    print 'Split on feature %s. (%s, %s)' % (splitting_feature,
                                             len(left_split), len(right_split))
    #if the split is trivial (one side gets all the data), make a leaf instead of recursing
    if len(left_split) == len(data):
        print 'Creating leaf node'
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        print 'Creating leaf node'
        return create_leaf(right_split[target])
    #repeat (recurse) on the left and right subtrees
    left_tree = decision_tree_create(left_split, remaining_features, target,
                                     current_depth+1, max_depth)
    right_tree = decision_tree_create(right_split, remaining_features, target,
                                      current_depth+1, max_depth)
    return {'is_leaf' : False,
            'prediction' : None,
            'splitting_feature' : splitting_feature,
            'left' : left_tree,
            'right' : right_tree}
features = train_data.column_names()
features.remove(target)
my_decision_tree = decision_tree_create(train_data, features, target, current_depth=0, max_depth=6)
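#helper sketch (not in the original gist): count the nodes of the learned tree
def count_nodes(tree):
    if tree['is_leaf']:
        return 1
    return 1 + count_nodes(tree['left']) + count_nodes(tree['right'])
print count_nodes(my_decision_tree)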
#make predictions by traversing the tree from the root until a leaf is reached
def classify(tree, x, annotate=False):
    if tree['is_leaf']:
        if annotate:
            print 'At leaf, predicting %s' % tree['prediction']
        return tree['prediction']
    else:
        split_feature_value = x[tree['splitting_feature']]
        if annotate:
            print 'Split on %s = %s' % (tree['splitting_feature'], split_feature_value)
        if split_feature_value == 0:
            return classify(tree['left'], x, annotate)
        else:
            return classify(tree['right'], x, annotate)
#prediction for a single example
print test_data[0]
print 'Predicted class: %s' % classify(my_decision_tree, test_data[0], annotate=True)
#evaluate the decision tree: classification error on a dataset
def evaluate_classification_error(tree, data):
    prediction = data.apply(lambda x: classify(tree, x))
    return float(sum(data[target]!=prediction))/len(data)
print evaluate_classification_error(my_decision_tree, test_data)
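#baseline sketch (not in the original gist): on the balanced data a
#majority-class classifier is right about half the time, so the tree's error
#should come in well below 0.5
majority_class = create_leaf(train_data[target])['prediction']
print float(sum(test_data[target]!=majority_class))/len(test_data)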
#print out a single decision stump (the root split and its two children)
def print_stump(tree, name='root'):
    split_name = tree['splitting_feature']
    if split_name is None:
        print '(leaf, label: %s)' % tree['prediction']
        return None
    print '     [{0} == 0]               [{0} == 1]    '.format(split_name)
    print '  (%s)          (%s)' \
        % (('leaf, label: ' + str(tree['left']['prediction']) if tree['left']['is_leaf'] else 'subtree'),
           ('leaf, label: ' + str(tree['right']['prediction']) if tree['right']['is_leaf'] else 'subtree'))
print_stump(my_decision_tree)
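#going one level deeper (sketch, not in the original gist): print the stump
#rooted at the left child of the root
print_stump(my_decision_tree['left'])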