from-scratch decision trees (basic example from an AI lecture)
""" | |
Discrete decision tree impl, pretty basic. (works for this example) | |
No libraries :) | |
Copyright (C) Seiji Emery 2018 | |
""" | |
# Setup our dataset
data = {
    'Day': [ 'D%s'%i for i in range(1, 15) ],
    'Outlook': [ {'s':'Sunny','o':'Overcast','r':'Rain'}[i] for i in [
        's','s','o','r','r','r','o','s','s','r','s','o','o','r'
    ]],
    'Humidity': [ {'h':'High','n':'Normal' }[i] for i in [
        'h','h','h','h','n','n','n','h','n','n','n','h','n','h'
    ]],
    'Wind': [ {'w':'Weak','s':'Strong'}[i] for i in [
        'w','s','w','w','w','s','s','w','w','w','s','s','w','s'
    ]],
    'Play': [ {'y':'Yes','n':'No'}[i] for i in [
        'n','n','y','y','y','n','y','n','y','y','y','y','y','n'
    ]]
}
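# For illustration: the dataset is column-oriented, so data['Outlook'][0] == 'Sunny',
# and the first row (day D1) reads ('D1', 'Sunny', 'High', 'Weak', 'No') -- see the
# table printed by print_data() below.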
def print_data (data):
    """ Prints dataset as a table """
    print('\t'.join(data.keys()))
    for values in zip(*list(data.values())):
        print('\t'.join(values))
def feature_entropy (data, feature):
    """ Calculates the entropy of a feature vector.
    (Strictly, this is the majority-class misclassification rate: 1 - largest count / total.)
    Result is on [0, 1], with 0 = no entropy and higher values = more mixed.
    """
    feature_counts = {}
    for fv in data[feature]:
        if fv not in feature_counts:
            feature_counts[fv] = 1
        else:
            feature_counts[fv] += 1
    return 1.0 - float(max(feature_counts.values())) / len(data[feature])
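# For example, on the dataset above ('Play' has 9 'Yes' vs 5 'No' over 14 rows):
#   feature_entropy(data, 'Play')    == 1 - 9/14 ~ 0.357
#   feature_entropy(data, 'Outlook') == 1 - 5/14 ~ 0.643   (largest group has 5 days)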
def cross_entropy (data, feature, target_feature):
    """ Calculates cross entropy between two features.
    Returns a dict of {
        feature_value => feature_entropy
        for all feature_values in data[feature]
    }
    Each entropy value is on [0, 1] with 0 = no entropy, 1 = max entropy
    """
    feature_counts_by_target_feature = {}
    for x, y in zip(data[feature], data[target_feature]):
        if x not in feature_counts_by_target_feature:
            feature_counts_by_target_feature[x] = {}
        if y not in feature_counts_by_target_feature[x]:
            feature_counts_by_target_feature[x][y] = 1
        else:
            feature_counts_by_target_feature[x][y] += 1
    return {
        feature_category: 1.0 - float(max(value_counts.values())) / sum(value_counts.values())
        for feature_category, value_counts in feature_counts_by_target_feature.items()
    }
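# For example (these values also appear in the sample output below):
#   cross_entropy(data, 'Outlook', 'Play') == {'Sunny': 0.4, 'Overcast': 0.0, 'Rain': 0.4}
#   e.g. the 5 'Sunny' days have 3 'No' / 2 'Yes' for 'Play', giving 1 - 3/5 = 0.4.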
def min_cross_entropy (data, feature, target_feature):
    """ Basic heuristic to compare two features.
    cross_entropy gives a set of entropy values mapped to the domain of this feature;
    taking the minimum of these measures how well splitting on this feature can, at best,
    reduce entropy for at least one of the resulting data partitions.
    Result is on [0, 1], with 0 = no entropy, 1 = max entropy.
    """
    return min(cross_entropy(data, feature, target_feature).values())
def feature_selection_weight (data, feature, target_feature):
    """ Calculates min_cross_entropy along with the size of the feature domain.
    Returns (min_cross_entropy, len(feature_domain)).
    Used to implement choose_best_feature_to_split_at(...)
    """
    feature_counts_by_target_feature = {}
    for x, y in zip(data[feature], data[target_feature]):
        if x not in feature_counts_by_target_feature:
            feature_counts_by_target_feature[x] = {}
        if y not in feature_counts_by_target_feature[x]:
            feature_counts_by_target_feature[x][y] = 1
        else:
            feature_counts_by_target_feature[x][y] += 1
    min_cross_entropy = min([
        1.0 - float(max(value_counts.values())) / sum(value_counts.values())
        for value_counts in feature_counts_by_target_feature.values()
    ])
    num_feature_categories = len(feature_counts_by_target_feature.keys())
    return min_cross_entropy, num_feature_categories
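# For example: feature_selection_weight(data, 'Outlook', 'Play') == (0.0, 3)
#   -- 'Overcast' days are all 'Yes' (min cross entropy 0.0), and 'Outlook' has 3 categories.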
def choose_best_feature_to_split_at (data, features, target_feature):
    """ Calculates an improved heuristic across ALL features and returns the best feature according to it.
    The heuristic is
        min_cross_entropy + feature_domain_size / max_feature_domain_size
    In short, this is min_cross_entropy plus a tie-breaking term that tries to minimize
    decision tree breadth by prioritizing features with smaller feature domains,
    ie choose 'Humidity' (domain 2) over 'Outlook' (domain 3) over 'Day' (domain 14).
    Mostly this is just to avoid ever picking 'Day', but it works fairly well in practice.
    The heuristic is on [0, 2], with lower value => better.
    Returns the first feature sorted by this heuristic (ie. lowest).
    """
    feature_weights = { feature: feature_selection_weight(data, feature, target_feature) for feature in features }
    max_feature_categories = max([ count for _, count in feature_weights.values() ])
    weighted_features = [
        (min_entropy + float(feature_category_count) / max_feature_categories, feature)
        for feature, (min_entropy, feature_category_count) in feature_weights.items()
    ]
    return sorted(weighted_features)[0][1]
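# For example, with target_feature = 'Play' and max domain size 14 (from 'Day'),
# the weighted heuristic values work out to (lower is better):
#   'Outlook'  : 0.0  + 3/14  ~ 0.21
#   'Humidity' : 1/7  + 2/14  ~ 0.29
#   'Wind'     : 0.25 + 2/14  ~ 0.39
#   'Day'      : 0.0  + 14/14 == 1.0
# so 'Outlook' is chosen for the first split (matching the sample output below).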
def feature_domain (data, feature):
    """ Calculates the feature domain.
    As it happens, this can be implemented as set(values) since we are dealing with discrete,
    not continuous, values.
    """
    return set(data[feature])
def build_decision_tree (data, target_feature):
    """ Builds a decision tree off of a discrete dataset and a target feature.
    Returns a set of nested dictionaries, with
        choice :=
            { feature-value: choice } |
            outcome
    where outcomes are on the domain of target_feature, and feature-values are on the domains
    of the features, with each feature used at most once per path.
    This is a very basic representation, but is human readable and could be altered as desired.
    """
    def build_tree (data, features, target_feature):
        """ Helper function """
        # End when we hit zero entropy
        if feature_entropy(data, target_feature) == 0:
            # With zero entropy, the target_feature vector should all have the same
            # value, so we can just return the first one
            return data[target_feature][0]
        # non-zero entropy => split into datasets with lower entropy
        # find the best feature to split on (and its domain)
        split_feature = choose_best_feature_to_split_at(data, features, target_feature)
        split_feature_domain = feature_domain(data, split_feature)
        # remove this feature from the set of considered features for this branch
        # (build a new set rather than mutating the shared one, so sibling branches
        # remain free to split on this feature)
        features = features - set([ split_feature ])
        # partition our dataset on the split feature:
        split_data = {
            # map each value on the split feature domain,
            feature_value: {
                # to all features => feature vectors in the dataset,
                feature: [
                    # but filtered to only include values that are part of this feature partition
                    x for i, x in enumerate(values)
                    if data[split_feature][i] == feature_value
                ]
                for feature, values in data.items()
            }
            for feature_value in split_feature_domain
        }
        # recursively apply to each partitioned dataset
        # returns a map of feature choice => decision tree
        return {
            feature: build_tree(data, features, target_feature)
            for feature, data in split_data.items()
        }
    # set features = set(data keys - target_feature), and invoke recursively
    return build_tree(data, set(data.keys()) - set([ target_feature ]), target_feature)
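# A minimal sketch (hypothetical helper, for illustration only) of how the nested-dict
# tree returned by build_decision_tree can be used to classify a new sample given as
# { feature: value }. Tree nodes only store feature *values* as keys, not which feature
# was split on, so this lookup relies on the feature domains being disjoint
# (as they are in this dataset: Sunny/Overcast/Rain, High/Normal, Weak/Strong, ...).
def classify (tree, sample):
    """ Walks a decision tree built by build_decision_tree for a single sample. """
    while isinstance(tree, dict):
        # follow whichever branch key matches one of the sample's feature values
        tree = next((subtree for value, subtree in tree.items() if value in sample.values()), None)
    return tree
# e.g. classify(build_decision_tree(data, 'Play'),
#               { 'Outlook': 'Sunny', 'Humidity': 'Normal', 'Wind': 'Weak' })  # => 'Yes'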
if __name__ == '__main__':
    print("data:")
    print_data(data)
    print()
    target_feature = 'Play'
    feature_weights = { feature: feature_selection_weight(data, feature, target_feature) for feature in data.keys() }
    max_feature_categories = max([ count for _, count in feature_weights.values() ])
    real_feature_weights = {
        feature: min_entropy + float(feature_category_count) / max_feature_categories
        for feature, (min_entropy, feature_category_count) in feature_weights.items()
    }
    print("entropy heuristics for each feature in relation to '%s' (lower is better):"%target_feature)
    for feature, weight in real_feature_weights.items():
        # print(" selection weight, raw entropy, min cross entropy of feature '%s' with '%s' in dataset = %0.2f, %0.2f, %0.2f: %s"%(
        print("selection weight, min cross entropy of feature '%s' with '%s' in dataset = %0.2f, %0.2f (entropy values: %s)"%(
            feature,
            target_feature,
            weight,
            # feature_entropy(data, feature),
            min_cross_entropy(data, feature, target_feature),
            cross_entropy(data, feature, target_feature)
        ))
    print("""Target feature = '{feature}'. Best feature to split at?
including '{feature}': '{incl}'
excluding '{feature}': '{excl}'
""".format(
        feature=target_feature,
        incl=choose_best_feature_to_split_at(data, data.keys(), target_feature),
        excl=choose_best_feature_to_split_at(data, set(data.keys()) - set([ target_feature ]), target_feature)
    ))
    print("Decision tree: %s"%build_decision_tree(data, target_feature))
Example output:
data:
Day   Outlook   Humidity  Wind    Play
D1    Sunny     High      Weak    No
D2    Sunny     High      Strong  No
D3    Overcast  High      Weak    Yes
D4    Rain      High      Weak    Yes
D5    Rain      Normal    Weak    Yes
D6    Rain      Normal    Strong  No
D7    Overcast  Normal    Strong  Yes
D8    Sunny     High      Weak    No
D9    Sunny     Normal    Weak    Yes
D10   Rain      Normal    Weak    Yes
D11   Sunny     Normal    Strong  Yes
D12   Overcast  High      Strong  Yes
D13   Overcast  Normal    Weak    Yes
D14   Rain      High      Strong  No
entropy heuristics for each feature in relation to 'Play' (lower is better):
selection weight, min cross entropy of feature 'Day' with 'Play' in dataset = 1.00, 0.00 (entropy values: {'D1': 0.0, 'D2': 0.0, 'D3': 0.0, 'D4': 0.0, 'D5': 0.0, 'D6': 0.0, 'D7': 0.0, 'D8': 0.0, 'D9': 0.0, 'D10': 0.0, 'D11': 0.0, 'D12': 0.0, 'D13': 0.0, 'D14': 0.0})
selection weight, min cross entropy of feature 'Outlook' with 'Play' in dataset = 0.21, 0.00 (entropy values: {'Sunny': 0.4, 'Overcast': 0.0, 'Rain': 0.4})
selection weight, min cross entropy of feature 'Humidity' with 'Play' in dataset = 0.29, 0.14 (entropy values: {'High': 0.4285714285714286, 'Normal': 0.1428571428571429})
selection weight, min cross entropy of feature 'Wind' with 'Play' in dataset = 0.39, 0.25 (entropy values: {'Weak': 0.25, 'Strong': 0.5})
selection weight, min cross entropy of feature 'Play' with 'Play' in dataset = 0.14, 0.00 (entropy values: {'No': 0.0, 'Yes': 0.0})
Target feature = 'Play'. Best feature to split at?
including 'Play': 'Play'
excluding 'Play': 'Outlook'
Decision tree: {'Overcast': 'Yes', 'Rain': {'Strong': 'No', 'Weak': 'Yes'}, 'Sunny': {'Normal': 'Yes', 'High': 'No'}}
[Finished in 0.6s]