from-scratch decision trees (basic example from an AI lecture)
"""
Discrete decision tree impl, pretty basic. (works for this example)
No libraries :)
Copyright (C) Seiji Emery 2018
"""
# Setup our dataset
data = {
'Day': [ 'D%s'%i for i in range(1, 15) ],
'Outlook': [ {'s':'Sunny','o':'Overcast','r':'Rain'}[i] for i in [
's','s','o','r','r','r','o','s','s','r','s','o','o','r'
]],
'Humidity': [ {'h':'High','n':'Normal' }[i] for i in [
'h','h','h','h','n','n','n','h','n','n','n','h','n','h'
]],
'Wind': [ {'w':'Weak','s':'Strong'}[i] for i in [
'w','s','w','w','w','s','s','w','w','w','s','s','w','s'
]],
'Play': [ {'y':'Yes','n':'No'}[i] for i in [
'n','n','y','y','y','n','y','n','y','y','y','y','y','n'
]]
}
def print_data (data):
    """ Prints the dataset as a table """
    print('\t'.join(data.keys()))
    for values in zip(*list(data.values())):
        print('\t'.join(values))
def feature_entropy (data, feature):
    """ Calculates an entropy-like impurity for a feature vector:
    1 - (frequency of the most common value).
    Result is on [0, 1), with 0 = no entropy (all values identical); higher = more mixed.
    """
    feature_counts = {}
    for fv in data[feature]:
        if fv not in feature_counts:
            feature_counts[fv] = 1
        else:
            feature_counts[fv] += 1
    return 1.0 - float(max(feature_counts.values())) / len(data[feature])
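# Example (values derived from the dataset above, not part of the original gist):
#   feature_entropy(data, 'Play')  # -> 1 - 9/14 ≈ 0.36  (9 'Yes' vs 5 'No')
#   feature_entropy(data, 'Day')   # -> 1 - 1/14 ≈ 0.93  (every value is unique)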
def cross_entropy (data, feature, target_feature):
    """ Calculates the entropy of target_feature within each partition of feature.
    Returns a dict of {
        feature_value => entropy of target_feature within that partition
        for all feature_values in data[feature]
    }
    Each entropy value is on [0, 1), with 0 = no entropy (partition is pure); higher = more mixed.
    """
    feature_counts_by_target_feature = {}
    for x, y in zip(data[feature], data[target_feature]):
        if x not in feature_counts_by_target_feature:
            feature_counts_by_target_feature[x] = {}
        if y not in feature_counts_by_target_feature[x]:
            feature_counts_by_target_feature[x][y] = 1
        else:
            feature_counts_by_target_feature[x][y] += 1
    return {
        feature_category: 1.0 - float(max(value_counts.values())) / sum(value_counts.values())
        for feature_category, value_counts in feature_counts_by_target_feature.items()
    }
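# Example (derived from the dataset above, not part of the original gist):
#   cross_entropy(data, 'Outlook', 'Play')
#   # -> {'Sunny': 0.4, 'Overcast': 0.0, 'Rain': 0.4}
#   # 'Sunny' rows are 2 Yes / 3 No (1 - 3/5 = 0.4); 'Overcast' rows are all Yes.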
def min_cross_entropy (data, feature, target_feature):
    """ Basic heuristic to compare two features.
    cross_entropy gives a set of entropy values mapped over the domain of the feature;
    taking the minimum of these is a decent metric for how much splitting on this feature can,
    at best, reduce entropy for at least one of the split data sets.
    Result is on [0, 1), with 0 = no entropy; lower is better.
    """
    return min(cross_entropy(data, feature, target_feature).values())
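# Example: min_cross_entropy(data, 'Outlook', 'Play') -> 0.0, since the
# 'Overcast' partition is already pure (all 'Yes').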
def feature_selection_weight (data, feature, target_feature):
    """ Calculates min_cross_entropy and the size of the feature's domain.
    Returns (min_cross_entropy, len(feature_domain)).
    Used to implement choose_best_feature_to_split_at(...)
    """
    feature_counts_by_target_feature = {}
    for x, y in zip(data[feature], data[target_feature]):
        if x not in feature_counts_by_target_feature:
            feature_counts_by_target_feature[x] = {}
        if y not in feature_counts_by_target_feature[x]:
            feature_counts_by_target_feature[x][y] = 1
        else:
            feature_counts_by_target_feature[x][y] += 1
    min_cross_entropy = min([
        1.0 - float(max(value_counts.values())) / sum(value_counts.values())
        for value_counts in feature_counts_by_target_feature.values()
    ])
    num_feature_categories = len(feature_counts_by_target_feature.keys())
    return min_cross_entropy, num_feature_categories
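# Example (derived from the dataset above, not part of the original gist):
#   feature_selection_weight(data, 'Outlook', 'Play')  # -> (0.0, 3)
#   feature_selection_weight(data, 'Day', 'Play')      # -> (0.0, 14)
# 'Day' also has a pure best partition (trivially -- one row per day), which is
# why the domain size is needed as a tie-breaker in choose_best_feature_to_split_at.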
def choose_best_feature_to_split_at (data, features, target_feature):
    """ Calculates an improved heuristic across ALL features and returns the best feature according to it.
    The heuristic is
        min_cross_entropy + feature_domain_size / max_feature_domain_size
    in short, this is min_cross_entropy plus a tie-breaking term that attempts to minimize
    decision tree breadth by prioritizing features with smaller feature domains,
    ie choose 'Humidity' (domain 2) over 'Outlook' (domain 3) over 'Day' (domain 14).
    Mostly this is just to avoid picking 'Day' at all, but it works fairly well in practice.
    The heuristic is on [0, 2], with lower value => better.
    Returns the first feature sorted by this heuristic (ie. lowest).
    """
    feature_weights = { feature: feature_selection_weight(data, feature, target_feature) for feature in features }
    max_feature_categories = max([ count for _, count in feature_weights.values() ])
    weighted_features = [
        (min_entropy + float(feature_category_count) / max_feature_categories, feature)
        for feature, (min_entropy, feature_category_count) in feature_weights.items()
    ]
    return sorted(weighted_features)[0][1]
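# Example: with target 'Play' and the remaining features, the weights work out to
# (lower is better):
#   Outlook:  0.00 + 3/14  ≈ 0.21
#   Humidity: 0.14 + 2/14  ≈ 0.29
#   Wind:     0.25 + 2/14  ≈ 0.39
#   Day:      0.00 + 14/14 = 1.00
# so choose_best_feature_to_split_at picks 'Outlook' (see the program output below).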
def feature_domain (data, feature):
    """ Calculates a feature's domain.
    As it happens, this can be implemented as set(values), as we are dealing with discrete,
    not continuous, values.
    """
    return set(data[feature])
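# Example: feature_domain(data, 'Outlook') -> {'Sunny', 'Overcast', 'Rain'}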
def build_decision_tree (data, target_feature):
    """ Builds a decision tree off of a discrete dataset and a target feature.
    Returns nested dictionaries, with
        choice :=
            { feature-value: choice } |
            outcome
    where outcomes are on the domain of target_feature, and feature-values are on the domains
    of the other features; one feature is used per level, and each feature at most once per path.
    This is a very basic representation, but it is human readable and could be altered as desired.
    """
    def build_tree (data, features, target_feature):
        """ Helper function """
        # End when we hit zero entropy
        if feature_entropy(data, target_feature) == 0:
            # With zero entropy, the target_feature vector values are all the same,
            # so we can just return the first of them
            return data[target_feature][0]
        # non-zero entropy => split into datasets with lower entropy
        # find the best feature to split on (and its domain)
        split_feature = choose_best_feature_to_split_at(data, features, target_feature)
        split_feature_domain = feature_domain(data, split_feature)
        # remove this feature from our set of considered features
        # (non-mutating set difference, so sibling branches keep their own feature sets)
        features = features - set([ split_feature ])
        # partition our dataset on the split feature:
        split_data = {
            # map each value on the split feature domain,
            feature_value: {
                # to all features => feature vectors in the dataset,
                feature: [
                    # but filtered to only include values that are part of this feature partition
                    x for i, x in enumerate(values)
                    if data[split_feature][i] == feature_value
                ]
                for feature, values in data.items()
            }
            for feature_value in split_feature_domain
        }
        # recursively apply to each partitioned dataset
        # returns a map of feature value => decision (sub)tree
        return {
            feature_value: build_tree(subset, features, target_feature)
            for feature_value, subset in split_data.items()
        }
    # set features = set(data keys - target_feature), and invoke recursively
    return build_tree(data, set(data.keys()) - set([ target_feature ]), target_feature)
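# A minimal sketch (not part of the original gist) of how the resulting tree could
# be evaluated. The tree stores feature *values* but not which feature each level
# splits on, so this hypothetical classify() simply follows whichever of the sample's
# values appears as a key at the current node -- which only works because the feature
# domains happen to be disjoint in this dataset.
def classify (tree, sample):
    """ Walks a decision tree built by build_decision_tree for a single sample
    (a dict of feature => value); returns an outcome from the target feature's domain. """
    while isinstance(tree, dict):
        tree = next(subtree for value, subtree in tree.items() if value in sample.values())
    return tree
# e.g. classify(build_decision_tree(data, 'Play'),
#               {'Outlook': 'Sunny', 'Humidity': 'High', 'Wind': 'Weak'})
# -> 'No' (matches rows D1 and D8 above).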
if __name__ == '__main__':
    print("data:")
    print_data(data)
    print()
    target_feature = 'Play'
    feature_weights = { feature: feature_selection_weight(data, feature, target_feature) for feature in data.keys() }
    max_feature_categories = max([ count for _, count in feature_weights.values() ])
    real_feature_weights = {
        feature: min_entropy + float(feature_category_count) / max_feature_categories
        for feature, (min_entropy, feature_category_count) in feature_weights.items()
    }
    print("entropy heuristics for each feature in relation to '%s' (lower is better):"%target_feature)
    for feature, weight in real_feature_weights.items():
        # print(" selection weight, raw entropy, min cross entropy of feature '%s' with '%s' in dataset = %0.2f, %0.2f, %0.2f: %s"%(
        print("selection weight, min cross entropy of feature '%s' with '%s' in dataset = %0.2f, %0.2f (entropy values: %s)"%(
            feature,
            target_feature,
            weight,
            # feature_entropy(data, feature),
            min_cross_entropy(data, feature, target_feature),
            cross_entropy(data, feature, target_feature)
        ))
    print("""Target feature = '{feature}'. Best feature to split at?
including '{feature}': '{incl}'
excluding '{feature}': '{excl}'
""".format(
        feature=target_feature,
        incl=choose_best_feature_to_split_at(data, data.keys(), target_feature),
        excl=choose_best_feature_to_split_at(data, set(data.keys()) - set([ target_feature ]), target_feature)
    ))
    print("Decision tree: %s"%build_decision_tree(data, target_feature))
data:
Day Outlook Humidity Wind Play
D1 Sunny High Weak No
D2 Sunny High Strong No
D3 Overcast High Weak Yes
D4 Rain High Weak Yes
D5 Rain Normal Weak Yes
D6 Rain Normal Strong No
D7 Overcast Normal Strong Yes
D8 Sunny High Weak No
D9 Sunny Normal Weak Yes
D10 Rain Normal Weak Yes
D11 Sunny Normal Strong Yes
D12 Overcast High Strong Yes
D13 Overcast Normal Weak Yes
D14 Rain High Strong No
entropy heuristics for each feature in relation to 'Play' (lower is better):
selection weight, min cross entropy of feature 'Day' with 'Play' in dataset = 1.00, 0.00 (entropy values: {'D1': 0.0, 'D2': 0.0, 'D3': 0.0, 'D4': 0.0, 'D5': 0.0, 'D6': 0.0, 'D7': 0.0, 'D8': 0.0, 'D9': 0.0, 'D10': 0.0, 'D11': 0.0, 'D12': 0.0, 'D13': 0.0, 'D14': 0.0})
selection weight, min cross entropy of feature 'Outlook' with 'Play' in dataset = 0.21, 0.00 (entropy values: {'Sunny': 0.4, 'Overcast': 0.0, 'Rain': 0.4})
selection weight, min cross entropy of feature 'Humidity' with 'Play' in dataset = 0.29, 0.14 (entropy values: {'High': 0.4285714285714286, 'Normal': 0.1428571428571429})
selection weight, min cross entropy of feature 'Wind' with 'Play' in dataset = 0.39, 0.25 (entropy values: {'Weak': 0.25, 'Strong': 0.5})
selection weight, min cross entropy of feature 'Play' with 'Play' in dataset = 0.14, 0.00 (entropy values: {'No': 0.0, 'Yes': 0.0})
Target feature = 'Play'. Best feature to split at?
including 'Play': 'Play'
excluding 'Play': 'Outlook'
Decision tree: {'Overcast': 'Yes', 'Rain': {'Strong': 'No', 'Weak': 'Yes'}, 'Sunny': {'Normal': 'Yes', 'High': 'No'}}