from-scratch decision trees (basic example from an AI lecture)
""" | |
Discrete decision tree impl, pretty basic. (works for this example) | |
No libraries :) | |
Copyright (C) Seiji Emery 2018 | |
""" | |
# Setup our dataset
data = {
    'Day': [ 'D%s'%i for i in range(1, 15) ],
    'Outlook': [ {'s':'Sunny','o':'Overcast','r':'Rain'}[i] for i in [
        's','s','o','r','r','r','o','s','s','r','s','o','o','r'
    ]],
    'Humidity': [ {'h':'High','n':'Normal' }[i] for i in [
        'h','h','h','h','n','n','n','h','n','n','n','h','n','h'
    ]],
    'Wind': [ {'w':'Weak','s':'Strong'}[i] for i in [
        'w','s','w','w','w','s','s','w','w','w','s','s','w','s'
    ]],
    'Play': [ {'y':'Yes','n':'No'}[i] for i in [
        'n','n','y','y','y','n','y','n','y','y','y','y','y','n'
    ]]
}
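# For illustration: the dataset is column-oriented, so data['Outlook'][0] == 'Sunny',
# and the first row (day D1) reads ('D1', 'Sunny', 'High', 'Weak', 'No') -- see the
# table printed by print_data() below.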
def print_data (data):
    """ Prints dataset as a table """
    print('\t'.join(data.keys()))
    for values in zip(*list(data.values())):
        print('\t'.join(values))
def feature_entropy (data, feature):
    """ Calculates the entropy of a feature vector.
    (Strictly, this is the majority-class misclassification rate: 1 - largest count / total.)
    Result is on [0, 1], with 0 = no entropy and higher values = more mixed.
    """
    feature_counts = {}
    for fv in data[feature]:
        if fv not in feature_counts:
            feature_counts[fv] = 1
        else:
            feature_counts[fv] += 1
    return 1.0 - float(max(feature_counts.values())) / len(data[feature])
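# For example, on the dataset above ('Play' has 9 'Yes' vs 5 'No' over 14 rows):
#   feature_entropy(data, 'Play')    == 1 - 9/14 ~ 0.357
#   feature_entropy(data, 'Outlook') == 1 - 5/14 ~ 0.643   (largest group has 5 days)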
def cross_entropy (data, feature, target_feature):
    """ Calculates cross entropy between two features.
    Returns a dict of {
        feature_value => feature_entropy
        for all feature_values in data[feature]
    }
    Each entropy value is on [0, 1] with 0 = no entropy, 1 = max entropy
    """
    feature_counts_by_target_feature = {}
    for x, y in zip(data[feature], data[target_feature]):
        if x not in feature_counts_by_target_feature:
            feature_counts_by_target_feature[x] = {}
        if y not in feature_counts_by_target_feature[x]:
            feature_counts_by_target_feature[x][y] = 1
        else:
            feature_counts_by_target_feature[x][y] += 1
    return {
        feature_category: 1.0 - float(max(value_counts.values())) / sum(value_counts.values())
        for feature_category, value_counts in feature_counts_by_target_feature.items()
    }
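# For example (these values also appear in the sample output below):
#   cross_entropy(data, 'Outlook', 'Play') == {'Sunny': 0.4, 'Overcast': 0.0, 'Rain': 0.4}
#   e.g. the 5 'Sunny' days have 3 'No' / 2 'Yes' for 'Play', giving 1 - 3/5 = 0.4.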
def min_cross_entropy (data, feature, target_feature):
    """ Basic heuristic to compare two features.
    cross_entropy gives a set of entropy values mapped to the domain of this feature;
    taking the minimum of these measures how well splitting on this feature can, at best,
    reduce entropy for at least one of the resulting data partitions.
    Result is on [0, 1], with 0 = no entropy, 1 = max entropy.
    """
    return min(cross_entropy(data, feature, target_feature).values())
def feature_selection_weight (data, feature, target_feature):
    """ Calculates min_cross_entropy along with the size of the feature domain.
    Returns (min_cross_entropy, len(feature_domain)).
    Used to implement choose_best_feature_to_split_at(...)
    """
    feature_counts_by_target_feature = {}
    for x, y in zip(data[feature], data[target_feature]):
        if x not in feature_counts_by_target_feature:
            feature_counts_by_target_feature[x] = {}
        if y not in feature_counts_by_target_feature[x]:
            feature_counts_by_target_feature[x][y] = 1
        else:
            feature_counts_by_target_feature[x][y] += 1
    min_cross_entropy = min([
        1.0 - float(max(value_counts.values())) / sum(value_counts.values())
        for value_counts in feature_counts_by_target_feature.values()
    ])
    num_feature_categories = len(feature_counts_by_target_feature.keys())
    return min_cross_entropy, num_feature_categories
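# For example: feature_selection_weight(data, 'Outlook', 'Play') == (0.0, 3)
#   -- 'Overcast' days are all 'Yes' (min cross entropy 0.0), and 'Outlook' has 3 categories.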
def choose_best_feature_to_split_at (data, features, target_feature):
    """ Calculates an improved heuristic across ALL features and returns the best feature according to it.
    The heuristic is
        min_cross_entropy + feature_domain_size / max_feature_domain_size
    In short, this is min_cross_entropy plus a tie-breaking term that tries to minimize
    decision tree breadth by prioritizing features with smaller feature domains,
    ie choose 'Humidity' (domain 2) over 'Outlook' (domain 3) over 'Day' (domain 14).
    Mostly this is just to avoid ever picking 'Day', but it works fairly well in practice.
    The heuristic is on [0, 2], with lower value => better.
    Returns the first feature sorted by this heuristic (ie. lowest).
    """
    feature_weights = { feature: feature_selection_weight(data, feature, target_feature) for feature in features }
    max_feature_categories = max([ count for _, count in feature_weights.values() ])
    weighted_features = [
        (min_entropy + float(feature_category_count) / max_feature_categories, feature)
        for feature, (min_entropy, feature_category_count) in feature_weights.items()
    ]
    return sorted(weighted_features)[0][1]
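# For example, with target_feature = 'Play' and max domain size 14 (from 'Day'),
# the weighted heuristic values work out to (lower is better):
#   'Outlook'  : 0.0  + 3/14  ~ 0.21
#   'Humidity' : 1/7  + 2/14  ~ 0.29
#   'Wind'     : 0.25 + 2/14  ~ 0.39
#   'Day'      : 0.0  + 14/14 == 1.0
# so 'Outlook' is chosen for the first split (matching the sample output below).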
def feature_domain (data, feature):
    """ Calculates the feature domain.
    As it happens, this can be implemented as set(values) since we are dealing with discrete,
    not continuous, values.
    """
    return set(data[feature])
def build_decision_tree (data, target_feature):
    """ Builds a decision tree off of a discrete dataset and a target feature.
    Returns a set of nested dictionaries, with
        choice :=
            { feature-value: choice } |
            outcome
    where outcomes are on the domain of target_feature, and feature-values are on the domains
    of the features, with each feature used at most once per path.
    This is a very basic representation, but is human readable and could be altered as desired.
    """
    def build_tree (data, features, target_feature):
        """ Helper function """
        # End when we hit zero entropy
        if feature_entropy(data, target_feature) == 0:
            # With zero entropy, the target_feature vector should all have the same
            # value, so we can just return the first one
            return data[target_feature][0]
        # non-zero entropy => split into datasets with lower entropy
        # find the best feature to split on (and its domain)
        split_feature = choose_best_feature_to_split_at(data, features, target_feature)
        split_feature_domain = feature_domain(data, split_feature)
        # remove this feature from the set of considered features for this branch
        # (build a new set rather than mutating the shared one, so sibling branches
        # remain free to split on this feature)
        features = features - set([ split_feature ])
        # partition our dataset on the split feature:
        split_data = {
            # map each value on the split feature domain,
            feature_value: {
                # to all features => feature vectors in the dataset,
                feature: [
                    # but filtered to only include values that are part of this feature partition
                    x for i, x in enumerate(values)
                    if data[split_feature][i] == feature_value
                ]
                for feature, values in data.items()
            }
            for feature_value in split_feature_domain
        }
        # recursively apply to each partitioned dataset
        # returns a map of feature choice => decision tree
        return {
            feature: build_tree(data, features, target_feature)
            for feature, data in split_data.items()
        }
    # set features = set(data keys - target_feature), and invoke recursively
    return build_tree(data, set(data.keys()) - set([ target_feature ]), target_feature)
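# A minimal sketch (hypothetical helper, for illustration only) of how the nested-dict
# tree returned by build_decision_tree can be used to classify a new sample given as
# { feature: value }. Tree nodes only store feature *values* as keys, not which feature
# was split on, so this lookup relies on the feature domains being disjoint
# (as they are in this dataset: Sunny/Overcast/Rain, High/Normal, Weak/Strong, ...).
def classify (tree, sample):
    """ Walks a decision tree built by build_decision_tree for a single sample. """
    while isinstance(tree, dict):
        # follow whichever branch key matches one of the sample's feature values
        tree = next((subtree for value, subtree in tree.items() if value in sample.values()), None)
    return tree
# e.g. classify(build_decision_tree(data, 'Play'),
#               { 'Outlook': 'Sunny', 'Humidity': 'Normal', 'Wind': 'Weak' })  # => 'Yes'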
if __name__ == '__main__':
    print("data:")
    print_data(data)
    print()
    target_feature = 'Play'
    feature_weights = { feature: feature_selection_weight(data, feature, target_feature) for feature in data.keys() }
    max_feature_categories = max([ count for _, count in feature_weights.values() ])
    real_feature_weights = {
        feature: min_entropy + float(feature_category_count) / max_feature_categories
        for feature, (min_entropy, feature_category_count) in feature_weights.items()
    }
    print("entropy heuristics for each feature in relation to '%s' (lower is better):"%target_feature)
    for feature, weight in real_feature_weights.items():
        # print(" selection weight, raw entropy, min cross entropy of feature '%s' with '%s' in dataset = %0.2f, %0.2f, %0.2f: %s"%(
        print("selection weight, min cross entropy of feature '%s' with '%s' in dataset = %0.2f, %0.2f (entropy values: %s)"%(
            feature,
            target_feature,
            weight,
            # feature_entropy(data, feature),
            min_cross_entropy(data, feature, target_feature),
            cross_entropy(data, feature, target_feature)
        ))
    print("""Target feature = '{feature}'. Best feature to split at?
including '{feature}': '{incl}'
excluding '{feature}': '{excl}'
""".format(
        feature=target_feature,
        incl=choose_best_feature_to_split_at(data, data.keys(), target_feature),
        excl=choose_best_feature_to_split_at(data, set(data.keys()) - set([ target_feature ]), target_feature)
    ))
    print("Decision tree: %s"%build_decision_tree(data, target_feature))
Example output:
data:
Day   Outlook   Humidity  Wind    Play
D1    Sunny     High      Weak    No
D2    Sunny     High      Strong  No
D3    Overcast  High      Weak    Yes
D4    Rain      High      Weak    Yes
D5    Rain      Normal    Weak    Yes
D6    Rain      Normal    Strong  No
D7    Overcast  Normal    Strong  Yes
D8    Sunny     High      Weak    No
D9    Sunny     Normal    Weak    Yes
D10   Rain      Normal    Weak    Yes
D11   Sunny     Normal    Strong  Yes
D12   Overcast  High      Strong  Yes
D13   Overcast  Normal    Weak    Yes
D14   Rain      High      Strong  No
entropy heuristics for each feature in relation to 'Play' (lower is better):
selection weight, min cross entropy of feature 'Day' with 'Play' in dataset = 1.00, 0.00 (entropy values: {'D1': 0.0, 'D2': 0.0, 'D3': 0.0, 'D4': 0.0, 'D5': 0.0, 'D6': 0.0, 'D7': 0.0, 'D8': 0.0, 'D9': 0.0, 'D10': 0.0, 'D11': 0.0, 'D12': 0.0, 'D13': 0.0, 'D14': 0.0})
selection weight, min cross entropy of feature 'Outlook' with 'Play' in dataset = 0.21, 0.00 (entropy values: {'Sunny': 0.4, 'Overcast': 0.0, 'Rain': 0.4})
selection weight, min cross entropy of feature 'Humidity' with 'Play' in dataset = 0.29, 0.14 (entropy values: {'High': 0.4285714285714286, 'Normal': 0.1428571428571429})
selection weight, min cross entropy of feature 'Wind' with 'Play' in dataset = 0.39, 0.25 (entropy values: {'Weak': 0.25, 'Strong': 0.5})
selection weight, min cross entropy of feature 'Play' with 'Play' in dataset = 0.14, 0.00 (entropy values: {'No': 0.0, 'Yes': 0.0})
Target feature = 'Play'. Best feature to split at?
including 'Play': 'Play'
excluding 'Play': 'Outlook'
Decision tree: {'Overcast': 'Yes', 'Rain': {'Strong': 'No', 'Weak': 'Yes'}, 'Sunny': {'Normal': 'Yes', 'High': 'No'}}
[Finished in 0.6s]