Decision tree classifier (ID3) algorithm
""" | |
http://en.wikipedia.org/wiki/ID3_algorithm | |
http://www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm | |
""" | |
from collections import namedtuple, Counter, defaultdict | |
from math import log2 | |
UNCLASSIFIED_VALUE = -99 | |
class ID3Classifier(object):

    def __init__(self):
        self.root_node = None

    def fit(self, samples, target):
        """Create the decision tree."""
        training_samples = [TrainingSample(s, t)
                            for s, t in zip(samples, target)]
        predicting_features = list(range(len(samples[0])))
        self.root_node = self.create_decision_tree(training_samples,
                                                   predicting_features)
    def predict(self, X, allow_unclassified=False):
        """Predict the class of each sample in X by walking the tree."""
        default_klass = 1
        if allow_unclassified:
            default_klass = UNCLASSIFIED_VALUE
        predicted_klasses = []
        for sample in X:
            klass = None
            current_node = self.root_node
            while klass is None:
                if current_node.is_leaf():
                    klass = current_node.klass
                else:
                    key_value = sample[current_node.feature]
                    if key_value in current_node:
                        current_node = current_node[key_value]
                    else:
                        # Feature value never seen during training: fall
                        # back to the default class (or UNCLASSIFIED_VALUE).
                        klass = default_klass
            predicted_klasses.append(klass)
        return predicted_klasses
    def score(self, X, target, allow_unclassified=True):
        """Return the fraction of samples in X that are classified correctly."""
        predicted = self.predict(X, allow_unclassified=allow_unclassified)
        n_matches = sum(p == t for p, t in zip(predicted, target))
        return n_matches / len(X)
    def create_decision_tree(self, training_samples, predicting_features):
        """Recursively create a decision tree and return its root node."""
        if not predicting_features:
            # No more predicting features: label with the majority class.
            default_klass = self.get_most_common_class(training_samples)
            root_node = DecisionTreeLeaf(default_klass)
        else:
            klasses = [sample.klass for sample in training_samples]
            if len(set(klasses)) == 1:
                # All samples share one class: this branch is pure.
                target_klass = training_samples[0].klass
                root_node = DecisionTreeLeaf(target_klass)
            else:
                best_feature = self.select_best_feature(training_samples,
                                                        predicting_features,
                                                        klasses)
                # Create the node to return and build its sub-trees.
                root_node = DecisionTreeNode(best_feature)
                best_feature_values = {s.sample[best_feature]
                                       for s in training_samples}
                for value in best_feature_values:
                    samples = [s for s in training_samples
                               if s.sample[best_feature] == value]
                    # Recursively create a child node. Pass each child a
                    # copy of the remaining features so that one branch's
                    # choices do not remove features from its siblings.
                    child = self.create_decision_tree(
                        samples, list(predicting_features))
                    root_node[value] = child
        return root_node
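    # Illustrative shape of a fitted tree (hypothetical data, not output
    # produced by this file): nodes are dict subclasses keyed by the
    # observed values of the chosen feature.
    #
    #   DecisionTreeNode(0)
    #     'sunny' -> DecisionTreeNode(1)
    #                  'high'   -> DecisionTreeLeaf(0)
    #                  'normal' -> DecisionTreeLeaf(1)
    #     'rainy' -> DecisionTreeLeaf(1)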
    @staticmethod
    def get_most_common_class(training_samples):
        """Return the most common class in the given samples."""
        klasses = [s.klass for s in training_samples]
        counter = Counter(klasses)
        # most_common(n=1) returns a one-element list: [(class, count)].
        k, = counter.most_common(n=1)
        return k[0]
    def select_best_feature(self, samples, features, klasses):
        """
        Select, remove from the features list, and return the feature that
        best classifies the samples.

        The best feature is the one with the highest information gain for
        the provided samples (i.e. it minimizes the entropy of the klasses).
        """
        gain_factors = [(self.information_gain(samples, feat, klasses), feat)
                        for feat in features]
        gain_factors.sort()
        best_feature = gain_factors[-1][1]
        features.remove(best_feature)
        return best_feature
    def information_gain(self, samples, feature, klasses):
        """
        Information gain measures the difference in entropy from before to
        after the samples are split on the given feature's values: in other
        words, how much uncertainty about the class was removed by the split.
        """
        N = len(samples)
        samples_partition = defaultdict(list)
        for s in samples:
            samples_partition[s.sample[feature]].append(s)
        # Weighted average of the entropy of each partition.
        feature_entropy = 0.0
        for partition in samples_partition.values():
            sub_klasses = [s.klass for s in partition]
            feature_entropy += (len(partition) / N) * self.entropy(sub_klasses)
        return self.entropy(klasses) - feature_entropy
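    # Worked example (hypothetical data): with klasses [1, 1, 0, 0]
    # (entropy 1.0 bit), a feature whose values split the samples into
    # {1, 1} and {0, 0} leaves both partitions pure (entropy 0.0), so the
    # gain is 1.0 - 0.0 = 1.0. A feature taking a single value for every
    # sample leaves the split unchanged, so its gain is 1.0 - 1.0 = 0.0.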
    @staticmethod
    def entropy(dataset):
        """Measure the amount of uncertainty in the given dataset."""
        N = len(dataset)
        counter = Counter(dataset)
        return sum(-(counter[k] / N) * log2(counter[k] / N)
                   for k in counter)
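    # In symbols (standard Shannon entropy, base 2):
    #   H(S) = -sum_k p_k * log2(p_k), with p_k = count(k) / |S|.
    # E.g. entropy([1, 1, 0, 0]) == 1.0, while entropy([1, 1, 1, 1]) == 0.0.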
# A labelled training example: the feature vector plus its class.
TrainingSample = namedtuple('TrainingSample', ('sample', 'klass'))


class DecisionTreeNode(dict):
    """Internal node: maps each value of `feature` to a child node."""

    def __init__(self, feature, *args, **kwargs):
        self.feature = feature
        super(DecisionTreeNode, self).__init__(*args, **kwargs)

    def is_leaf(self):
        return False

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.feature)


class DecisionTreeLeaf(dict):
    """Leaf node: holds the predicted class `klass`."""

    def __init__(self, klass, *args, **kwargs):
        self.klass = klass
        super(DecisionTreeLeaf, self).__init__(*args, **kwargs)

    def is_leaf(self):
        return True

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.klass)