Simple NaiveBayes Classifier
import collections
import re
import math

class NaiveBayesClassifier:
    """
    A simple naive Bayes classifier that can classify any items into
    any number of categories based on training data. The core
    algorithms in this class are generic, but `split_into_features` is
    specific to analysing words in a message, and the data functions
    (`train` and `get_*`) store their data in memory. All of these can
    be overridden if those assumptions aren't good: see the sketch at
    the bottom of this file.
    """
    def __init__(self):
        # Times each (feature, category) pair has been observed.
        self.feature_category_counts = collections.defaultdict(int)
        # Times each feature has been observed, across all categories.
        self.feature_counts = collections.defaultdict(int)
        # Total feature observations filed under each category.
        self.features_in_category_counts = collections.defaultdict(int)
        # Number of whole items trained into each category.
        self.items_in_category_counts = collections.defaultdict(int)
        self.item_count = 0

    # ## Feature detection
    # Override this function for non-text applications.
    def split_into_features(self, item):
        """
        Splits the given item (a string) into a set of features (its
        component words).
        """
        splitter = re.compile(r'\W+')
        return set([
            word.lower()
            for word in splitter.split(item)
            if 2 < len(word) < 20
        ])
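
    # For example, split_into_features('Apple sauce!') returns the set
    # {'apple', 'sauce'}: words are lowercased, deduplicated, and kept
    # only if they are between 3 and 19 characters long.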

    # ## Data handling
    # Override these functions to store data in another format.
    def train(self, item, category):
        """
        Trains the classifier to put the given item into the given
        category. This splits the item into its features, and then
        registers that each feature has been observed.
        """
        features = self.split_into_features(item)
        # Train ourself that each feature is associated with the given
        # category.
        for feature in features:
            # Note that, although we could potentially pull the same
            # feature out of the features list multiple times here,
            # the math in this class assumes that features only appear
            # once for an item: i.e. features is a unique set, not an
            # ordered list.
            self.feature_category_counts[(feature, category)] += 1
            self.feature_counts[feature] += 1
        # Record that we've seen this category.
        self.features_in_category_counts[category] += len(features)
        self.items_in_category_counts[category] += 1
        self.item_count += 1
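
    # For example, after train('apple pear', 'v') the classifier holds
    # feature_category_counts[('apple', 'v')] == 1,
    # feature_counts['pear'] == 1,
    # features_in_category_counts['v'] == 2, and item_count == 1.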

    def get_feature_category_count(self, feature, category):
        """
        Returns the number of times the given feature has been
        observed in the given category.
        """
        return self.feature_category_counts[(feature, category)]

    def get_feature_count(self, feature):
        """
        Returns the number of times the given feature has been
        observed.
        """
        return self.feature_counts[feature]

    def get_number_of_features(self):
        """
        Returns the total number of distinct features observed.
        """
        return len(self.feature_counts)

    def get_number_of_features_in_category(self, category):
        """
        Returns the total number of times the given category has been
        observed on individual features.
        """
        return self.features_in_category_counts[category]

    def get_number_of_items_in_category(self, category):
        """
        Returns the total number of times the given category has been
        observed on individual items.
        """
        return self.items_in_category_counts[category]

    def get_number_of_items(self):
        """
        Returns the total number of observations made.
        """
        return self.item_count

    def get_categories(self):
        """
        Returns a list of all the categories that have been observed.
        """
        return list(self.items_in_category_counts.keys())

    # ## Probability calculations
    # This should be the same for all storage and feature detection
    # approaches.
    def get_p_of_category_given_features(self, features, category):
        """
        Returns the probability of the document (made up of the given
        features) belonging to the given category, generated as a
        product of its weighted feature probabilities.
        """
        category_count = self.get_number_of_features_in_category(category)
        num_features_seen = self.get_number_of_features()
        num_items = self.get_number_of_items()
        num_features_here = len(features)
        # Find the probability p(document|category), which is the
        # product of all the p(feature|category) for component
        # features. Also find the probability p(document), which is
        # the probability of seeing the given features in a document,
        # again the product of p(feature). We work in logarithms to
        # avoid underflow issues, and skip denominators until the
        # next step. The +1.0 is add-one (Laplace) smoothing, so a
        # feature never seen in this category doesn't zero out the
        # whole product.
        lpdc = 0.0
        lpd = 0.0
        for feature in features:
            lpdc += math.log(
                self.get_feature_category_count(feature, category) + 1.0
            )
            lpd += math.log(self.get_feature_count(feature) + 1.0)
        # Rather than dividing the probabilities out in the loop
        # above, we do so now.
        lpdc_denom = math.log(category_count + num_features_seen)
        lpdc -= lpdc_denom * num_features_here
        # This is optional. Without it we get output values that are
        # proportional to the probabilities, but not the actual
        # probabilities. This is fine for raw comparisons, but proper
        # probabilities are easier to understand.
        lpd_denom = math.log(num_items + num_features_seen)
        lpd -= lpd_denom * num_features_here
        # Find the log of the prior probability of seeing this category.
        lpc = math.log(
            float(self.get_number_of_items_in_category(category)) /
            float(self.get_number_of_items())
        )
        # Bayes' rule gives
        # p(category|document) = p(document|category) * p(category) / p(document)
        return math.exp(lpdc + lpc - lpd)
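
    # Worked example using the demo data at the bottom of this file:
    # after the four train() calls, the 'v' category has 6 feature
    # observations and 6 distinct features exist overall, so 'apple'
    # (seen 3 times in 'v') contributes log((3 + 1) / (6 + 6)) to lpdc.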

    def classify(self, document):
        """
        Returns a list of decreasing probabilities and the categories
        they correspond to, for the given document.
        """
        # Split into features once and for all.
        features = self.split_into_features(document)
        # Find probabilities associated with each category.
        results = []
        for category in self.get_categories():
            p = self.get_p_of_category_given_features(features, category)
            results.append((p, category))
        # Sort so the highest probability is first.
        results.sort(reverse=True)
        return results


if __name__ == '__main__':
    # An example of the class in use. In lieu of unit-tests.
    cl = NaiveBayesClassifier()
    cl.train('apple pear', 'v')
    cl.train('apple orange', 'v')
    cl.train('apple lemon', 'v')
    cl.train('apple turkey pork', 'not-v')
    print(cl.classify('turkey, apple sauce, pork sausage'))
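    # With this training data, 'not-v' should come out on top:
    # 'turkey' and 'pork' have only ever been seen in that category.

    # A minimal sketch of the override point mentioned in the class
    # docstring, assuming items arrive as pre-tokenised tuples rather
    # than strings. The TupleFeatureClassifier name and the tuple
    # input format are illustrative assumptions, not part of the
    # original gist.
    class TupleFeatureClassifier(NaiveBayesClassifier):
        def split_into_features(self, item):
            # The item is already an iterable of feature tokens, so
            # just deduplicate it into a set.
            return set(item)

    tcl = TupleFeatureClassifier()
    tcl.train(('red', 'round', 'sweet'), 'fruit')
    tcl.train(('red', 'square', 'plastic'), 'toy')
    print(tcl.classify(('red', 'sweet')))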