@idmillington · Created November 21, 2011
Simple Naive Bayes Classifier
import collections
import re
import math
class NaiveBayesClassifier:
"""
A simple naive bayes classifier that can classify any items into
any number of categories based on training data. The core
algorithms in this class are generic, but `split_into_features` is
specific for analysing words in a message, and the data functions
(`train` and `get_*`) store their data in memory. All of these can
be overridden if those assumptions aren't good.
"""
def __init__(self):
self.feature_category_counts = collections.defaultdict(int)
self.feature_counts = collections.defaultdict(int)
self.features_in_category_counts = collections.defaultdict(int)
self.items_in_category_counts = collections.defaultdict(int)
self.item_count = 0
# ## Feature detection
# Override this function for non-text applications.
def split_into_features(self, item):
"""
Splits the given item (a string) into a set of features (its
component words).
"""
splitter = re.compile(r'\W*')
return set([
word.lower()
for word in splitter.split(item)
if len(word)>2 and len(word)<20
])
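
    # For example, split_into_features('Turkey, apple sauce!') gives
    # {'turkey', 'apple', 'sauce'}: words are lower-cased, and anything
    # shorter than 3 or longer than 19 characters is discarded.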

    # ## Data handling
    # Override these functions to store data in another format.
    def train(self, item, category):
        """
        Trains the classifier to put the given item into the given
        category. This splits the item into its features, and then
        registers that each feature has been observed.
        """
        features = self.split_into_features(item)
        # Train ourself that each feature is associated with the given
        # category.
        for feature in features:
            # Note that, although we could potentially pull the same
            # feature out of the features list multiple times here,
            # the math in this class assumes that features only appear
            # once for an item: i.e. features is a unique set, not an
            # ordered list.
            self.feature_category_counts[(feature, category)] += 1
            self.feature_counts[feature] += 1
        # Record that we've seen this category.
        self.features_in_category_counts[category] += len(features)
        self.items_in_category_counts[category] += 1
        self.item_count += 1
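
    # For example, after train('apple pear', 'v') on a fresh classifier:
    #   feature_category_counts[('apple', 'v')] == 1
    #   feature_counts['pear'] == 1
    #   features_in_category_counts['v'] == 2
    #   items_in_category_counts['v'] == 1, and item_count == 1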

    def get_feature_category_count(self, feature, category):
        """
        Returns the number of times the given feature has been
        observed in the given category.
        """
        return self.feature_category_counts[(feature, category)]

    def get_feature_count(self, feature):
        """
        Returns the number of times the given feature has been
        observed.
        """
        return self.feature_counts[feature]

    def get_number_of_features(self):
        """
        Returns the total number of distinct features observed.
        """
        return len(self.feature_counts)

    def get_number_of_features_in_category(self, category):
        """
        Returns the total number of times the given category has been
        observed on individual features.
        """
        return self.features_in_category_counts[category]

    def get_number_of_items_in_category(self, category):
        """
        Returns the total number of times the given category has been
        observed on individual items.
        """
        return self.items_in_category_counts[category]

    def get_number_of_items(self):
        """
        Returns the total number of observations made.
        """
        return self.item_count

    def get_categories(self):
        """
        Returns a list of all the categories that have been observed.
        """
        return list(self.items_in_category_counts.keys())

    # ## Probability calculations
    # This should be the same for all storage and feature detection
    # approaches.
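    # The estimates below use Laplace (add-one) smoothing:
    #   p(feature|category) =
    #       (feature_category_count + 1) / (features_in_category + num_features)
    #   p(feature) =
    #       (feature_count + 1) / (num_items + num_features)
    # and Bayes' rule, treating features as independent, combines them:
    #   p(category|document) =
    #       p(category) * prod(p(feature|category)) / prod(p(feature))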
    def get_p_of_category_given_features(self, features, category):
        """
        Returns the probability of the document (made up of the given
        features) belonging to the given category: generated as a
        product of its weighted feature probabilities.
        """
        category_count = self.get_number_of_features_in_category(category)
        num_features_seen = self.get_number_of_features()
        num_items = self.get_number_of_items()
        num_features_here = len(features)

        # Find the probability p(document|category), which is the
        # product of all the p(feature|category) for component
        # features. Also find the probability p(document), which is
        # the probability of seeing the given features in a document,
        # again the product of p(feature). We work in logarithms to
        # avoid underflow issues, and skip denominators until the next
        # step.
        lpdc = 0.0
        lpd = 0.0
        for feature in features:
            lpdc += math.log(
                self.get_feature_category_count(feature, category) + 1.0
            )
            lpd += math.log(self.get_feature_count(feature) + 1.0)

        # Rather than dividing the probabilities out in the loop
        # above, we do so now.
        lpdc_denom = math.log(category_count + num_features_seen)
        lpdc -= lpdc_denom * num_features_here

        # This is optional. Without it we get output values that are
        # proportional to the probabilities, but not the actual
        # probabilities. This is fine for raw comparisons, but proper
        # probabilities are easier to understand.
        lpd_denom = math.log(num_items + num_features_seen)
        lpd -= lpd_denom * num_features_here

        # Find the log of the prior probability of seeing this
        # category.
        lpc = math.log(
            float(self.get_number_of_items_in_category(category)) /
            float(self.get_number_of_items())
        )
        # Bayes' rule gives
        # p(category|document) = p(document|category) * p(category) / p(document)
        return math.exp(lpdc + lpc - lpd)

    def classify(self, document):
        """
        Returns a list of (probability, category) pairs for the given
        document, sorted so the highest probability comes first.
        """
        # Split into features once and for all.
        features = self.split_into_features(document)

        # Find probabilities associated with each category.
        results = []
        for category in self.get_categories():
            p = self.get_p_of_category_given_features(features, category)
            results.append((p, category))

        # Sort so the highest probability is first.
        results.sort(reverse=True)
        return results
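
# A minimal sketch of overriding `split_into_features` for non-text data,
# as the class docstring suggests. The `TagClassifier` name and the idea
# that items arrive as iterables of pre-extracted tags are illustrative
# assumptions, not part of the original gist.
class TagClassifier(NaiveBayesClassifier):
    def split_into_features(self, item):
        # `item` is assumed to be an iterable of hashable tags; the
        # base class's math only needs a unique set of features back.
        return set(item)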

if __name__ == '__main__':
    # An example of the class in use, in lieu of unit tests.
    cl = NaiveBayesClassifier()
    cl.train('apple pear', 'v')
    cl.train('apple orange', 'v')
    cl.train('apple lemon', 'v')
    cl.train('apple turkey pork', 'not-v')
    print(cl.classify('turkey, apple sauce, pork sausage'))
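
    # classify returns (probability, category) pairs, highest first, so
    # a single best guess is the category of the first pair.
    probability, category = cl.classify('turkey, apple sauce, pork sausage')[0]
    print(category)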