Simple NaiveBayes Classifier
import collections
import re
import math

class NaiveBayesClassifier:
    """
    A simple naive Bayes classifier that can classify any items into
    any number of categories based on training data. The core
    algorithms in this class are generic, but `split_into_features` is
    specific to analysing words in a message, and the data functions
    (`train` and `get_*`) store their data in memory. All of these can
    be overridden if those assumptions aren't good: see the sketch at
    the bottom of this file.
    """
    def __init__(self):
        # Times each (feature, category) pair has been observed.
        self.feature_category_counts = collections.defaultdict(int)
        # Times each feature has been observed, across all categories.
        self.feature_counts = collections.defaultdict(int)
        # Total feature observations filed under each category.
        self.features_in_category_counts = collections.defaultdict(int)
        # Number of whole items trained into each category.
        self.items_in_category_counts = collections.defaultdict(int)
        self.item_count = 0

    # ## Feature detection
    # Override this function for non-text applications.
    def split_into_features(self, item):
        """
        Splits the given item (a string) into a set of features (its
        component words).
        """
        splitter = re.compile(r'\W+')
        return set([
            word.lower()
            for word in splitter.split(item)
            if 2 < len(word) < 20
        ])
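
    # For example, split_into_features('Apple sauce!') returns the set
    # {'apple', 'sauce'}: words are lowercased, deduplicated, and kept
    # only if they are between 3 and 19 characters long.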

    # ## Data handling
    # Override these functions to store data in another format.
    def train(self, item, category):
        """
        Trains the classifier to put the given item into the given
        category. This splits the item into its features, and then
        registers that each feature has been observed.
        """
        features = self.split_into_features(item)
        # Train ourself that each feature is associated with the given
        # category.
        for feature in features:
            # Note that, although we could potentially pull the same
            # feature out of the features list multiple times here,
            # the math in this class assumes that features only appear
            # once for an item: i.e. features is a unique set, not an
            # ordered list.
            self.feature_category_counts[(feature, category)] += 1
            self.feature_counts[feature] += 1
        # Record that we've seen this category.
        self.features_in_category_counts[category] += len(features)
        self.items_in_category_counts[category] += 1
        self.item_count += 1
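
    # For example, after train('apple pear', 'v') the classifier holds
    # feature_category_counts[('apple', 'v')] == 1,
    # feature_counts['pear'] == 1,
    # features_in_category_counts['v'] == 2, and item_count == 1.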

    def get_feature_category_count(self, feature, category):
        """
        Returns the number of times the given feature has been
        observed in the given category.
        """
        return self.feature_category_counts[(feature, category)]

    def get_feature_count(self, feature):
        """
        Returns the number of times the given feature has been
        observed.
        """
        return self.feature_counts[feature]

    def get_number_of_features(self):
        """
        Returns the total number of distinct features observed.
        """
        return len(self.feature_counts)

    def get_number_of_features_in_category(self, category):
        """
        Returns the total number of times the given category has been
        observed on individual features.
        """
        return self.features_in_category_counts[category]

    def get_number_of_items_in_category(self, category):
        """
        Returns the total number of times the given category has been
        observed on individual items.
        """
        return self.items_in_category_counts[category]

    def get_number_of_items(self):
        """
        Returns the total number of observations made.
        """
        return self.item_count

    def get_categories(self):
        """
        Returns a list of all the categories that have been observed.
        """
        return list(self.items_in_category_counts.keys())

    # ## Probability calculations
    # This should be the same for all storage and feature detection
    # approaches.
    def get_p_of_category_given_features(self, features, category):
        """
        Returns the probability of the document (made up of the given
        features) belonging to the given category, generated as a
        product of its weighted feature probabilities.
        """
        category_count = self.get_number_of_features_in_category(category)
        num_features_seen = self.get_number_of_features()
        num_items = self.get_number_of_items()
        num_features_here = len(features)
        # Find the probability p(document|category), which is the
        # product of all the p(feature|category) for component
        # features. Also find the probability p(document), which is
        # the probability of seeing the given features in a document,
        # again the product of p(feature). We work in logarithms to
        # avoid underflow issues, and skip denominators until the
        # next step. The +1.0 is add-one (Laplace) smoothing, so a
        # feature never seen in this category doesn't zero out the
        # whole product.
        lpdc = 0.0
        lpd = 0.0
        for feature in features:
            lpdc += math.log(
                self.get_feature_category_count(feature, category) + 1.0
            )
            lpd += math.log(self.get_feature_count(feature) + 1.0)
        # Rather than dividing the probabilities out in the loop
        # above, we do so now.
        lpdc_denom = math.log(category_count + num_features_seen)
        lpdc -= lpdc_denom * num_features_here
        # This is optional. Without it we get output values that are
        # proportional to the probabilities, but not the actual
        # probabilities. This is fine for raw comparisons, but proper
        # probabilities are easier to understand.
        lpd_denom = math.log(num_items + num_features_seen)
        lpd -= lpd_denom * num_features_here
        # Find the log of the prior probability of seeing this category.
        lpc = math.log(
            float(self.get_number_of_items_in_category(category)) /
            float(self.get_number_of_items())
        )
        # Bayes' rule gives
        # p(category|document) = p(document|category) * p(category) / p(document)
        return math.exp(lpdc + lpc - lpd)
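
    # Worked example using the demo data at the bottom of this file:
    # after the four train() calls, the 'v' category has 6 feature
    # observations and 6 distinct features exist overall, so 'apple'
    # (seen 3 times in 'v') contributes log((3 + 1) / (6 + 6)) to lpdc.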

    def classify(self, document):
        """
        Returns a list of decreasing probabilities and the categories
        they correspond to, for the given document.
        """
        # Split into features once and for all.
        features = self.split_into_features(document)
        # Find probabilities associated with each category.
        results = []
        for category in self.get_categories():
            p = self.get_p_of_category_given_features(features, category)
            results.append((p, category))
        # Sort so the highest probability is first.
        results.sort(reverse=True)
        return results


if __name__ == '__main__':
    # An example of the class in use. In lieu of unit-tests.
    cl = NaiveBayesClassifier()
    cl.train('apple pear', 'v')
    cl.train('apple orange', 'v')
    cl.train('apple lemon', 'v')
    cl.train('apple turkey pork', 'not-v')
    print(cl.classify('turkey, apple sauce, pork sausage'))
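    # With this training data, 'not-v' should come out on top:
    # 'turkey' and 'pork' have only ever been seen in that category.

    # A minimal sketch of the override point mentioned in the class
    # docstring, assuming items arrive as pre-tokenised tuples rather
    # than strings. The TupleFeatureClassifier name and the tuple
    # input format are illustrative assumptions, not part of the
    # original gist.
    class TupleFeatureClassifier(NaiveBayesClassifier):
        def split_into_features(self, item):
            # The item is already an iterable of feature tokens, so
            # just deduplicate it into a set.
            return set(item)

    tcl = TupleFeatureClassifier()
    tcl.train(('red', 'round', 'sweet'), 'fruit')
    tcl.train(('red', 'square', 'plastic'), 'toy')
    print(tcl.classify(('red', 'sweet')))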