Created
July 25, 2013 05:50
-
-
Save GarrisonJ/6077192 to your computer and use it in GitHub Desktop.
A simple Bayesian classifier written in python.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2013, Garrison Jensen <[email protected]>
# All rights reserved.
# Licensed under the MIT license: http://www.opensource.org/licenses/mit-license.php
class PyBay:
    # Public: A simple Bayesian classifier that scores strings as "bad".
    #
    # Strings are split on whitespace into tokens; training counts how often
    # each token appears in strings labelled "good" and "bad".
    #
    # Example
    #   pb = PyBay()
    #   pb.train("bad", "buy cheap pills")
    #   pb.train("good", "see you at lunch")
    #   pb.score("cheap pills")
    #   # => a number on a 0-100 scale; higher means more likely bad.

    # Class-level defaults. The two dictionaries are rebound per instance in
    # __init__ so that separate classifiers never share training data.
    TokBad = {}        # Bad tokens: token -> occurrences in bad strings
    TokGood = {}       # Good tokens: token -> occurrences in good strings
    TotalTok = 0       # Count of total tokens (good and bad)
    BadTokCount = 0    # Count of bad tokens
    GoodCount = 0.0    # Count of good strings
    BadCount = 0.0     # Count of bad strings

    # Public: Create an untrained classifier with its own private counters.
    def __init__(self):
        self.TokBad = {}
        self.TokGood = {}
        self.TotalTok = 0
        self.BadTokCount = 0
        self.GoodCount = 0.0
        self.BadCount = 0.0

    # Public: Train the Bayesian filter with a labelled string.
    #
    # GoodorBad - "bad" records the string as bad, "good" records it as good
    #             (case-insensitive). Any other label prints a hint and
    #             leaves the counters untouched.
    # Tokens    - String to learn from; it is split on whitespace.
    #
    # Example
    #   train("good", "This is a good string")
    #   # => Records "This is a good string" as a good string.
    #
    # Returns None.
    def train(self, GoodorBad, Tokens):
        toks = Tokens.split()
        label = GoodorBad.lower()

        if label == "good":
            counts = self.TokGood
        elif label == "bad":
            counts = self.TokBad
        else:
            # Parenthesised form behaves the same on Python 2 and Python 3.
            print("enter bad or good")
            return

        for tok in toks:
            counts[tok] = counts.get(tok, 0) + 1
        self.TotalTok = self.TotalTok + len(toks)

        if label == "good":
            self.GoodCount = self.GoodCount + 1
        else:
            self.BadTokCount = self.BadTokCount + len(toks)
            self.BadCount = self.BadCount + 1

    # Public: Score how likely a string is to be bad.
    #
    # a_string - String to be scored; it is split on whitespace.
    #
    # Example
    #   pb = PyBay()
    #   pb.train("bad", "spam spam eggs")
    #   pb.train("good", "ham eggs")
    #   pb.score("ham")
    #   # => 0.0
    #
    # Returns the mean per-token result of bayestheorem multiplied by 100.
    # Returns 0.0 when the string has no tokens or no bad string has been
    # trained. Tokens never seen in a bad string contribute 0.
    #
    # NOTE: the prior is computed from string counts while the per-token
    # probabilities are computed from token counts, so the result is not
    # guaranteed to stay within 0-100.
    def score(self, a_string):
        tokens = a_string.split()
        if not tokens or self.BadCount == 0:
            return 0.0

        p_of_bad = float(self.BadCount) / float(self.BadCount + self.GoodCount)
        bad_tok_total = float(self.BadTokCount)
        tok_total = float(self.TotalTok)

        p_total = 0.0
        for tok in tokens:
            seen_bad = self.TokBad.get(tok, 0)
            seen_any = seen_bad + self.TokGood.get(tok, 0)
            # Guard the divisions: a bad string may have contained no tokens.
            p_of_bd_wth_wrd = seen_bad / bad_tok_total if bad_tok_total else 0.0
            p_of_wrd = seen_any / tok_total if tok_total else 0.0
            p_total = p_total + self.bayestheorem(p_of_bad, p_of_wrd, p_of_bd_wth_wrd)
        return p_total / len(tokens) * 100

    # Public: Apply Bayes' theorem to get the chance of 'a' given 'b'.
    #
    # p_a   - Chance of 'a'
    # p_b   - Chance of 'b'
    # p_b_a - Chance of 'b' given 'a'
    #
    # Example
    #   bayestheorem(0.5, 0.5, 1.0)
    #   # => 1.0
    #
    # Returns (p_b_a * p_a) / p_b, or 0.0 when p_b is not positive.
    def bayestheorem(self, p_a, p_b, p_b_a):
        if p_b > 0:
            return (p_b_a * p_a) / p_b
        return 0.0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment