Created
June 27, 2014 20:43
-
-
Save TurplePurtle/a1a86bc3365eb2e557d9 to your computer and use it in GitHub Desktop.
Bayes Classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'fast_stemmer' | |
require 'set' | |
# Naive-Bayes-style text classifier over a fixed list of categories.
#
# Typical use:
#
#   bc = BayesClassifier.new([:spam, :ham])
#   bc.train(:spam, "cheap pills")
#   bc.classify("cheap")              # => :spam
#
# Text is lower-cased, stripped to the letters a-z, split on spaces,
# filtered through STOP_WORDS and finally stemmed with String#stem
# (expected to be provided by the stemmer library the file requires).
class BayesClassifier
  # Words dropped before stemming. The historical misspelling 'fify' is kept
  # for backward compatibility; the intended 'fifty' is listed as well.
  STOP_WORDS = Set.new(%w[
    a about above across after afterwards
    again against all almost alone along
    already also although always am among
    amongst amoungst amount an and another
    any anyhow anyone anything anyway anywhere
    are around as at back be
    became because become becomes becoming been
    before beforehand behind being below beside
    besides between beyond bill both bottom
    but by call can cannot cant dont
    co computer con could couldnt cry
    de describe detail do done down
    due during each eg eight either
    eleven else elsewhere empty enough etc even ever every
    everyone everything everywhere except few fifteen
    fify fifty fill find fire first five
    for former formerly forty found four
    from front full further get give
    go had has hasnt have he
    hence her here hereafter hereby herein
    hereupon hers herself him himself his
    how however hundred i ie if
    in inc indeed interest into is
    it its itself keep last latter
    latterly least less ltd made many
    may me meanwhile might mill mine
    more moreover most mostly move much
    must my myself name namely neither
    never nevertheless next nine no nobody
    none noone nor not nothing now
    nowhere of off often on once
    one only onto or other others
    otherwise our ours ourselves out over
    own part per perhaps please put
    rather re same see seem seemed
    seeming seems serious several she should
    show side since sincere six sixty
    so some somehow someone something sometime
    sometimes somewhere still such system take
    ten than that the their them
    themselves then thence there thereafter thereby
    therefore therein thereupon these they thick
    thin third this those though three
    through throughout thru thus to together
    too top toward towards twelve twenty
    two un under until up upon
    us very via was we well
    were what whatever when whence whenever
    where whereafter whereas whereby wherein whereupon
    wherever whether which while whither who
    whoever whole whom whose why will
    with within without would yet you your yours
    yourself yourselves
  ]).freeze

  # @param categories [Array] the category labels (at least two)
  # @raise [RuntimeError] if +categories+ is not an Array or has fewer than
  #   two elements
  def initialize(categories)
    raise "categories must be an Array" unless categories.is_a? Array
    raise "At least 2 categories are needed" if categories.size < 2

    @categories = categories.dup.freeze
    @inv_num_cats = 1.0 / @categories.size # uniform prior, 1 / number of categories
    # Zeroed { category => count } hash, copied for every new counter.
    @count_template = @categories.each_with_object({}) { |c, h| h[c] = 0 }.freeze
    @word_counts = {} # stemmed word => { category => occurrences }
    @doc_counts = @count_template.dup # category => number of trained documents
  end

  # Records one training document for +category+.
  #
  # @param category [Object] one of the labels given to #initialize
  # @param string [String, Array<String>, File] the document text
  # @return [Array<String>] the tokens extracted from the document
  # @raise [RuntimeError] if +category+ is unknown
  # @raise [StandardError] if +string+ has an unsupported type
  def train(category, string)
    raise "Unknown category" unless @doc_counts.key?(category)

    # Tokenize first so that unsupported input raises before any count changes.
    tokens = tokenize(string)
    @doc_counts[category] += 1
    tokens.each do |w|
      (@word_counts[w] ||= @count_template.dup)[category] += 1
    end
  end
  alias_method :train!, :train

  # @return [Boolean] whether #classify picks +category+ for +string+
  def classified_as?(category, string)
    classify(string) == category
  end

  # Returns the best-scoring category for +string+.
  #
  # [score, index] pairs are compared, so when several categories share the
  # top score the one listed last wins.
  def classify(string)
    scores = category_scores(string)
    @categories[scores.each_with_index.max[1]]
  end

  # Computes one normalized score per category, in the order the categories
  # were given to #initialize. Words never seen in training are ignored; if no
  # known word remains every category gets the uniform score.
  #
  # @return [Array<Float>] scores summing to 1.0
  def category_scores(string)
    word_scores = tokenize(string).map { |w| word_probabilities(w) }.compact
    return Array.new(@categories.size, @inv_num_cats) if word_scores.empty?

    # Accumulate in log space: a plain product of thousands of probabilities
    # underflows to 0.0 for every category and would normalize to NaN.
    log_scores = Array.new(@categories.size, 0.0)
    word_scores.each do |probs|
      probs.each_with_index { |p, i| log_scores[i] += Math.log(p) }
    end

    # Shift by the maximum so the best category maps to exp(0) == 1.0.
    peak = log_scores.max
    scores = log_scores.map { |s| Math.exp(s - peak) }
    total = scores.reduce(:+)
    scores.map! { |s| s / total }
  end

  private

  # Per-category probability for a single stemmed word, blended with the
  # uniform prior in proportion to how often the word was seen.
  #
  # @return [Array<Float>, nil] nil when the word was never seen in training
  def word_probabilities(word)
    word_count = @word_counts[word]
    return nil unless word_count

    likelihoods = @categories.map do |c|
      docs = @doc_counts[c]
      # A category without training documents contributes 0 instead of 0.0/0 (NaN).
      docs.zero? ? 0.0 : word_count[c].to_f / docs
    end
    # A known word was trained under at least one category, so norm > 0.
    norm = likelihoods.reduce(:+)
    occurrences = word_count.values.reduce(:+)
    likelihoods.map { |p| (@inv_num_cats + occurrences * p / norm) / (1 + occurrences) }
  end

  # Turns the input into an array of stemmed, non-stop-word tokens.
  #
  # @param text [String, Array<String>, File] a File is read with #readlines
  # @param in_place [Boolean] when true the given String (or the Array and its
  #   Strings) is cleaned destructively instead of being copied
  # @raise [StandardError] for any other input type
  def tokenize(text, in_place = false)
    text = text.readlines if text.is_a? File

    cleaned =
      case text
      when String
        # dup, not clone: clone would carry over a frozen flag and the
        # destructive cleanup below would raise.
        fix_string(in_place ? text : text.dup)
      when Array
        lines =
          if in_place
            text.map! { |ln| fix_string(ln) }
          else
            text.map { |ln| fix_string(ln.dup) }
          end
        # Join with a space so adjacent elements do not fuse into one word.
        lines.join(" ")
      else
        raise StandardError, "Text must be a String or array of Strings."
      end

    words = cleaned.split(" ")
    words.reject! { |w| STOP_WORDS.member?(w) }
    words.map!(&:stem)
  end

  # Destructively normalizes +s+ to lower-case a-z words separated by spaces.
  #
  # @param s [String] a mutable string
  # @return [String] the same (modified) string
  def fix_string(s)
    unless s.valid_encoding?
      # Mark everything that cannot be represented, then drop each token that
      # contains a marker.
      s.encode!('ascii', invalid: :replace, replace: '<<bad>>', undef: :replace)
      s.encode!('UTF-8')
      s.gsub!(/\S*<<bad>>\S*/, ' ')
    end
    s.gsub!(/[.>\s]/, " ") # ".", ">" and any whitespace (tabs, newlines) separate words
    s.downcase!
    s.gsub!(/[^a-z ]+/, "") # only keep letters and spaces
    s
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment