@agarie
Last active December 15, 2015 05:39
A simple implementation of the Naïve Bayes classifier for English text in Ruby. It's a prototype; more to come.
# A very simple Naive Bayes classifier for English text.
#
# Author: Carlos Agarie <[email protected]>
# Some assumptions:
# - The stop words, dataset and test data are hardcoded.
# - Binary classification (so each phrase has a corresponding boolean value).
# - If there's a missing feature, it's ignored.
#
# Of course this isn't a finished "product". I'm studying classifiers, so this
# is but a prototype. I intend to make it more general (but not much) by adding
# the capability to use it on different kinds of data (numerical, text and
# discrete) and for it to use NMatrix and possibly Statsample as underlying
# libraries.
require 'set'
require 'fast_stemmer' # Adds String#stem.

# Taken from https://github.com/alexandru/stuff-classifier
@stop_words = Set.new([
'a', 'about', 'above', 'across', 'after', 'afterwards',
'again', 'against', 'all', 'almost', 'alone', 'along',
'already', 'also', 'although', 'always', 'am', 'among',
'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
'are', 'around', 'as', 'at', 'back', 'be',
'became', 'because', 'become', 'becomes', 'becoming', 'been',
'before', 'beforehand', 'behind', 'being', 'below', 'beside',
'besides', 'between', 'beyond', 'bill', 'both', 'bottom',
'but', 'by', 'call', 'can', 'cannot', 'cant', 'dont',
'co', 'computer', 'con', 'could', 'couldnt', 'cry',
'de', 'describe', 'detail', 'do', 'done', 'down',
'due', 'during', 'each', 'eg', 'eight', 'either',
'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever',
'every', 'everyone', 'everything', 'everywhere', 'except', 'few',
'fifteen', 'fify', 'fill', 'find', 'fire', 'first', 'five',
'for', 'former', 'formerly', 'forty', 'found', 'four',
'from', 'front', 'full', 'further', 'get', 'give',
'go', 'had', 'has', 'hasnt', 'have', 'he',
'hence', 'her', 'here', 'hereafter', 'hereby', 'herein',
'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
'how', 'however', 'hundred', 'i', 'ie', 'if',
'in', 'inc', 'indeed', 'interest', 'into', 'is',
'it', 'its', 'itself', 'keep', 'last', 'latter',
'latterly', 'least', 'less', 'ltd', 'made', 'many',
'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
'more', 'moreover', 'most', 'mostly', 'move', 'much',
'must', 'my', 'myself', 'name', 'namely', 'neither',
'never', 'nevertheless', 'next', 'nine', 'no', 'nobody',
'none', 'noone', 'nor', 'not', 'nothing', 'now',
'nowhere', 'of', 'off', 'often', 'on', 'once',
'one', 'only', 'onto', 'or', 'other', 'others',
'otherwise', 'our', 'ours', 'ourselves', 'out', 'over',
'own', 'part', 'per', 'perhaps', 'please', 'put',
'rather', 're', 'same', 'see', 'seem', 'seemed',
'seeming', 'seems', 'serious', 'several', 'she', 'should',
'show', 'side', 'since', 'sincere', 'six', 'sixty',
'so', 'some', 'somehow', 'someone', 'something', 'sometime',
'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
'ten', 'than', 'that', 'the', 'their', 'them',
'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby',
'therefore', 'therein', 'thereupon', 'these', 'they', 'thick',
'thin', 'third', 'this', 'those', 'though', 'three',
'through', 'throughout', 'thru', 'thus', 'to', 'together',
'too', 'top', 'toward', 'towards', 'twelve', 'twenty',
'two', 'un', 'under', 'until', 'up', 'upon',
'us', 'very', 'via', 'was', 'we', 'well',
'were', 'what', 'whatever', 'when', 'whence', 'whenever',
'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
'wherever', 'whether', 'which', 'while', 'whither', 'who',
'whoever', 'whole', 'whom', 'whose', 'why', 'will',
'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours',
'yourself', 'yourselves'
])

# The dataset is made of two-element arrays: a boolean and a string.
# Most of it was taken from another implementation of naive Bayes in Ruby:
# bionicspirit.com/blog/2012/02/09/howto-build-naive-bayes-classifier.html.
# The other examples are from the Wikipedia articles about dogs and cats.
@dataset = Set.new([
[true, 'Dogs are awesome, cats too. I love my dog'],
[false, "Cats are more preferred by software developers. I never could stand cats. I have a dog"],
[true, "My dog's name is Willy. He likes to play with my wife's cat all day long. I love dogs"],
[false, "Cats are difficult animals, unlike dogs, really annoying, I hate them all"],
[true, "So which one should you choose? A dog, definitely."],
[false, "The favorite food for cats is bird meat, although mice are good, but birds are a delicacy"],
[true, "A dog will eat anything, including birds or whatever meat"],
[false, "My cat's favorite place to purr is on my keyboard"],
[true, "My dog's favorite place to take a leak is the tree in front of our house"],
[false, "The domestic cat (Felis catus or Felis silvestris catus) is a small, usually furry, domesticated, and carnivorous mammal. It is often called the housecat when kept as an indoor pet, or simply the cat when there is no need to distinguish it from other felids and felines. Cats are valued by humans for companionship and their ability to hunt vermin and household pests."],
[false, "Despite being solitary hunters, cats are a social species, and cat communication includes the use of a variety of vocalizations (meowing, purring, trilling, hissing, growling and grunting) as well as cat pheromones and types of cat-specific body language."],
[true, "The domestic dog (Canis lupus familiaris) is a subspecies of the gray wolf (Canis lupus), a member of the Canidae family of the mammalian order Carnivora. The term \"domestic dog\" is generally used for both domesticated and feral varieties. The dog has been the first animal to be domesticated and has been the most widely kept working, hunting, and pet animal in human history. The word \"dog\" may also mean the male of a canine species, as opposed to the word \"bitch\" for the female of the species."],
[true, "Most breeds of dogs are at most a few hundred years old, having been artificially selected for particular morphologies and behaviors by people for specific functional roles."],
[true, "Through this selective breeding, the dog has developed into hundreds of varied breeds, and shows more behavioral and morphological variation than any other land mammal."]
])

# Taken from the same implementation mentioned in the dataset comment and
# from Wikipedia as well.
@tests = Set.new(["This test is about cats",
"This test is about dogs",
%$The domestic cat was first classified as Felis catus by Carolus Linnaeus in the tenth edition of his Systema Naturae in. However, because of modern phylogenetics, domestic cats are now usually regarded as another subspecies of the wildcat, Felis silvestris. This has resulted in mixed usage of the terms, as the domestic cat can be called by its subspecies name, Felis silvestris catus. Wildcats have also been referred to as various subspecies of F. catus, but in the International Commission on Zoological Nomenclature fixed the name for wildcats as F. silvestris. The most common name in use for the domestic cat remains F. catus, following a convention for domesticated animals of using the earliest (the senior) synonym proposed. Sometimes the domestic cat has been called Felis domesticus or Felis domestica, as proposed by German naturalist J. C. P. Erxleben in, but these are not valid taxonomic names and have been used only rarely in scientific literature, because Linnaeus's binomial takes precedence.$,
%$Although experts largely disagree over the details of dog domestication, it is agreed that human interaction played a significant role in shaping the subspecies. Domestication may have occurred initially in separate areas, particularly Siberia and Europe. Currently it is thought domestication of our current lineage of dog occurred sometime as early as years ago and arguably as late as years ago. Shortly after the latest domestication, dogs became ubiquitous in human populations, and spread throughout the world.$])

# Each category has a hash of counters, one counter for each word.
@counters = {
  true => Hash.new(0),
  false => Hash.new(0)
}
@categories_counter = Hash.new(0)
@words_counter = Hash.new(0)
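An aside on those counter hashes: `Hash.new(0)` makes a missing key read as 0 instead of nil, which is what lets the training code do `+= 1` without initializing each word first. A quick illustration:

```ruby
# A hash created with a default value of 0 behaves as a counter:
# reading a missing key returns the default instead of nil.
counter = Hash.new(0)
%w[dog cat dog].each { |w| counter[w] += 1 }
counter["dog"]  # => 2
counter["bird"] # => 0 (missing key falls back to the default)
```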

# Clean the string by stripping symbols and numbers, removing stop words, and
# then stemming each word. Return an array of words.
def process(str)
  str.gsub(/[,.?!\\()0-9\[\]]/, '').
    split(' ').
    reject { |w| @stop_words.include?(w.downcase) }.
    map { |w| w.stem }
end
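For illustration, here is roughly what `process` does, sketched without the `fast_stemmer` gem: the stop-word set below is a tiny hypothetical subset, and the stemming step is skipped.

```ruby
require 'set'

# Mimic process: strip punctuation and digits, split on spaces, and drop
# stop words. The real method additionally stems each remaining word.
stop = Set.new(%w[the is a my])
tokens = "My dog's name is Willy, he is 3!"
         .gsub(/[,.?!\\()0-9\[\]]/, '')
         .split(' ')
         .reject { |w| stop.include?(w.downcase) }
tokens # => ["dog's", "name", "Willy", "he"]
```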

# Add information from the example to the model.
def train(example)
  words = process(example.last)
  @categories_counter[example.first] += 1

  words.each do |w|
    # +1 appearance of this word in this category.
    @counters[example.first][w] += 1
    # +1 appearance of this word overall.
    @words_counter[w] += 1
  end
end
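To see what `train` builds up, here is the same bookkeeping run standalone on one hypothetical, already-processed example (local variable names mirror the instance variables above):

```ruby
# The three structures train maintains: per-category word counts,
# per-category document counts, and global word counts.
counters = { true => Hash.new(0), false => Hash.new(0) }
categories_counter = Hash.new(0)
words_counter = Hash.new(0)

example = [true, %w[dog love dog]] # [category, processed words]
categories_counter[example.first] += 1
example.last.each do |w|
  counters[example.first][w] += 1 # word count within this category
  words_counter[w] += 1           # word count across all categories
end

counters[true]["dog"] # => 2
words_counter["dog"]  # => 2
```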

# Return true if the given document is classified into the "true" (dog)
# category, false otherwise.
def classify(doc)
  num_of_docs = @dataset.size
  words = process(doc)

  true_prob, false_prob = [true, false].map do |k|
    cat_p = @categories_counter[k].to_f / num_of_docs

    # Accumulate the log-probability of each word given the category (true or
    # false). Logarithms are used for numerical stability.
    prob = words.reduce(0.0) do |acc, word|
      # The counters hold integers, so comparing them with zero is exact.
      if @counters[k][word] != 0 && @words_counter[word] != 0
        acc += Math.log2((@counters[k][word].to_f / @words_counter[word]) / cat_p)
      end
      acc
    end

    Math.log2(cat_p) + prob
  end

  # Return true if true_prob > false_prob, false otherwise.
  true_prob > false_prob
end
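Why the `Math.log2` calls: multiplying many small per-word probabilities directly underflows 64-bit floats to 0.0 for longer documents, while summing their logarithms stays comfortably in range. A minimal demonstration:

```ruby
# Forty factors of 1e-10 multiply to 1e-400, far below the smallest
# positive Float, so the product collapses to 0.0. The log-sum does not.
probs = [1e-10] * 40

product = probs.reduce(:*)               # underflows to 0.0
log_sum = probs.sum { |p| Math.log2(p) } # about -1328.8, perfectly finite
product # => 0.0
```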

# Let's test it!
@dataset.each { |d| train(d) }

@tests.each do |t|
  b = classify(t)
  text = t.size > 100 ? t[0..100] + "..." : t

  puts text
  puts(b ? "Result: Dog" : "Result: Cat")
end