MollsReis · January 21, 2016 22:54
diff --git a/mnb.rb b/mnb.rb
 # Multinomial Naive Bayes text classifier with add-one (Laplace) smoothing.
 #
 # Training rows are [text, label] pairs. Text is reduced to lower-case
 # alphabetic tokens; the same tokenization is applied to text passed to
 # #classify so that query words line up with the training vocabulary.
 class MNB
   # @param examples [Array<Array>] rows whose first element is the text
   #   (String) and whose last element is the label ("bucket")
   def initialize(examples)
     @examples = examples.map { |ex| [tokenize(ex.first), ex.last] }
     @buckets = @examples.map { |ex| ex.last }.uniq
     @vocab_size = @examples.map { |ex| ex.first }.flatten.uniq.count

     # Gather the per-bucket statistics once so later lookups do not have to
     # rescan every example.
     @bucket_sizes = Hash.new(0)  # bucket => number of training rows
     @word_totals = Hash.new(0)   # bucket => total number of tokens
     @word_counts = {}            # bucket => { word => occurrences }
     @examples.each do |tokens, bucket|
       @bucket_sizes[bucket] += 1
       @word_totals[bucket] += tokens.count
       counts = (@word_counts[bucket] ||= Hash.new(0))
       tokens.each { |token| counts[token] += 1 }
     end
   end

   # Prior probability of a bucket: its share of the training rows.
   #
   # @param bucket [Object] a label
   # @return [Float] 0.0 for a label that never occurred in training
   def prob_bucket(bucket)
     @bucket_sizes[bucket] / @examples.count.to_f
   end

   # Smoothed likelihood of a word within a bucket:
   # (occurrences of word in bucket + 1) / (tokens in bucket + vocabulary size).
   #
   # The word is looked up exactly as given (no normalization here).
   #
   # @param word [String]
   # @param bucket [Object] a label
   # @return [Float]
   def prob_word_given_bucket(word, bucket)
     counts = @word_counts[bucket]
     seen = counts ? counts[word] : 0
     (seen + 1) / (@word_totals[bucket] + @vocab_size).to_f
   end

   # Picks the most probable bucket for a piece of text.
   #
   # Scores are summed in log space; multiplying the raw probabilities
   # underflows to 0.0 for long inputs and makes every bucket tie.
   #
   # @param words [String] text to classify
   # @return [Object, nil] the winning label, or nil when there is no
   #   training data
   def classify(words)
     tokens = tokenize(words)
     @buckets.max_by do |bucket|
       tokens.reduce(Math.log(prob_bucket(bucket))) do |score, word|
         score + Math.log(prob_word_given_bucket(word, bucket))
       end
     end
   end

   private

   # Lower-cased alphabetic tokens of +text+; every other character acts as
   # a separator.
   def tokenize(text)
     text.gsub(/[^a-zA-Z]/, ' ').downcase.split
   end
 end

 require 'csv'
 # Train on the CSV rows read from DATA (text,label per row) and print the
 # predicted label for a sample sentence.
 training_rows = CSV.new(DATA).to_a
 classifier = MNB.new(training_rows)
 puts classifier.classify('china china china tokyo japan').inspect # => "c"

 __END__
 china beijing china,c
 china china shanghai,c
 china macao,c
 tokyo japan china,j
	# Multinomial Naive Bayes text classifier with add-one (Laplace) smoothing.
	#
	# Training rows are [text, label] pairs. Text is reduced to lower-case
	# alphabetic tokens; the same tokenization is applied to text passed to
	# #classify so that query words line up with the training vocabulary.
	class MNB
	  # @param examples [Array<Array>] rows whose first element is the text
	  #   (String) and whose last element is the label ("bucket")
	  def initialize(examples)
	    @examples = examples.map { |ex| [tokenize(ex.first), ex.last] }
	    @buckets = @examples.map { |ex| ex.last }.uniq
	    @vocab_size = @examples.map { |ex| ex.first }.flatten.uniq.count

	    # Gather the per-bucket statistics once so later lookups do not have to
	    # rescan every example.
	    @bucket_sizes = Hash.new(0)  # bucket => number of training rows
	    @word_totals = Hash.new(0)   # bucket => total number of tokens
	    @word_counts = {}            # bucket => { word => occurrences }
	    @examples.each do |tokens, bucket|
	      @bucket_sizes[bucket] += 1
	      @word_totals[bucket] += tokens.count
	      counts = (@word_counts[bucket] ||= Hash.new(0))
	      tokens.each { |token| counts[token] += 1 }
	    end
	  end

	  # Prior probability of a bucket: its share of the training rows.
	  #
	  # @param bucket [Object] a label
	  # @return [Float] 0.0 for a label that never occurred in training
	  def prob_bucket(bucket)
	    @bucket_sizes[bucket] / @examples.count.to_f
	  end

	  # Smoothed likelihood of a word within a bucket:
	  # (occurrences of word in bucket + 1) / (tokens in bucket + vocabulary size).
	  #
	  # The word is looked up exactly as given (no normalization here).
	  #
	  # @param word [String]
	  # @param bucket [Object] a label
	  # @return [Float]
	  def prob_word_given_bucket(word, bucket)
	    counts = @word_counts[bucket]
	    seen = counts ? counts[word] : 0
	    (seen + 1) / (@word_totals[bucket] + @vocab_size).to_f
	  end

	  # Picks the most probable bucket for a piece of text.
	  #
	  # Scores are summed in log space; multiplying the raw probabilities
	  # underflows to 0.0 for long inputs and makes every bucket tie.
	  #
	  # @param words [String] text to classify
	  # @return [Object, nil] the winning label, or nil when there is no
	  #   training data
	  def classify(words)
	    tokens = tokenize(words)
	    @buckets.max_by do |bucket|
	      tokens.reduce(Math.log(prob_bucket(bucket))) do |score, word|
	        score + Math.log(prob_word_given_bucket(word, bucket))
	      end
	    end
	  end

	  private

	  # Lower-cased alphabetic tokens of +text+; every other character acts as
	  # a separator.
	  def tokenize(text)
	    text.gsub(/[^a-zA-Z]/, ' ').downcase.split
	  end
	end

	require 'csv'
	# Train on the CSV rows read from DATA (text,label per row) and print the
	# predicted label for a sample sentence.
	training_rows = CSV.new(DATA).to_a
	classifier = MNB.new(training_rows)
	puts classifier.classify('china china china tokyo japan').inspect # => "c"

	__END__
	china beijing china,c
	china china shanghai,c
	china macao,c
	tokyo japan china,j