require 'rubygems'
require 'matrix'
require 'lingua/stemmer'
require 'csv'
require 'erb'
require 'iconv'
require 'sanitize'
require './cleaner.rb' # local helper that provides TextCleaner

STEMMER = Lingua::Stemmer.new
class Document < Hash
  attr_reader :text

  def initialize(text)
    super(0.0) # missing terms default to a count of 0.0
    @text = Sanitize.clean(TextCleaner.new.clean(text))
    @tokens = []
    tokenize
    stem!
    classify!
  end

  # Scale each term count by the value recorded for it in the corpus hash.
  def normalize(corpus)
    each do |k, v|
      self[k] = v / corpus[k] if corpus[k] > 0
    end
  end

  # Calculate the Pearson correlation between this document's term vector
  # and another's, over a vocabulary of tlength terms.
  def compare(other, tlength)
    raise TypeError unless other.is_a? Document
    sum_mine = values.reduce(0, &:+)
    sum_theirs = other.values.reduce(0, &:+)
    sq_sum_mine = values.map { |i| i ** 2 }.reduce(0, &:+)
    sq_sum_theirs = other.values.map { |i| i ** 2 }.reduce(0, &:+)
    products = keys.reduce(0) do |memo, key|
      memo + self[key] * other[key]
    end
    numerator = products - (sum_mine * sum_theirs / tlength)
    denominator = Math.sqrt((sq_sum_mine - (sum_mine ** 2) / tlength) *
                            (sq_sum_theirs - (sum_theirs ** 2) / tlength))
    return 0.0 if denominator == 0
    numerator / denominator
  end

  def inspect
    "<Document @text=#{@text.slice(0..100).gsub(/\n+/, " ") + "..."}>"
  end

  private

  # Lowercase, strip punctuation and digits, then drop stopwords and
  # single-character tokens (other than "i").
  def tokenize
    @tokens = @text.downcase.gsub(/[^\w\s]/, "").
              gsub(/[0-9]+/, "").
              split(/\s+/).
              delete_if { |it| STOPWORDS.include? it }.
              delete_if { |it| it.length == 1 && it != 'i' }
  end

  def stem!
    @tokens.map! { |tok| STEMMER.stem(tok) }
  end

  # Count the occurrences of each stemmed token into this Hash.
  def classify!
    merge!(@tokens.reduce(Hash.new(0.0)) { |memo, tok| memo[tok] += 1; memo })
  end
end
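# For reference, Document#compare above uses the computational form of
# Pearson's r over the two term vectors:
#
#   r = (sum(x*y) - sum(x) * sum(y) / n) /
#       sqrt((sum(x^2) - sum(x)^2 / n) * (sum(y^2) - sum(y)^2 / n))
#
# where n is the tlength argument (the corpus vocabulary size when called
# from Corpus below). A rough sketch of a direct call -- the strings and the
# vocabulary size here are placeholders, not data from this gist:
#
#   a = Document.new("the quick brown fox jumped over the lazy dog")
#   b = Document.new("the quick brown fox sat beside the lazy dog")
#   a.compare(b, 10)  # => a float; closer to 1.0 means more similar term use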
class Cluster
  attr_accessor :documents

  def initialize(doc, tlength)
    @documents = [doc]
    @tlength = tlength
  end

  def lonely?
    @documents.length <= 2
  end

  # Add doc to the cluster if its average Pearson score against the current
  # members falls inside the threshold window; scores above the window are
  # treated as straight duplicates and rejected along with unrelated ones.
  def conj(doc, threshold = 0.75..0.95)
    avg = @documents.map { |d| d.compare(doc, @tlength) }.reduce(0, &:+) / @documents.length
    if threshold.include? avg
      @documents << doc
      true
    else
      false
    end
  end

  def length
    @documents.length
  end
end
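# A rough sketch of how Cluster#conj is driven (seed_doc, candidate_doc, and
# vocab_size are placeholder names, not part of this gist):
#
#   cluster = Cluster.new(seed_doc, vocab_size)
#   cluster.conj(candidate_doc)             # => true if admitted
#   cluster.conj(candidate_doc, 0.5..0.95)  # loosen the lower bound if needed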
class Corpus
  attr_reader :documents, :corpus, :clusters

  def initialize(documents)
    corpus = {}
    i = 0
    @documents = documents.map do |doc|
      d = Document.new doc
      print "corpified: #{i += 1} \r"
      STDOUT.flush
      corpus.merge! d # fold this document's term counts into the corpus hash
      d
    end
    puts
    @length = corpus.length # size of the corpus vocabulary
    @corpus = corpus
    @documents.each { |doc| doc.normalize(corpus) }
    cluster!
  end

  private

  # Greedy single-pass clustering: take the next unassigned document as a
  # seed, pull in every remaining document that scores inside the threshold
  # window against the cluster, and repeat until none are left.
  def cluster!
    to_do = @documents.dup
    @clusters = []
    until to_do.empty?
      cluster = Cluster.new(to_do.shift, @length)
      delete = []
      to_do.each do |other|
        delete << other if cluster.conj other
      end
      to_do -= delete
      @clusters << cluster
      print "to go: #{to_do.length} \r"
      STDOUT.flush
    end
    puts
  end
end
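# A rough end-to-end sketch, assuming `texts` is an array of raw comment
# strings (the name is a placeholder):
#
#   corpus = Corpus.new(texts)
#   corpus.clusters.reject(&:lonely?).each_with_index do |cluster, idx|
#     puts "cluster #{idx}: #{cluster.length} documents"
#   end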
STOPWORDS = ["a","about","above","across","after","afterwards","again","against","all","almost","alone","along","already","also","although","always","am","among","amongst","amoungst","amount","an","and","another","any","anyhow","anyone","anything","anyway","anywhere","are","around","as","at","back","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","below","beside","besides","between","beyond","bill","both","bottom","but","by","call","can","cannot","cant","co","computer","con","could","couldnt","cry","de","describe","detail","do","done","down","due","during","each","eg","eight","either","eleven","else","elsewhere","empty","enough","etc","even","ever","every","everyone","everything","everywhere","except","few","fifteen","fify","fill","find","fire","first","five","for","former","formerly","forty","found","four","from","front","full","further","get","give","go","had","has","hasnt","have","he","hence","her","here","hereafter","hereby","herein","hereupon","hers","herself","him","himself","his","how","however","hundred","i","ie","if","in","inc","indeed","interest","into","is","it","its","itself","keep","last","latter","latterly","least","less","ltd","made","many","may","me","meanwhile","might","mill","mine","more","moreover","most","mostly","move","much","must","my","myself","name","namely","neither","never","nevertheless","next","nine","no","nobody","none","noone","nor","not","nothing","now","nowhere","of","off","often","on","once","one","only","onto","or","other","others","otherwise","our","ours","ourselves","out","over","own","part","per","perhaps","please","put","rather","re","same","see","seem","seemed","seeming","seems","serious","several","she","should","show","side","since","sincere","six","sixty","so","some","somehow","someone","something","sometime","sometimes","somewhere","still","such","system","take","ten","than","that","the","their","them","themselves","then","thence","there","thereafter","thereby","therefore","therein","thereupon","these","they","thick","thin","third","this","those","though","three","through","throughout","thru","thus","to","together","too","top","toward","towards","twelve","twenty","two","un","under","until","up","upon","us","very","via","was","we","well","were","what","whatever","when","whence","whenever","where","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","whoever","whole","whom","whose","why","will","with","within","without","would","yet","you","your","yours","yourself","yourselves"] | |
if __FILE__ == $0
  puts "reading docs"
  docs = Dir[File.join(File.dirname(__FILE__), "comments", "*.txt")].map do |f|
    d = Iconv.iconv('ascii//translit//ignore', 'utf-8', File.read(f)).first
    d.split(/([\dl])\s*of\s*\1/) # split bundled files on "1 of 1"-style markers
  end.flatten
  puts "corpifying"
  c = Corpus.new docs
  template = ERB.new <<-EOF
    <style>
      pre {width: 600px; white-space:pre-wrap}
      hr {width: 600px}
    </style>
    <% c.clusters.each_with_index do |cluster, idx| %>
      <% next if cluster.lonely? %>
      <h1>Cluster <%= idx %></h1>
      <% cluster.documents.each do |doc| %>
        <hr>
        <pre><%= doc.text %></pre>
      <% end %>
    <% end %>
  EOF
  File.open("out.html", "w") { |f| f.write template.result(binding) }
  puts "Total docs in groups: #{c.clusters.reject(&:lonely?).reduce(0) { |memo, cluster| memo + cluster.length }}"
  puts "Total docs #{docs.length}"
  exit
end
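# Running the script as written assumes the ruby-stemmer (lingua/stemmer),
# sanitize, and iconv gems are available, that a local cleaner.rb defines
# TextCleaner#clean, and that a comments/ directory of *.txt files sits next
# to this file; the grouped output is written to out.html.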