Skip to content

Instantly share code, notes, and snippets.

@thejefflarson
Created December 21, 2011 23:55
Show Gist options
  • Save thejefflarson/1508278 to your computer and use it in GitHub Desktop.
Save thejefflarson/1508278 to your computer and use it in GitHub Desktop.
require 'rubygems'
require 'matrix'
require 'lingua/stemmer'
require 'csv'
require 'erb'
require 'iconv'
require 'sanitize'
require './cleaner.rb'
# Single stemmer instance shared by the whole script; Document#stem! passes
# every token through STEMMER.stem.
STEMMER = Lingua::Stemmer.new
# A bag-of-words term vector for one piece of text.
#
# The object is itself a Hash mapping each stemmed token to a Float weight,
# with a default of 0.0 for terms the document does not contain. Weights
# start out as raw occurrence counts and are rescaled in place by #normalize.
class Document < Hash
  # The cleaned text the vector was built from.
  attr_reader :text

  # text - raw String; it is passed through TextCleaner#clean and then
  #        Sanitize.clean before being tokenized, stemmed and counted.
  def initialize(text)
    super(0.0)
    @text = Sanitize.clean(TextCleaner.new.clean(text))
    @tokens = []
    tokenize
    stem!
    classify!
  end

  # Divides every weight by the matching entry of +corpus+ (a Hash of
  # term => number). Terms whose corpus entry is missing or not positive are
  # left untouched. Mutates the receiver and returns it.
  def normalize(corpus)
    each do |term, weight|
      total = corpus[term]
      self[term] = weight / total if total && total > 0
    end
  end

  # Calculates the Pearson score between this document and +other+.
  #
  # other   - the Document to compare against.
  # tlength - the number of dimensions (n) used in the Pearson formula;
  #           Corpus passes the size of its vocabulary.
  #
  # Returns a Float; 0.0 when the score is undefined (zero denominator or a
  # zero +tlength+). Raises TypeError when +other+ is not a Document.
  def compare(other, tlength)
    raise TypeError, "expected a Document, got #{other.class}" unless other.is_a? Document
    # With no dimensions there is nothing to correlate; this also avoids an
    # Integer division by zero when both documents are empty.
    return 0.0 if tlength == 0

    sum_mine = values.reduce(0, &:+)
    sum_theirs = other.values.reduce(0, &:+)
    sq_sum_mine = values.map { |v| v ** 2 }.reduce(0, &:+)
    sq_sum_theirs = other.values.map { |v| v ** 2 }.reduce(0, &:+)
    # Only terms present here can contribute: any other term has weight 0.0
    # on this side, so its product is zero.
    products = keys.reduce(0) { |memo, key| memo + self[key] * other[key] }

    numerator = products - (sum_mine * sum_theirs / tlength)
    denominator = Math.sqrt((sq_sum_mine - (sum_mine ** 2) / tlength) *
                            (sq_sum_theirs - (sum_theirs ** 2) / tlength))
    return 0.0 if denominator == 0
    numerator / denominator
  end

  # Short debugging representation: the first 101 characters of the text with
  # runs of newlines collapsed to a single space.
  def inspect
    "<Document @text=#{@text.slice(0..100).gsub(/\n+/, " ") + "..."}>"
  end

  private

  # Lower-cases the text, strips everything that is neither a word character
  # nor whitespace, strips digits, and splits on whitespace. Stop words and
  # one-character tokens (other than 'i') are discarded.
  def tokenize
    @tokens = @text.downcase.gsub(/[^\w\s]/, "").
      gsub(/[0-9]+/, "").
      split(/\s+/).
      reject do |tok|
        # split(/\s+/) yields a leading "" when the text starts with
        # whitespace; without this check "" would become a term.
        tok.empty? || STOPWORDS.include?(tok) || (tok.length == 1 && tok != 'i')
      end
  end

  # Replaces every token with its stem.
  def stem!
    @tokens.map! { |tok| STEMMER.stem(tok) }
  end

  # Stores the occurrence count of every token in the hash (as Floats,
  # because the default value the counts start from is 0.0).
  def classify!
    merge! @tokens.reduce(Hash.new(0.0)) { |memo, tok| memo[tok] += 1; memo }
  end
end
# A group of documents that scored as similar to one another.
class Cluster
  attr_accessor :documents

  # doc     - the document that seeds the cluster.
  # tlength - forwarded as the second argument of every compare call.
  def initialize(doc, tlength)
    @tlength = tlength
    @documents = [doc]
  end

  # Number of documents currently in the cluster.
  def length
    @documents.length
  end

  # True while the cluster holds two documents or fewer.
  def lonely?
    length <= 2
  end

  # Tries to add +doc+ to the cluster.
  #
  # The candidate is compared with every current member; it joins only when
  # the mean score lies inside +threshold+. The upper bound of the default
  # range keeps straight duplicates out.
  #
  # Returns true when the document was added, false otherwise.
  def conj(doc, threshold = 0.75..0.95)
    total = @documents.inject(0) { |acc, member| acc + member.compare(doc, @tlength) }
    return false unless threshold.include?(total / @documents.length)

    @documents << doc
    true
  end
end
# Builds Documents from raw texts, normalizes them against corpus-wide term
# totals and groups them into Clusters.
class Corpus
  # documents - the Document objects, in input order.
  # corpus    - Hash of term => total occurrences over all documents.
  # clusters  - the Cluster objects produced by clustering.
  attr_reader :documents, :corpus, :clusters

  # documents - an Array of raw text Strings.
  #
  # Prints progress to standard output while it works.
  def initialize(documents)
    corpus = {}
    count = 0
    @documents = documents.map do |text|
      doc = Document.new text
      count += 1
      print "corpified: #{count} \r"
      STDOUT.flush
      # Accumulate the counts. A plain merge! would overwrite each term with
      # the count from whichever document mentioned it last.
      corpus.merge!(doc) { |_term, total, occurrences| total + occurrences }
      doc
    end
    puts
    # Vocabulary size; handed to every Cluster as the vector length.
    @length = corpus.length
    @corpus = corpus
    @documents.each { |doc| doc.normalize(corpus) }
    cluster!
  end

  private

  # Greedy single pass: the first unassigned document seeds a cluster, every
  # remaining document is offered to it, and those that join are removed from
  # the work list. Repeats until no document is left.
  def cluster!
    to_do = @documents.dup
    @clusters = []
    until to_do.empty?
      cluster = Cluster.new(to_do.shift, @length)
      # Cluster#conj both tests and adds, so rejecting on its result leaves
      # exactly the documents that did not join. Array#- is not used here
      # because it compares by content (Document is a Hash) and would also
      # drop equal-looking documents that never joined the cluster.
      to_do = to_do.reject { |other| cluster.conj other }
      @clusters << cluster
      print "to go: #{to_do.length} \r"
      STDOUT.flush
    end
    puts
  end
end
# Words that Document#tokenize discards before stemming.
# The entries are kept exactly as they were (including spellings such as
# "amoungst" and "fify") because they are runtime data.
# Frozen so the shared list cannot be modified by accident.
STOPWORDS = %w[
  a about above across after afterwards again against all almost
  alone along already also although always am among amongst amoungst
  amount an and another any anyhow anyone anything anyway anywhere
  are around as at back be became because become becomes
  becoming been before beforehand behind being below beside besides between
  beyond bill both bottom but by call can cannot cant
  co computer con could couldnt cry de describe detail do
  done down due during each eg eight either eleven else
  elsewhere empty enough etc even ever every everyone everything everywhere
  except few fifteen fify fill find fire first five for
  former formerly forty found four from front full further get
  give go had has hasnt have he hence her here
  hereafter hereby herein hereupon hers herself him himself his how
  however hundred i ie if in inc indeed interest into
  is it its itself keep last latter latterly least less
  ltd made many may me meanwhile might mill mine more
  moreover most mostly move much must my myself name namely
  neither never nevertheless next nine no nobody none noone nor
  not nothing now nowhere of off often on once one
  only onto or other others otherwise our ours ourselves out
  over own part per perhaps please put rather re same
  see seem seemed seeming seems serious several she should show
  side since sincere six sixty so some somehow someone something
  sometime sometimes somewhere still such system take ten than that
  the their them themselves then thence there thereafter thereby therefore
  therein thereupon these they thick thin third this those though
  three through throughout thru thus to together too top toward
  towards twelve twenty two un under until up upon us
  very via was we well were what whatever when whence
  whenever where whereafter whereas whereby wherein whereupon wherever whether which
  while whither who whoever whole whom whose why will with
  within without would yet you your yours yourself yourselves
].freeze
# Command-line entry point: reads every comments/*.txt file next to this
# script, clusters the documents found in them, writes the clusters with more
# than two documents to out.html and prints summary counts.
if __FILE__ == $0
  puts "reading docs"
  docs = Dir[File.join(File.dirname(__FILE__), "comments", "*.txt")].map do |f|
    d = Iconv.iconv('ascii//translit//ignore', 'utf-8', File.read(f)).first
    # Each file holds several documents separated by an "N of N" marker.
    # String#split also returns the text matched by the capture group, so the
    # result alternates document, captured character, document, ... Keep only
    # the even positions; otherwise every marker adds a bogus one-character
    # document to the corpus.
    d.split(/([\dl])\s*of\s*\1/).each_slice(2).map(&:first)
  end.flatten
  puts "corpifying"
  c = Corpus.new docs
  # The template reads the local variable c through the binding passed to
  # ERB#result below.
  template = ERB.new <<-EOF
<style>
pre {width: 600px; white-space:pre-wrap}
hr {width: 600px}
</style>
<% c.clusters.each_with_index do |cluster, idx| %>
<% next if cluster.lonely? %>
<h1>Cluster <%= idx %></h1>
<% cluster.documents.each do |doc| %>
<hr>
<pre><%= doc.text%></pre>
<% end %>
<% end %>
EOF
  File.open("out.html", "w") { |f| f.write template.result(binding) }
  puts "Total docs in groups: #{c.clusters.reject(&:lonely?).reduce(0) { |memo, cluster| memo + cluster.length }}"
  puts "Total docs #{docs.length}"
  exit
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment