Skip to content

Instantly share code, notes, and snippets.

@nownabe
Created December 13, 2016 06:51
Show Gist options
  • Save nownabe/5d39338f1bf950fe7facb86f82fa8e34 to your computer and use it in GitHub Desktop.
Save nownabe/5d39338f1bf950fe7facb86f82fa8e34 to your computer and use it in GitHub Desktop.
ruby-mecab-kmeans
require "kmeans-clusterer"
require "natto"
# Process-wide vocabulary that maps each distinct word to a zero-based
# integer index, assigned in order of first lookup.
class WordDic
  class << self
    # Returns the index of +word+, registering the word on first lookup.
    #
    # Indexes are handed out sequentially (0, 1, 2, ...), so the next free
    # index is always the number of words registered so far. Reading it from
    # the table avoids a separate stateful generator; a Fiber-based counter,
    # for instance, can only be resumed from the thread that first used it.
    #
    # @param word [Object] dictionary key (the script passes Strings)
    # @return [Integer] the index assigned to +word+
    def [](word)
      # The right-hand side is evaluated before the entry is stored, so
      # +dictionary.size+ is still the count *without* +word+.
      dictionary[word] ||= dictionary.size
    end

    # @return [Integer] number of distinct words registered so far
    def size
      dictionary.size
    end

    private

    # Lazily created word => index table.
    def dictionary
      @dictionary ||= {}
    end
  end
end
# Thin wrapper around a single, shared Natto::MeCab instance.
class Parser
  # Runs MeCab over +text+ and returns the result as an Array.
  #
  # @param text [String] text to analyse
  # @return [Array] every element produced by Natto::MeCab#enum_parse
  def self.parse(text)
    node_enum = tagger.enum_parse(text)
    node_enum.to_a
  end

  # Creates the MeCab instance on first use and reuses it afterwards.
  def self.tagger
    @tagger ||= Natto::MeCab.new
  end
  private_class_method :tagger
end
# One feedback sentence together with its bag-of-words vector.
#
# The vector is indexed by WordDic index: position i holds 1 when the word
# with index i occurs in the sentence. Positions that were never written are
# nil until #format is called.
class Sentence
  # Top-level parts of speech kept as features (noun, verb).
  CONTENT_POS = %w[名詞 動詞].freeze
  # Position of the base form inside the comma-separated feature string.
  # NOTE(review): assumes an IPADIC-style feature layout - confirm against
  # the dictionary MeCab is configured with.
  BASE_FORM_FIELD = 6
  # Placeholder treated as "no base form available".
  # NOTE(review): assumed from IPADIC conventions for unknown words - confirm.
  UNKNOWN_BASE_FORM = "*"

  attr_reader :sentence, :vector

  # @param sentence [String] text to analyse; parsed immediately
  def initialize(sentence)
    @sentence = sentence
    @vector = generate_vector
  end

  # Pads the vector in place so it has at least +size+ entries and replaces
  # every unset (nil) entry with 0.
  #
  # Existing entries are never overwritten and no extra trailing element is
  # added: a dictionary of +size+ words needs indexes 0...size only.
  #
  # @param size [Integer] required number of dimensions
  # @return [Array<Integer>] the formatted vector (same object as #vector)
  def format(size)
    missing = size - @vector.size
    @vector.concat(Array.new(missing, 0)) if missing > 0
    @vector.map! { |entry| entry || 0 }
  end

  private

  # Builds the sparse vector: 1 at the WordDic index of every noun or verb.
  def generate_vector
    content_nodes = Parser.parse(sentence).select { |node| content_word?(node) }
    content_nodes.each_with_object([]) do |node, vector|
      vector[WordDic[base_form(node)]] = 1
    end
  end

  # True when the node's top-level part of speech (first feature field) is a
  # noun or a verb. Comparing the whole first field keeps out auxiliary verbs
  # (助動詞) and other entries that merely contain those characters.
  def content_word?(node)
    CONTENT_POS.include?(node.feature.split(",", 2).first)
  end

  # Dictionary key for a node: its base form, or the surface text when the
  # base form is absent or the placeholder, so that such words do not all
  # collapse into a single dictionary entry.
  def base_form(node)
    origin = node.feature.split(",")[BASE_FORM_FIELD]
    return node.surface if origin.nil? || origin == UNKNOWN_BASE_FORM

    origin
  end
end
# Usage: pass the desired number of clusters as the first argument.
#
# Validate it before doing any work: String#to_i (and nil.to_i) turn a
# missing or non-numeric argument into 0, which is not a usable cluster count.
cluster_count = ARGV[0].to_i
abort "Usage: #{$PROGRAM_NAME} NUMBER_OF_CLUSTERS" unless cluster_count > 0

raw_lines = File.readlines("feedbacks.txt")

# Remove English feedbacks: lines made up solely of ASCII word characters,
# whitespace, parentheses and commas.
japanese_lines = raw_lines.reject { |line| line =~ /^[\s\w\(\),]+$/ }

# Clean up the remaining feedbacks.
cleaned_lines = japanese_lines.map do |line|
  line.chomp
      .gsub(%r{https?://[\w/\.-]+}, "") # Remove URLs
      .gsub("topiv", "")                # Remove "topiv"
      .gsub(/\(\s*\)/, "")              # Remove empty brackets
end

parsed_sentences = cleaned_lines.map { |text| Sentence.new(text) }

# Every sentence must be parsed before padding: the dictionary keeps growing
# while sentences are created, and all vectors need the final dimension count.
parsed_sentences.each { |s| s.format(WordDic.size) }

data = parsed_sentences.map(&:vector)
labels = parsed_sentences.map(&:sentence)

kmeans = KMeansClusterer.run(cluster_count, data, labels: labels, runs: 10)

# Print the sentences of each cluster under a header with the cluster id.
kmeans.clusters.each do |cluster|
  puts "================"
  puts "CLUSTER: #{cluster.id}"
  puts cluster.points.map(&:label)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment