Created
December 13, 2016 06:51
-
-
Save nownabe/5d39338f1bf950fe7facb86f82fa8e34 to your computer and use it in GitHub Desktop.
ruby-mecab-kmeans
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "kmeans-clusterer" | |
require "natto" | |
# Process-wide vocabulary that maps each distinct word to a dense, zero-based
# integer index, assigned in first-seen order.
class WordDic
  class << self
    # Returns the index registered for +word+, registering it on first sight.
    #
    # A new word receives the current dictionary size as its index, which is
    # exactly the next unused integer (0, 1, 2, ...). This replaces a memoized
    # Fiber-based counter: a Fiber can only be resumed from the thread that
    # created it, so the old counter raised FiberError when a new word was
    # looked up from any other thread.
    #
    # @param word [Object] dictionary key (the script passes String base forms)
    # @return [Integer] the index of +word+
    def [](word)
      dictionary[word] ||= dictionary.size
    end

    # @return [Integer] number of distinct words registered so far
    def size
      dictionary.size
    end

    private

    # Lazily created word => index table stored on the class itself.
    def dictionary
      @dictionary ||= {}
    end
  end
end
# Facade over a single, lazily created Natto::MeCab instance shared by all
# callers.
class Parser
  # Runs MeCab over +text+ and materializes everything enum_parse yields.
  #
  # @param text [String] the text to analyze
  # @return [Array] the items produced by Natto::MeCab#enum_parse, in order
  def self.parse(text)
    nodes = mecab.enum_parse(text)
    nodes.to_a
  end

  # Builds the MeCab instance on first use and reuses it afterwards.
  def self.mecab
    @mecab ||= Natto::MeCab.new
  end
  private_class_method :mecab
end
# One feedback sentence together with its bag-of-words vector: a 0/1 array
# indexed by WordDic ids, covering the nouns and verbs found in the sentence.
class Sentence
  # Parts of speech that contribute to the vector (noun, verb).
  TARGET_POS = %w[名詞 動詞].freeze
  # Position of the base form in the comma-separated feature string.
  # NOTE(review): assumes an IPADIC-style feature layout (POS first, base form
  # seventh), as the original index 6 implies — confirm for other dictionaries.
  BASE_FORM_INDEX = 6

  attr_reader :sentence, :vector

  # @param sentence [String] raw sentence text; it is parsed immediately
  def initialize(sentence)
    @sentence = sentence
    @vector = generate_vector
  end

  # Pads the vector in place to at least +size+ entries and replaces every
  # unset (nil) slot with 0.
  #
  # The previous implementation assigned index +size+ itself, which produced
  # size + 1 entries and overwrote a real entry whenever the vector was already
  # longer than +size+. Existing entries are now never touched.
  #
  # @param size [Integer] desired minimum length (the vocabulary size)
  # @return [Array<Integer>] the padded vector
  def format(size)
    missing = size - @vector.size
    @vector.concat(Array.new(missing, 0)) if missing > 0
    @vector.map! { |e| e || 0 }
  end

  private

  # Builds a sparse array with 1 at the WordDic index of every noun/verb base
  # form in the sentence; untouched slots stay nil until #format is called.
  def generate_vector
    content_nodes = Parser.parse(sentence).select { |node| content_word?(node) }
    content_nodes.each_with_object([]) do |node, vector|
      vector[WordDic[base_form(node)]] = 1
    end
  end

  # True when the node's primary part of speech is a noun or a verb.
  # Only the first feature field is compared: matching the pattern against the
  # whole feature string also accepted 助動詞 (auxiliary verbs) and any node
  # whose later fields merely mention 名詞/動詞 (e.g. 名詞接続).
  def content_word?(node)
    TARGET_POS.include?(node.feature.to_s.split(",").first)
  end

  # Dictionary key for a node: its base form, or the surface form when the
  # base form is absent or "*". Without the fallback, every such word would
  # share one dictionary entry.
  def base_form(node)
    origin = node.feature.to_s.split(",")[BASE_FORM_INDEX]
    origin.nil? || origin == "*" ? node.surface : origin
  end
end
# Clusters the feedback lines in feedbacks.txt with k-means and prints the
# members of each cluster.
#
# Usage: ruby SCRIPT NUMBER_OF_CLUSTERS

# Parse the cluster count strictly. ARGV[0].to_i silently turned a missing or
# non-numeric argument into 0 clusters instead of reporting the mistake.
cluster_count =
  begin
    Integer(ARGV.fetch(0), 10)
  rescue IndexError, ArgumentError
    0
  end
abort "usage: #{$PROGRAM_NAME} NUMBER_OF_CLUSTERS (positive integer)" unless cluster_count > 0

lines = File.read("feedbacks.txt").lines

# Remove English feedbacks: lines made up solely of word characters,
# whitespace, parentheses and commas.
lines.reject! do |line|
  line =~ /^[\s\w\(\),]+$/
end

# Strip noise from the remaining lines.
lines.map! do |line|
  line.chomp
      .gsub(%r{https?://[\w/\.-]+}, "") # Remove URLs
      .gsub(/topiv/, "") # Remove "topiv"
      .gsub(/[\((](\s+)?[\))]/, "") # Remove empty brackets
end

# Vectorize every sentence first so the vocabulary is complete, then pad all
# vectors to the final vocabulary size.
parsed_sentences = lines.map { |l| Sentence.new(l) }
parsed_sentences.each { |s| s.format(WordDic.size) }

data = parsed_sentences.map(&:vector)
labels = parsed_sentences.map(&:sentence)

kmeans = KMeansClusterer.run(cluster_count, data, labels: labels, runs: 10)

kmeans.clusters.each do |cluster|
  puts "================"
  puts "CLUSTER: #{cluster.id}"
  puts cluster.points.map(&:label)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment