Created
November 13, 2019 12:08
-
-
Save satoryu/0183a4eba365cc67e28988a09f3035b3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'matrix'
require 'natto'
require 'tf-idf-similarity'
# Wraps one MeCab node and exposes the node's surface text together with a
# noun-only validity check.
class Token
  # Prefix a node's feature string must start with for the token to count
  # as valid. '名詞' is Japanese for "noun"; presumably this is the
  # part-of-speech field of the feature string (confirm against the MeCab
  # dictionary in use).
  NOUN_PREFIX = '名詞'.freeze

  # @param mecab_node [#feature, #surface] node object; only +feature+ and
  #   +surface+ are ever called on it.
  def initialize(mecab_node)
    @node = mecab_node
  end

  # @return [Boolean] true when the node's feature string starts with '名詞'.
  def valid?
    @node.feature.start_with?(NOUN_PREFIX)
  end

  # @return [String] the node's surface form.
  def to_s
    @node.surface
  end
end
# MeCab is a morphological analyzer for the Japanese language.
# Natto is the gem used to drive MeCab from Ruby.
#
# Turns a text into Token objects, one per node that MeCab yields.
class Tokenizer
  # Parses +text+ with MeCab and wraps every yielded node in a Token.
  # No filtering is done here; all nodes are returned.
  #
  # @param text [String] text to analyse
  # @return [Array<Token>] tokens in the order MeCab yields the nodes
  def tokenize(text)
    mecab.enum_parse(text).map { |node| Token.new(node) }
  end

  private

  # The MeCab instance is built on first use and then reused, so repeated
  # tokenize calls on the same Tokenizer do not construct a new one each time.
  def mecab
    @mecab ||= Natto::MeCab.new
  end
end
# Demo script: builds a TF-IDF model over three Japanese sample texts and
# prints the model's similarity matrix.
tokenizer = Tokenizer.new
# The same options hash is handed to every document, so all of them are
# tokenized by this one Tokenizer instance.
options = { tokenizer: tokenizer }

texts = [
  "友人がコンビニでポッキー特設売り場を発見して一言、「ポッキーの日は推しの為にあるんだよ」とのことなので2次創作用ポッキーやバーチャルポッキーの方が売れる説",
  "11月11日はポッキー&プリッツの日! ポッキー&\"プリッツ\"の日! 『プリッツ』の日!",
  "今日は #ポッキープリッツの日 (私はプリッツ派)ですが事務所にはなかったので同メーカー グリコさんの #ビスコ をおやつにいただきまんもす。いちご味あるん知らんかったんやけど、昔からある? ちなみに、ビスコを食べる時は幼い頃から2つに分けてから食べます。"
]

# Documents are created in the order the texts are listed.
corpus = texts.map { |text| TfIdfSimilarity::Document.new(text, options) }
model = TfIdfSimilarity::TfIdfModel.new(corpus)

# Presumably the pairwise similarity between the corpus documents; see the
# tf-idf-similarity documentation for the exact definition.
matrix = model.similarity_matrix
puts matrix
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment