Last active
October 9, 2017 23:28
-
-
Save ksasao/5219c9912120aba9b71cc320a40d3d5d to your computer and use it in GitHub Desktop.
入力された語句に近い既知の語彙を返す
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 実行例
電気
=> 電気
照明
=> 電気
トイレ
=> 台所
パソコン
=> テレビ
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*-
#------------------------------------------------------------
# 入力された語句に近い既知の語彙を返す(@ksasao)
#------------------------------------------------------------
# pixiv小説の本文データ約7億文字から形態素解析した学習データを
# 利用しています。
# http://inside.pixiv.net/entry/2016/09/13/161454
#------------------------------------------------------------
# 使い方:
# 1. pip install gensim しておきます
# 2. https://github.com/pixiv/pixivnovel2vec/releases から
#    doc2vecで始まる 3つのファイルをダウンロードして同じフォルダに
#    置いてください
import gensim | |
import sys | |
# Raw command-line arguments (currently unused; see the disabled line below).
args = sys.argv

# Known vocabulary: the only words find_similar_word() will ever return.
# Disabled alternative that takes the vocabulary from the command line:
#known_words = args
known_words = [u'電気', u'台所', u'テレビ']

# Similarity threshold: a neighbour counts as "close" only when its score is
# strictly greater than this value.
limit = 0.8
def find_similar_word(word):
    """Return the known word closest in meaning to *word*, or "" if none.

    The lookup proceeds in three steps, cheapest first:

    1. If *word* is itself in ``known_words`` it is returned unchanged.
    2. If one of the model's nearest neighbours of *word* is a known word
       with a similarity above ``limit``, that known word is returned.
    3. Otherwise each known word is queried in turn; the first one that
       shares a neighbour with *word* (both similarities above ``limit``)
       is returned.

    Relies on the module-level ``model``, ``known_words`` and ``limit``.
    ``model.most_similar`` is expected to yield ``(word, similarity)`` pairs.

    Args:
        word: The input word to map onto the known vocabulary.

    Returns:
        A member of ``known_words``, or the empty string when nothing is
        similar enough or *word* is unknown to the model.
    """
    # Already part of the known vocabulary: no model query needed.
    if word in known_words:
        return word

    try:
        neighbours = model.most_similar(positive=[word])
    except KeyError:
        # Word is not in the model's vocabulary, so nothing can be matched.
        return ""

    # A known word appears directly among the input's close neighbours.
    # This is returned immediately; it must not live in a try whose
    # ``finally`` clause returns, or the result would be overridden.
    for candidate, score in neighbours:
        if score > limit and candidate in known_words:
            return candidate

    # Fallback (slow: one model query per known word): look for a neighbour
    # shared between the input and a known word. The input's close
    # neighbours do not change per known word, so collect them once.
    close_to_word = {
        candidate for candidate, score in neighbours if score > limit
    }
    if not close_to_word:
        return ""

    for known in known_words:
        try:
            known_neighbours = model.most_similar(positive=[known])
        except KeyError:
            # This known word is missing from the model; try the next one.
            continue
        if any(
            score > limit and candidate in close_to_word
            for candidate, score in known_neighbours
        ):
            return known

    return ""
# Load the pre-trained doc2vec model. The path is relative, so the doc2vec*
# files must be in the current working directory.
model = gensim.models.doc2vec.Doc2Vec.load('doc2vec.model')

# Interactive loop: read one word per line and print the matching known word
# (no match prints just "=> ").
while True:
    try:
        word = input()
    except EOFError:
        # End of input (Ctrl-D or an exhausted pipe): stop without a traceback.
        break
    result = find_similar_word(word)
    print("=> " + result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment