Created
October 4, 2018 08:08
-
-
Save infinityfuture/67d70f5e90048b347854a2033cbe367a to your computer and use it in GitHub Desktop.
TextRank extract keywords using word2vec as similarity
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
References:
http://www.hankcs.com/nlp/textrank-algorithm-to-extract-the-keywords-java-implementation.html
http://www.hankcs.com/nlp/textrank-algorithm-java-implementation-of-automatic-abstract.html
Chinese embeddings from:
https://github.com/Embedding/Chinese-Word-Vectors
""" | |
import gensim | |
import numpy as np | |
def similar(word_a, word_b, model):
    """Return the similarity between two words according to ``model``.

    Args:
        word_a: First word.
        word_b: Second word.
        model: Any object exposing ``similarity(word_a, word_b)``; the script
            passes a gensim ``KeyedVectors`` instance.

    Returns:
        Whatever ``model.similarity`` returns for the pair.
    """
    return model.similarity(word_a, word_b)
def new_ws(i, word_i, ws, word_near, model, d=0.85):
    """Compute one TextRank update for a single word.

    Implements
    ``WS(i) = (1 - d) + d * sum_j( w_ji / sum_k(w_jk) * WS(j) )``
    where ``j`` ranges over the neighbours of ``word_i``, ``k`` over the
    neighbours of ``j``, and edge weights are ``model.similarity`` values.

    Args:
        i: Index of ``word_i`` in the caller's word list. Unused; kept so the
            call signature stays the same.
        word_i: The word whose score is being updated.
        ws: Current scores, indexable by integer position. Positions must
            follow ``sorted(word_near)``, i.e. ``ws[n]`` is the score of the
            n-th key of ``word_near`` in sorted order.
        word_near: Mapping from each word to the set of its neighbours.
        model: Object exposing ``similarity(word_a, word_b)``.
        d: Damping factor.

    Returns:
        The updated score for ``word_i``.
    """
    # Neighbours are stored in unordered sets, so an enumerate() counter over
    # them says nothing about where a neighbour sits in ``ws``. Resolve each
    # neighbour's position by word instead.
    position = {word: idx for idx, word in enumerate(sorted(word_near))}

    s = 0
    for word_j in word_near[word_i]:
        if word_j == word_i:
            continue
        w_j_i = model.similarity(word_j, word_i)
        # Total outgoing edge weight of word_j, used to normalise w_j_i.
        w_sum_j_k = sum(
            model.similarity(word_j, word_k)
            for word_k in word_near[word_j]
            if word_k != word_j
        )
        if w_sum_j_k == 0:
            # No usable outgoing weight: this neighbour contributes nothing,
            # and skipping it avoids a division by zero.
            continue
        s += w_j_i / w_sum_j_k * ws[position[word_j]]
    return (1 - d) + d * s
# Load pre-trained word vectors stored in word2vec text (non-binary) format.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'sgns.baidubaike.bigram-char', binary=False)

# Sample input: an already-segmented token sequence.
words = [
    '程序员',
    '英文',
    '程序',
    '开发',
    '维护',
    '专业',
    '人员',
    '程序员',
    '分为',
    '程序',
    '设计',
    '人员',
    '程序',
    '编码',
    '人员',
    '界限',
    '特别',
    '中国',
    '软件',
    '人员',
    '分为',
    '程序员',
    '高级',
    '程序员',
    '系统',
    '分析员',
    '项目',
    '经理'
]

# Build the co-occurrence graph: each distinct word maps to the set of
# different words seen within `window` positions on either side of it.
window = 5
word_near = {}
for pos, word in enumerate(words):
    neighbours = word_near.setdefault(word, set())
    start_ind = max(pos - window, 0)
    # +1 because the slice end is exclusive; without it the window would
    # reach 5 tokens back but only 4 forward, making the graph asymmetric.
    end_ind = min(pos + window + 1, len(words))
    for other in words[start_ind:end_ind]:
        if other != word:
            neighbours.add(other)

# From here on `words` is the sorted vocabulary; `weight[n]` is the score of
# `words[n]`.
words = sorted(word_near)
weight = np.ones(len(words))
max_iter = 200
tol = 1e-3
for _ in range(max_iter):
    new_weight = np.array([
        new_ws(idx, word_i, weight, word_near, model)
        for idx, word_i in enumerate(words)
    ])
    # Stop once the squared change between successive iterates is below tol.
    converged = np.sum((weight - new_weight) ** 2) < tol
    # Keep the newest estimate even on the converging step.
    weight = new_weight
    if converged:
        break

# Print (word, score) pairs, highest score first.
print(sorted(zip(words, weight), key=lambda x: x[1], reverse=True))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment