Japanese keyphrase extraction with pke
import MeCab
import sys

raw_text_file = sys.argv[1]
preprocessed_file = sys.argv[2]

# preprocess for pke
m = MeCab.Tagger()
with open(raw_text_file) as src:
    with open(preprocessed_file, 'w') as dst:
        for line in src:
            node = m.parseToNode(line.strip())
            while node:
                if node.feature.startswith('BOS'):
                    dst.write('\n')
                else:
                    pos_tag = ','.join(node.feature.split(',')[:2])
                    dst.write('%s/%s ' % (node.surface, pos_tag))
                node = node.next
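For clarity, here is a minimal sketch (not part of the original gist) of the same surface/POS-pair conversion applied to a single in-memory sentence, so the preprocessed format that read_document(format='preprocessed', sep='/') consumes below is visible. The sample sentence is arbitrary, and the exact tags depend on the installed MeCab dictionary.

# -*- coding: utf-8 -*-
import MeCab

m = MeCab.Tagger()
tokens = []
node = m.parseToNode('今日はいい天気です。')
while node:
    if not node.feature.startswith('BOS'):
        # keep only the first two IPADIC feature fields, e.g. 名詞,一般
        tokens.append('%s/%s' % (node.surface, ','.join(node.feature.split(',')[:2])))
    node = node.next
# one sentence per line, tokens written as surface/POS and separated by spaces
print(' '.join(tokens))

The second script below feeds a file of such lines to pke's TopicRank.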
# -*- coding: utf-8 -*-
# to extract non-Latin words, you need to install pke from the following branch
# and set only_alphanum=False in candidate_selection()
# https://github.com/mocobeta/pke/tree/allow_not_alphanum_words
# install:
# $ python -V
# Python 2.7.13
# $ pip install git+https://github.com/mocobeta/pke.git@allow_not_alphanum_words
# this example uses TopicRank
from pke import TopicRank
import unicodedata
import sys

preprocessed_file = sys.argv[1]
N = 5

# create a TopicRank extractor and set the input language to None
extractor = TopicRank(input_file=preprocessed_file, language=None)

# load the content of the document, here in pre-processed format
extractor.read_document(format='preprocessed', stemmer=None, sep='/')

# select the keyphrase candidates; for TopicRank these are the longest
# sequences of the specified POS tags. here we use MeCab-IPADIC POS tags.
extractor.candidate_selection(pos=[u'名詞,一般', u'名詞,固有名詞', u'名詞,サ変接続', u'名詞,数', u'名詞,接尾'],
                              stoplist=[],
                              only_alphanum=False)

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
# linkage method
extractor.candidate_weighting(threshold=0.74,
                              method='average')

# print the N highest-scored candidates as "keyphrase<TAB>score" (Python 2:
# encode to UTF-8 and strip the spaces between tokens before printing)
for u, v in extractor.get_n_best(n=N):
    print('%s\t%f' % (unicodedata.normalize('NFKC', u).encode('utf-8').replace(' ', ''), v))
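For reference, an end-to-end run might look like the following, assuming the preprocessing script is saved as preprocess.py and this one as extract.py (the gist does not name its files); each of the N best keyphrases is printed as keyphrase<TAB>score.

# $ python preprocess.py raw_ja.txt preprocessed.txt
# $ python extract.py preprocessed.txt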