Skip to content

Instantly share code, notes, and snippets.

@mocobeta
Created October 10, 2017 15:45
Show Gist options
  • Save mocobeta/6b9b745ec1e67843a66160ce54102a8e to your computer and use it in GitHub Desktop.
Japanese keyphrase extraction by pke
# Preprocess raw Japanese text into the "surface/POS " token format that pke's
# 'preprocessed' reader expects, one output line per input line.
#
# Usage: python preprocess.py <raw_text_file> <preprocessed_file>
import MeCab
import sys

raw_text_file = sys.argv[1]
preprocessed_file = sys.argv[2]

# MeCab tagger with the default dictionary (IPADIC here, per the POS tags
# used by the extraction script).
m = MeCab.Tagger()

with open(raw_text_file) as src, open(preprocessed_file, 'w') as dst:
    for line in src:
        node = m.parseToNode(line.strip())
        while node:
            # BOS/EOS are sentence-boundary pseudo-nodes with an empty
            # surface; emit a newline instead so each input line becomes
            # exactly one line of the preprocessed output.
            if node.feature.startswith('BOS'):
                dst.write('\n')
            else:
                # Keep only the first two comma-separated POS fields of the
                # feature string, e.g. "名詞,一般".
                pos_tag = ','.join(node.feature.split(',')[:2])
                dst.write('%s/%s ' % (node.surface, pos_tag))
            node = node.next
# Extract the top-N keyphrases from a preprocessed Japanese document using
# pke's TopicRank.
#
# To extract non-latin words, install pke from the following branch and set
# only_alphanum=False in candidate_selection():
#   https://github.com/mocobeta/pke/tree/allow_not_alphanum_words
#
# Install (tested with Python 2.7.13):
#   $ pip install git+https://github.com/mocobeta/pke.git@allow_not_alphanum_words
#
# Usage: python extract.py <preprocessed_file>
from pke import TopicRank
import unicodedata
import sys

preprocessed_file = sys.argv[1]
N = 5  # number of top-scored keyphrases to print

# create a TopicRank extractor and set the input language to None
# (disables pke's built-in language-specific handling)
extractor = TopicRank(input_file=preprocessed_file, language=None)

# load the content of the document, here in pre-processed format where each
# token is "surface/POS" (sep must match the preprocessing script)
extractor.read_document(format='preprocessed', stemmer=None, sep='/')

# select the keyphrase candidates: for TopicRank, the longest sequences of
# the specified POS tags. here, we use MeCab-IPADIC pos tags.
extractor.candidate_selection(pos=[u'名詞,一般', u'名詞,固有名詞', u'名詞,サ変接続', u'名詞,数', u'名詞,接尾'],
                              stoplist=[],
                              only_alphanum=False)

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
# linkage method
extractor.candidate_weighting(threshold=0.74,
                              method='average')

# print the n-highest scored candidates, NFKC-normalized with spaces removed.
# NOTE(review): encode(...).replace(' ', '') is a Python 2 idiom — on
# Python 3, bytes.replace would require bytes arguments and this line would
# raise TypeError; the gist targets Python 2.7.13 (see install notes above).
for u, v in extractor.get_n_best(n=N):
    print('%s\t%f' % (unicodedata.normalize('NFKC', u).encode('utf-8').replace(' ', ''), v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment