Skip to content

Instantly share code, notes, and snippets.

@mocobeta
Created October 10, 2017 15:45
Show Gist options
  • Save mocobeta/6b9b745ec1e67843a66160ce54102a8e to your computer and use it in GitHub Desktop.
Japanese keyphrase extraction by pke
# Preprocess raw Japanese text into the "surface/POS " token format that pke's
# 'preprocessed' reader expects, one output line per input line.
#
# Usage: python preprocess.py <raw_text_file> <preprocessed_file>
import MeCab
import sys

raw_text_file = sys.argv[1]
preprocessed_file = sys.argv[2]

# MeCab tagger with the default dictionary (IPADIC here, per the POS tags
# used by the extraction script).
m = MeCab.Tagger()

with open(raw_text_file) as src, open(preprocessed_file, 'w') as dst:
    for line in src:
        node = m.parseToNode(line.strip())
        while node:
            # BOS/EOS are sentence-boundary pseudo-nodes with an empty
            # surface; emit a newline instead so each input line becomes
            # exactly one line of the preprocessed output.
            if node.feature.startswith('BOS'):
                dst.write('\n')
            else:
                # Keep only the first two comma-separated POS fields of the
                # feature string, e.g. "名詞,一般".
                pos_tag = ','.join(node.feature.split(',')[:2])
                dst.write('%s/%s ' % (node.surface, pos_tag))
            node = node.next
# Extract the top-N keyphrases from a preprocessed Japanese document using
# pke's TopicRank.
#
# To extract non-latin words, install pke from the following branch and set
# only_alphanum=False in candidate_selection():
#   https://github.com/mocobeta/pke/tree/allow_not_alphanum_words
#
# Install (tested with Python 2.7.13):
#   $ pip install git+https://github.com/mocobeta/pke.git@allow_not_alphanum_words
#
# Usage: python extract.py <preprocessed_file>
from pke import TopicRank
import unicodedata
import sys

preprocessed_file = sys.argv[1]
N = 5  # number of top-scored keyphrases to print

# create a TopicRank extractor and set the input language to None
# (disables pke's built-in language-specific handling)
extractor = TopicRank(input_file=preprocessed_file, language=None)

# load the content of the document, here in pre-processed format where each
# token is "surface/POS" (sep must match the preprocessing script)
extractor.read_document(format='preprocessed', stemmer=None, sep='/')

# select the keyphrase candidates: for TopicRank, the longest sequences of
# the specified POS tags. here, we use MeCab-IPADIC pos tags.
extractor.candidate_selection(pos=[u'名詞,一般', u'名詞,固有名詞', u'名詞,サ変接続', u'名詞,数', u'名詞,接尾'],
                              stoplist=[],
                              only_alphanum=False)

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
# linkage method
extractor.candidate_weighting(threshold=0.74,
                              method='average')

# print the n-highest scored candidates, NFKC-normalized with spaces removed.
# NOTE(review): encode(...).replace(' ', '') is a Python 2 idiom — on
# Python 3, bytes.replace would require bytes arguments and this line would
# raise TypeError; the gist targets Python 2.7.13 (see install notes above).
for u, v in extractor.get_n_best(n=N):
    print('%s\t%f' % (unicodedata.normalize('NFKC', u).encode('utf-8').replace(' ', ''), v))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment