@ikegami-yukino
Forked from nakagami/jabstract.py
Created December 14, 2018 17:20
Japanese summarization module using the LexRank algorithm.
#!/usr/bin/env python
# The MIT License (MIT)
# Copyright © 2015 Recruit Technologies Co.,Ltd.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import collections
import re
import numpy as np
import networkx as nx
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import pairwise_distances
from janome.tokenizer import Tokenizer
__doc__ = """Japanese summarization module using LexRank algorithm.
This module was reconsturcted from summpy.
- https://pypi.python.org/pypi/summpy/
- https://github.com/recruit-tech/summpy
- https://recruit-tech.co.jp/blog/2015/10/30/summpy-released/
Requirements
- numpy
- networkx
- scipy
- scikit-learn
- janome
"""
__version__ = "0.1.2"
__author__ = "Shumpei IINUMA"
__maintainer__ = "Hajime Nakagami<[email protected]>"
__all__ = ['summarize']
tokenizer = Tokenizer()

def word_splitter_ja(sent):
    def _is_stopword(n):
        if len(n.surface) == 0:
            return True
        elif re.search(r'^[\s!-@\[-`\{-~ 、-〜!-@[-`]+$', n.surface):
            return True
        elif re.search(r'^(接尾|非自立)', n.part_of_speech.split(',')[1]):
            return True
        elif 'サ変・スル' == n.infl_form or u'ある' == n.base_form:
            return True
        elif re.search(r'^(名詞|動詞|形容詞)', n.part_of_speech.split(',')[0]):
            return False
        else:
            return True
    return [n.base_form for n in tokenizer.tokenize(sent) if not _is_stopword(n)]
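
# Illustrative usage of word_splitter_ja (a sketch; the exact tokens depend on
# the dictionary bundled with the installed Janome version):
#   word_splitter_ja('私の名前は飯沼です。')
#   -> roughly ['私', '名前', '飯沼']: base forms of content words, with
#      particles, punctuation, and other stopwords filtered out.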

def sent_splitter_ja(text, delimiters=set('。.?!\n\r'), parenthesis='()「」『』“”'):
    '''
    Args:
        text: string that contains multiple Japanese sentences.
        delimiters: set() of sentence delimiter characters.
        parenthesis: string of paired bracket characters; delimiters inside an
            unclosed pair do not end a sentence.
    Returns:
        list of sentences.
    '''
    paren_chars = set(parenthesis)
    close2open = dict(zip(parenthesis[1::2], parenthesis[0::2]))
    sentences = []
    pstack = []
    buff = []
    for i, c in enumerate(text):
        c_next = text[i+1] if i+1 < len(text) else None
        # check correspondence of parenthesis
        if c in paren_chars:
            if c in close2open:  # close
                if len(pstack) > 0 and pstack[-1] == close2open[c]:
                    pstack.pop()
            else:  # open
                pstack.append(c)
        buff.append(c)
        if c in delimiters:
            if len(pstack) == 0 and c_next not in delimiters:
                s = ''.join(buff).strip()
                if s:
                    sentences.append(s)
                buff = []
    if len(buff) > 0:
        s = ''.join(buff).strip()
        if s:
            sentences.append(s)
    return sentences
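
# Illustrative usage of sent_splitter_ja (deterministic given the default
# delimiters; a delimiter inside an unclosed bracket pair does not end a sentence):
#   sent_splitter_ja('今日は晴れ。「明日は雨。」と彼は言った。')
#   -> ['今日は晴れ。', '「明日は雨。」と彼は言った。']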

def lexrank(
    sentences, continuous=False, word_splitter=word_splitter_ja, sim_threshold=0.1, alpha=0.9
):
    '''
    Compute the centrality score of each sentence.
    Args:
        sentences: [u'こんにちは.', u'私の名前は飯沼です.', ... ]
        continuous: if True, apply continuous LexRank. (see reference)
        word_splitter: function that splits a sentence into words
        sim_threshold: if continuous is False and the similarity is greater
            than or equal to sim_threshold, link the sentences.
        alpha: the damping factor of PageRank
    Returns: tuple
        (
            {
                # sentence index -> score
                0: 0.003,
                1: 0.002,
                ...
            },
            similarity_matrix
        )
    Reference:
        Günes Erkan and Dragomir R. Radev.
        LexRank: graph-based lexical centrality as salience in text
        summarization. (section 3)
        http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
    '''
    # configure ranker
    ranker_params = {'max_iter': 1000}
    ranker = nx.pagerank_scipy
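    # NOTE: nx.pagerank_scipy was deprecated in NetworkX 2.6 and removed in 3.0;
    # on newer NetworkX releases, nx.pagerank is the drop-in replacement here.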
    ranker_params['alpha'] = alpha
    graph = nx.DiGraph()
    # sentence -> tf
    sent_tf_list = []
    for sent in sentences:
        words = word_splitter(sent)
        tf = collections.Counter(words)
        sent_tf_list.append(tf)
    sent_vectorizer = DictVectorizer(sparse=True)
    sent_vecs = sent_vectorizer.fit_transform(sent_tf_list)
    # compute similarities between sentences
    sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric='cosine')
    linked_rows, linked_cols = np.where(
        sim_mat > 0 if continuous else sim_mat >= sim_threshold
    )
    # create similarity graph
    graph.add_nodes_from(range(sent_vecs.shape[0]))
    for i, j in zip(linked_rows, linked_cols):
        if i != j:
            weight = sim_mat[i, j] if continuous else 1.0
            graph.add_edge(i, j, weight=weight)
    scores = ranker(graph, **ranker_params)
    return scores, sim_mat
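
# Illustrative direct use of lexrank (a sketch: the example sentences are made up
# and the exact scores depend on the Janome dictionary, so only the result shape
# is shown):
#   scores, sim_mat = lexrank(['私は犬が好きだ。', '私は猫も好きだ。', '今日は晴れだ。'])
#   scores   -> {0: ..., 1: ..., 2: ...}   # sentence index -> centrality score
#   sim_mat  -> 3x3 numpy array of pairwise cosine similarities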

def summarize(sentences, sent_limit=None, char_limit=None, imp_require=None, **lexrank_params):
    '''
    Args:
        sentences: text to be summarized, or a list of sentences
        sent_limit: summary length (the maximum number of sentences)
        char_limit: summary length (the maximum number of characters)
        imp_require: stop once the selected sentences cover at least this
            fraction of the cumulative LexRank score [0.0-1.0]
    Returns:
        list of extracted sentences
    '''
    if isinstance(sentences, str):
        sentences = sent_splitter_ja(sentences)
    scores, sim_mat = lexrank(sentences, **lexrank_params)
    sum_scores = sum(scores.values())
    acc_scores = 0.0
    indexes = set()
    num_sent, num_char = 0, 0
    for i in sorted(scores, key=lambda i: scores[i], reverse=True):
        num_sent += 1
        num_char += len(sentences[i])
        if sent_limit is not None and num_sent > sent_limit:
            break
        if char_limit is not None and num_char > char_limit:
            break
        if imp_require is not None and acc_scores / sum_scores >= imp_require:
            break
        indexes.add(i)
        acc_scores += scores[i]
    return [sentences[i] for i in sorted(indexes)]
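
# Illustrative usage of summarize (a sketch; the selected sentences depend on the
# LexRank scores, which in turn depend on the installed Janome dictionary; `text`
# below stands for any multi-sentence Japanese string):
#   summary = summarize(text, sent_limit=3)      # keep at most 3 sentences
#   summary = summarize(text, char_limit=200)    # stay within about 200 characters
#   summary = summarize(text, imp_require=0.5)   # stop at ~50% of the total score
#   print('\n'.join(summary))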

if __name__ == '__main__':
    def _get_bocchan_text():
        import io
        import requests
        import zipfile
        r = requests.get('https://www.aozora.gr.jp/cards/000148/files/752_ruby_2438.zip')
        f = zipfile.ZipFile(io.BytesIO(r.content)).open('bocchan.txt')
        text = f.read().decode('cp932')
        # strip Aozora Bunko ruby and editorial annotations, then drop the 底本 colophon
        text = re.sub(r'《[^》]+》', '', text)
        text = re.sub(r'|', '', text)
        text = re.sub(r'[.+?]', '', text)
        text = re.sub(r'-----[\s\S]*-----', '', text)
        text = re.split('底本:', text)[0]
        return text

    text = _get_bocchan_text()
    result = summarize(text, char_limit=400)
    print('\n\nsummarize(char_limit=400)')
    print('\n'.join(result))
    result = summarize(text, sent_limit=10)
    print('\n\nsummarize(sent_limit=10)')
    print('\n'.join(result))
    result = summarize(text, char_limit=400, continuous=True)
    print('\n\nsummarize(char_limit=400, continuous=True)')
    print('\n'.join(result))
    result = summarize(text, sent_limit=10, continuous=True)
    print('\n\nsummarize(sent_limit=10, continuous=True)')
    print('\n'.join(result))