Last active
February 6, 2018 14:56
-
-
Save nkt1546789/5c52160f210df3c7724c to your computer and use it in GitHub Desktop.
Basic summarization model in Python.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
| import MeCab | |
| import numpy as np | |
# Module-level MeCab tagger used by sent_tokenize() to walk the token nodes
# of a text. "-Ochasen" selects MeCab's ChaSen-compatible output format.
m = MeCab.Tagger("-Ochasen")
def sent_tokenize(text):
    """Split Japanese text into sentences, each a list of MeCab tokens.

    text: the text to split; a unicode object is encoded to UTF-8 before
        being handed to the module-level MeCab tagger ``m``.

    Returns a list of sentences, where every sentence is a list of unicode
    token surfaces. A sentence ends at the full stop token "。" (which is
    kept as the last token of the sentence). Any tokens remaining after the
    last full stop are returned as a final sentence instead of being lost.
    """
    # isinstance (rather than an exact type check) also accepts unicode
    # subclasses.
    if isinstance(text, unicode):
        text = text.encode("utf8")
    node = m.parseToNode(text)
    sentences = []
    sentence = []
    while node:
        surface = node.surface
        # Skip nodes with an empty surface (e.g. MeCab's BOS/EOS sentinel
        # nodes): they are not words and would otherwise be counted as an
        # empty-string token in the frequency statistics.
        if surface:
            sentence.append(unicode(surface, "utf8"))
            if surface == "。":
                sentences.append(sentence)
                sentence = []
        node = node.next
    # Keep trailing text that is not terminated by a full stop.
    if sentence:
        sentences.append(sentence)
    return sentences
def get_freqdict(sentences):
    """Count how often each word occurs across all sentences.

    sentences: iterable of sentences, each an iterable of hashable words.

    Returns a dict mapping every word to its total number of occurrences.
    Counts are stored as floats (1.0, 2.0, ...).
    """
    freqdict = {}
    for sentence in sentences:
        for word in sentence:
            # Start from 0.0 so the stored counts are floats.
            freqdict[word] = freqdict.get(word, 0.0) + 1
    return freqdict
def score(sentence, freqdict):
    """Return the mean log-frequency of the words in a sentence.

    sentence: sequence of words.
    freqdict: mapping from word to its (positive) frequency; every word of
        the sentence must be present, otherwise KeyError is raised.

    Returns sum(log(freqdict[word])) / len(sentence), using the natural
    logarithm. An empty sentence scores 0.0 instead of producing NaN from a
    division by zero.
    """
    if not sentence:
        return 0.0
    return np.sum([np.log(freqdict[word]) for word in sentence]) / len(sentence)
def direct_proportion(i, n):
    """Direct proportion (DP) feature: (n - i + 1) / n as a float.

    i: position of the current occurrence (summarize() passes the running
        occurrence count of a word, starting at 1).
    n: total count (summarize() passes the word's overall frequency).

    The value is 1.0 for i == 1 and decreases linearly to 1/n for i == n.
    """
    remaining = float(n - i + 1)
    return remaining / n
def inverse_proportion(i, n):
    """Inverse proportion (IP) feature: the reciprocal of i.

    i: position of the current occurrence (must be non-zero).
    n: accepted so that all feature functions share the same signature;
        it is not used here.
    """
    reciprocal = 1.0 / i
    return reciprocal
def geometric_sequence(i, n):
    """Geometric sequence (GS) feature: 0.5 raised to the power (i - 1).

    i: position of the current occurrence; i == 1 yields 1.0 and each
        further occurrence halves the value.
    n: accepted so that all feature functions share the same signature;
        it is not used here.
    """
    exponent = i - 1
    return 0.5 ** exponent
def inverse_entropy(p):
    """Return 1 minus the binary entropy of p (natural logarithm).

    p: a probability in [0, 1].

    The binary entropy is -p*log(p) - (1-p)*log(1-p). At the end points
    p == 0.0 and p == 1.0 the entropy is defined as 0, so 1.0 is returned
    directly; evaluating the formula there would compute 0 * log(0) and
    yield NaN. The minimum, 1 - log(2), is reached at p == 0.5.
    """
    # Both end points must be tested explicitly: `p == 1.0 or 0.0` only
    # ever checks the first one because the bare 0.0 is always falsy.
    if p == 0.0 or p == 1.0:
        return 1.0
    return 1 - (-p * np.log(p) - (1 - p) * np.log(1 - p))
def inverse_entropy_proportion(i, n):
    """Inverse entropy feature: inverse_entropy() of the ratio i / n.

    i: position of the current occurrence.
    n: total count (must be non-zero).
    """
    # Force true division: with two ints, `i / n` would truncate to 0 or 1
    # under Python 2 and the feature would always evaluate to 1.0.
    p = float(i) / n
    return inverse_entropy(p)
def summarize(text, limit=100, **options):
    """Build an extractive summary from the highest-scoring sentences.

    text: target text
    limit: summary length limit, counted in tokens (the sum of the token
        counts of the selected sentences)
    options:
        m: summarization mode (default 0)
            0: basic summarization model (mean log word frequency)
            1: using word position feature
        f: feature function, used when m == 1 (default 0)
            0: direct proportion (DP)
            1: inverse proportion (IP)
            2: Geometric sequence (GS)
            3: Binary function (BF) -- not implemented, raises ValueError
            4: Inverse entropy

    Returns the selected sentences concatenated in their original order.
    Sentences are taken in descending score order until adding the next one
    would exceed `limit`; selection stops at that point.

    Raises ValueError when `m` or `f` names an unsupported mode/function.
    """
    mode = options.get("m", 0)
    if mode == 0:
        word_features = None
    elif mode == 1:
        feature_functions = {
            0: direct_proportion,
            1: inverse_proportion,
            2: geometric_sequence,
            4: inverse_entropy_proportion,
        }
        feature = options.get("f", 0)
        if feature not in feature_functions:
            raise ValueError("unsupported feature function f=%r" % (feature,))
        word_features = feature_functions[feature]
    else:
        raise ValueError("unsupported summarization mode m=%r" % (mode,))

    sentences = sent_tokenize(text)
    freqdict = get_freqdict(sentences)

    if word_features is None:
        scores = [score(sentence, freqdict) for sentence in sentences]
    else:
        scores = []
        # Running occurrence count of every word over the whole text, so
        # the feature function can weight the k-th occurrence of a word
        # against its total frequency.
        feature_dict = {}
        for sentence in sentences:
            sent_score = 0.0
            for word in sentence:
                feature_dict[word] = feature_dict.get(word, 0.0) + 1
                weight = word_features(feature_dict[word], freqdict[word])
                sent_score += np.log(freqdict[word]) * weight
            sent_score /= len(sentence)
            scores.append(sent_score)

    # Greedily pick sentences by descending score until the token budget
    # would be exceeded.
    ranking = sorted(range(len(scores)), key=lambda k: scores[k], reverse=True)
    topics = []
    length = 0
    for index in ranking:
        length += len(sentences[index])
        if length > limit:
            break
        topics.append(index)

    # Emit the chosen sentences in document order.
    topics.sort()
    return "".join(["".join(sentences[topic]) for topic in topics])
if __name__ == '__main__':
    # A text below comes from http://www.lifehacker.jp/2014/01/140121tabroid_dionote.html .
    test_title = u"真に「使える」手書きメモアプリだと思わせてくれた『DioNote』"
    test_text = u"""Android:手書きメモアプリが使えないのは過去の話になったかも。
「手書きメモアプリ」と聞くだけで、筆者は敬遠するところがありました。今までいくつかのアプリを試してきて、うまく文字が書けたり、正しく反映されたためしがなかったのです。大人しくキーボードから入力するメモが一番だ、と。
ですが、今回紹介する『DioNote』は、手書き反映の機敏さといい、認識力の高さといい、かなりの実力を持っていて、久々に「いいね!」と言いたくなるアプリでした。
加えて、画像の挿入や文字入力、メモのショートカットをホームに置けるなど、細かな機能も実装されており、あらゆる点からなかなか使える仕上がりとなっています。
早速、トップ画面右上のプラスマークからメモを作ってみます。ノートのようなデザインです。画面下部の領域に文字を手書きで入力していきます。一文字書いてみると、反応の正確さにビックリします。
すぐさま一文字書いたことが認識され、新たな文字、さらに新たな文字...と、そのテンポの良さも素晴らしい。ちなみに、一文字ずつだけでなく、横に連続で書いていくことも可能です。
画面右上のメニューから「キャンパス作成」をタップすると、真っさらな自由帳のような画面になります。ここでは画像の貼り付けも自由にでき、より気ままなメモを作成できます。
"""
    separator = "===================================================================================================="

    # First show the original article under its title.
    print(test_title)
    print(separator)
    print(test_text)

    # Then print one summary per configuration: the basic model followed by
    # the word position model with each implemented feature function.
    configurations = (
        {"m": 0},
        {"m": 1, "f": 0},
        {"m": 1, "f": 1},
        {"m": 1, "f": 2},
        {"m": 1, "f": 4},
    )
    for settings in configurations:
        print(test_title)
        print(separator)
        print(summarize(test_text, **settings))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment