Created
February 27, 2014 20:35
-
-
Save mcrisc/9258981 to your computer and use it in GitHub Desktop.
Centroid-based Text Summarization
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #coding: utf-8 | |
| """ | |
| Centroid-based Summarization | |
| [Jurafsky & Martin, 2nd ed, ch23, sec23.4.1] | |
| Author: Marcelo Criscuolo (criscuolo[dot]marcelo[at]gmail[dot]com) | |
| Date: 2013-07-19 | |
| """ | |
| from __future__ import print_function | |
| import glob | |
| import re | |
| import sys | |
| import math | |
| from collections import Counter | |
| SUMMARY_SENTENCES = 4 | |
| def segmentize(text): | |
| sentences = [] | |
| begin = 0 | |
| for match in re.finditer(r'([^\s.]*)\s*(\.+)\s*([^\s.]?)', text): | |
| prefix = match.group(1) | |
| suffix = match.group(3) | |
| if is_eos(prefix, suffix, match.group(0)): | |
| end = match.start(3) | |
| sentence = text[begin:end].strip() | |
| begin = end | |
| sentences.append(sentence) | |
| return [s for s in sentences if len(s) > 1] # FIXME len(s) > 1: SUCH A SHAME!!! | |
| def is_eos(prefix, suffix, region): | |
| # domains? | |
| if re.search(r'\S\.\S', region): | |
| return False | |
| # numbers? | |
| if len(suffix) > 0 and suffix[0].isdigit(): | |
| return False | |
| # abbreviations? | |
| if len(prefix) in range(1, 5) and prefix[0].isupper(): | |
| return False | |
| return True | |
| def load_text(filename): | |
| with open(filename) as fin: | |
| text = fin.read() | |
| start = text.find('title:') + len('title:') | |
| finish = text.find('\n', start) | |
| title = text[start:finish].strip() | |
| text = text[finish:] | |
| return (title, text) | |
| def words(text): | |
| return [w.lower() for w in re.findall(r'[^\s.:,;!()?/\'"]+', text)] | |
| def compute_idf(): | |
| idf = {} | |
| filecount = 0.0 | |
| for filename in glob.glob('artigo*.txt'): | |
| filecount += 1 | |
| title, text = load_text(filename) | |
| for w in set(words(text)): | |
| idf[w] = idf.get(w, 0) + 1 | |
| for w in idf.keys(): | |
| idf[w] = math.log(filecount / idf[w]) | |
| return idf | |
| def compute_tf(text): | |
| counter = Counter(words(text)) | |
| total_words = sum(c for w, c in counter.items()) | |
| tf = dict((w, float(c)/total_words) for w, c in counter.items()) | |
| return tf | |
| def cosinesim(idf, text1, text2): | |
| tf1 = compute_tf(text1) | |
| words1 = [w for w in tf1.keys()] | |
| tf2 = compute_tf(text2) | |
| words2 = [w for w in tf2.keys()] | |
| dotprod = sum(tf1.get(w, 0) * tf2.get(w, 0) * (idf[w] ** 2) for w in | |
| set(words1 + words2)) | |
| norm1 = math.sqrt(sum((tf1[w] * idf[w])**2 for w in words1)) | |
| norm2 = math.sqrt(sum((tf2[w] * idf[w])**2 for w in words2)) | |
| return dotprod / (norm1 * norm2) | |
| def rank(idf, sentences): | |
| scount = len(sentences) | |
| sim_matrix = [[0] * scount for i in range(scount)] | |
| for i in range(scount-1): | |
| for j in range(i+1, scount): | |
| sim_matrix[i][j] = cosinesim(idf, sentences[i], sentences[j]) | |
| sim_matrix[j][i] = sim_matrix[i][j] | |
| ranking = [] | |
| for spos in range(scount): | |
| avg_sim = sum(sim_matrix[spos]) / scount | |
| ranking.append((spos, avg_sim)) | |
| ranking.sort(key=lambda t: t[1], reverse=True) | |
| return ranking | |
| def show_tf(): | |
| filename = sys.argv[1] | |
| idf = compute_idf() | |
| title, text = load_text(filename) | |
| tf = compute_tf(text) | |
| for w, c in tf.items(): | |
| print(c * idf.get(w, 0), w) | |
| def main(): | |
| filename = sys.argv[1] | |
| idf = compute_idf() | |
| title, text = load_text(filename) | |
| sentences = segmentize(text) | |
| ranking = rank(idf, sentences) | |
| relevant = [s for s, r in ranking[:SUMMARY_SENTENCES]] | |
| relevant.sort() | |
| # the order the sentences occur originally | |
| print() | |
| print('==', title, '==') | |
| print() | |
| for s in relevant: | |
| print(sentences[s]) | |
| print() | |
| print() | |
| if __name__ == '__main__': | |
| main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment