Skip to content

Instantly share code, notes, and snippets.

@igor-shevchenko
Created June 20, 2013 08:34
Show Gist options
  • Save igor-shevchenko/5821166 to your computer and use it in GitHub Desktop.
Save igor-shevchenko/5821166 to your computer and use it in GitHub Desktop.
TextRank algorithm for text summarization.
from itertools import combinations
from nltk.tokenize import sent_tokenize, RegexpTokenizer
from nltk.stem.snowball import RussianStemmer
import networkx as nx
def similarity(s1, s2):
    """Return the word-overlap similarity of two sentences.

    Each sentence is represented as a set of words.  The score is the number
    of words the two sets share divided by the combined size of both sets,
    so it ranges from 0.0 (nothing in common) to 0.5 (identical sets).

    Args:
        s1: Set of words of the first sentence.
        s2: Collection of words of the second sentence.

    Returns:
        The overlap score as a float; 0.0 when either argument is empty.
    """
    # An empty sentence cannot overlap with anything; returning early also
    # keeps the denominator below strictly positive.
    if not s1 or not s2:
        return 0.0
    shared = len(s1.intersection(s2))
    # float() forces true division regardless of the interpreter's default
    # integer-division behaviour.
    return shared / float(len(s1) + len(s2))
def textrank(text):
    """Rank the sentences of *text* by PageRank over a word-overlap graph.

    The text is split into sentences, each sentence is lower-cased, tokenized
    on ``\\w+`` and reduced to a set of stems produced by the Russian Snowball
    stemmer.  Every pair of sentences with a non-zero ``similarity`` score
    becomes a weighted edge of an undirected graph, and PageRank is run on
    that graph.

    Args:
        text: The document to rank, as a single string.

    Returns:
        A list of ``(index, score, sentence)`` tuples sorted by descending
        PageRank score, where ``index`` is the sentence's position in the
        text.  Sentences that share no words with any other sentence never
        enter the graph and are therefore omitted.  The list is empty when
        no pair of sentences overlaps (including texts with fewer than two
        sentences).
    """
    # NOTE(review): sent_tokenize is called without a language argument while
    # stemming is Russian -- confirm the default sentence splitter is intended.
    sentences = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = RussianStemmer()
    words = [
        {stemmer.stem(word) for word in tokenizer.tokenize(sentence.lower())}
        for sentence in sentences
    ]

    # Only pairs with a non-zero score become edges; zero-weight pairs would
    # add nodes to the graph without linking them to anything.
    edges = []
    for i, j in combinations(range(len(sentences)), 2):
        weight = similarity(words[i], words[j])
        if weight:
            edges.append((i, j, weight))

    # Nothing to rank: make the empty result explicit instead of handing an
    # empty graph to PageRank.
    if not edges:
        return []

    graph = nx.Graph()
    graph.add_weighted_edges_from(edges)
    ranks = nx.pagerank(graph)

    ranked = [
        (index, ranks[index], sentence)
        for index, sentence in enumerate(sentences)
        if index in ranks
    ]
    # The score is already stored in each tuple; the sort is stable, so ties
    # keep their document order.
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
def extract(text, n=5):
    """Build a summary of *text* from its highest-ranked sentences.

    Args:
        text: The document to summarize, as a single string.
        n: How many of the top-ranked sentences to keep (default 5).

    Returns:
        The selected sentences joined by single spaces, in the order in
        which they appear in the original text.
    """
    ranked = textrank(text)
    best = ranked[:n]
    # Each entry starts with the sentence index, so a plain tuple sort puts
    # the chosen sentences back into document order.
    best.sort()
    return ' '.join(sentence for _, _, sentence in best)
@igor-shevchenko
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment