Skip to content

Instantly share code, notes, and snippets.

@VikParuchuri
Created October 15, 2013 20:45
Show Gist options
  • Save VikParuchuri/6998391 to your computer and use it in GitHub Desktop.
Save VikParuchuri/6998391 to your computer and use it in GitHub Desktop.
An implementation of the TextRank algorithm in Python.
import networkx as nx
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
# Intended cap on how many top-ranked sentences make up a summary.
SENTENCES_IN_SUMMARY = 10
# A sentence is kept only if it is strictly longer than this many characters.
MIN_SENTENCE_LENGTH = 50
# A sentence is kept only if it is strictly shorter than this many characters.
MAX_SENTENCE_LENGTH = 200
def f7(seq):
    """Return the items of ``seq`` as a list with duplicates removed.

    Only the first occurrence of each item is kept, and the relative order
    of those first occurrences is preserved. Items must be hashable because
    a set is used to remember what has already been emitted.
    """
    seen = set()
    unique = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
def textrank(article):
    """Build an extractive summary of ``article`` by ranking its sentences.

    The article is split into sentences with the Punkt tokenizer, each
    lower-cased sentence is turned into a TF-IDF vector, and the matrix of
    pairwise sentence similarities is treated as a weighted graph on which
    PageRank is run. Sentences are then ordered by descending PageRank
    score, filtered to those whose length lies strictly between
    ``MIN_SENTENCE_LENGTH`` and ``MAX_SENTENCE_LENGTH`` characters, and the
    top ``SENTENCES_IN_SUMMARY`` of them (duplicates removed) are joined
    with single spaces.

    Returns the summary string. It may contain fewer than
    ``SENTENCES_IN_SUMMARY`` sentences, and is empty when the article has
    no sentences or none pass the length filter.
    """
    sentences = PunktSentenceTokenizer().tokenize(article)
    if not sentences:
        # Nothing to rank; avoid handing an empty corpus to the vectorizer.
        return ""

    lowered = [sentence.lower() for sentence in sentences]
    bow_matrix = CountVectorizer().fit_transform(lowered)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # Sentence-by-sentence similarity matrix. ``.dot`` is a matrix product
    # for both scipy sparse matrices and sparse arrays, whereas ``*`` is
    # only a matrix product for the former.
    similarity_graph = normalized.dot(normalized.T)

    # networkx 3.0 removed ``from_scipy_sparse_matrix``; prefer its
    # replacement and fall back to the old name on older releases.
    graph_from_sparse = getattr(nx, "from_scipy_sparse_array", None)
    if graph_from_sparse is None:
        graph_from_sparse = nx.from_scipy_sparse_matrix
    nx_graph = graph_from_sparse(similarity_graph)
    scores = nx.pagerank(nx_graph)

    # Highest score first; ties fall back to comparing the sentence text.
    ranked = sorted(
        ((scores[i], sentence) for i, sentence in enumerate(sentences)),
        reverse=True
    )
    candidates = [
        sentence for _, sentence in ranked
        if MIN_SENTENCE_LENGTH < len(sentence) < MAX_SENTENCE_LENGTH
    ]

    # Slice instead of indexing so short articles do not raise IndexError.
    return " ".join(f7(candidates[:SENTENCES_IN_SUMMARY]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment