Created
October 15, 2013 20:45
-
-
Save VikParuchuri/6998391 to your computer and use it in GitHub Desktop.
Implement the textrank algorithm in Python.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import networkx as nx
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Upper bound on how many ranked sentences are joined into the summary.
SENTENCES_IN_SUMMARY = 10
# Sentences must be strictly longer than this many characters to be eligible.
MIN_SENTENCE_LENGTH = 50
# Sentences must be strictly shorter than this many characters to be eligible.
MAX_SENTENCE_LENGTH = 200
def f7(seq):
    """Return the items of *seq* as a list with duplicates removed.

    The first occurrence of each item is kept and the relative order of
    the surviving items is unchanged. Items must be hashable because a
    set is used to track what has already been emitted.

    Args:
        seq: Any iterable of hashable items.

    Returns:
        A new list containing each distinct item once, in first-seen order.
    """
    seen = set()
    unique = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
def textrank(article):
    """Build an extractive summary of *article* by ranking its sentences.

    The article is split into sentences, each sentence is turned into a
    TF-IDF vector, and the product of the TF-IDF matrix with its own
    transpose is used as a weighted sentence-similarity graph. PageRank
    over that graph scores the sentences. The highest-scoring sentences
    whose length lies strictly between MIN_SENTENCE_LENGTH and
    MAX_SENTENCE_LENGTH are joined with single spaces, best first.

    Args:
        article: The text to summarise.

    Returns:
        A string of at most SENTENCES_IN_SUMMARY sentences. It may hold
        fewer when not enough sentences pass the length filter or when
        the selected sentences contain duplicates. An article in which
        the tokenizer finds no sentences yields "".
    """
    sentences = PunktSentenceTokenizer().tokenize(article)
    if not sentences:
        # Nothing to rank; the vectorizer would otherwise be fitted on no text.
        return ""

    lowered = [sentence.lower() for sentence in sentences]
    bow_matrix = CountVectorizer().fit_transform(lowered)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # .dot is always a matrix product; `*` is element-wise on scipy sparse
    # arrays (as opposed to sparse matrices).
    similarity_graph = normalized.dot(normalized.T)

    # networkx 3.0 removed from_scipy_sparse_matrix in favour of
    # from_scipy_sparse_array; use whichever this installation provides.
    graph_from_sparse = getattr(nx, "from_scipy_sparse_array", None)
    if graph_from_sparse is None:
        graph_from_sparse = nx.from_scipy_sparse_matrix
    nx_graph = graph_from_sparse(similarity_graph)
    scores = nx.pagerank(nx_graph)

    # Graph node i corresponds to sentences[i]; sort best score first
    # (ties fall back to comparing the sentence text).
    ranked = sorted(
        ((scores[i], sentence) for i, sentence in enumerate(sentences)),
        reverse=True,
    )
    candidates = [
        sentence
        for _, sentence in ranked
        if MIN_SENTENCE_LENGTH < len(sentence) < MAX_SENTENCE_LENGTH
    ]

    # Slicing (rather than indexing range(N)) tolerates fewer than
    # SENTENCES_IN_SUMMARY eligible sentences.
    return " ".join(f7(candidates[:SENTENCES_IN_SUMMARY]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment