Created
November 20, 2018 21:02
-
-
Save anmolj7/c995466bfcb03077dd677b9ac82fd8af to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code was written and tested with Python 2.7.
# The NLTK library must be installed first, along with its tokenizer and
# stopword data (see nltk.download()).
import heapq
import re
from collections import Counter

import nltk
class Summary:
    """Extractive summarizer that ranks sentences by word-frequency scores."""

    # Sentences with this many space-separated chunks or more are never
    # scored, so they cannot appear in a summary.
    MAX_SENTENCE_WORDS = 30

    def summary(self, article_text, n=5):
        """Return up to ``n`` of the highest-scoring sentences of a text.

        Each non-stopword is weighted by its frequency relative to the most
        frequent word; a sentence's score is the sum of the weights of its
        tokens. The selected sentences are joined with single spaces, in
        descending score order.

        Args:
            article_text: The text to summarize.
            n: The number of sentences of summary required.

        Returns:
            The summary string, or ``''`` when the text is empty, blank, or
            contains no scoring words.
        """
        # Nothing to summarize; also avoids calling max() on an empty
        # frequency table further down.
        if not article_text or not article_text.strip():
            return ''

        # Keep letters only and fold case. Sentence tokens are lower-cased
        # when scored, so the frequency table must be lower-case too;
        # otherwise capitalised words never match and capitalised stopwords
        # ("The") slip past the lower-case stopword list.
        letters_only = re.sub('[^a-zA-Z]', ' ', article_text)
        letters_only = re.sub(r'\s+', ' ', letters_only).lower()

        stop_words = set(nltk.corpus.stopwords.words('english'))
        word_counts = Counter(
            word
            for word in nltk.word_tokenize(letters_only)
            if word not in stop_words
        )
        if not word_counts:
            # e.g. text made only of digits, punctuation or stopwords.
            return ''

        # Normalise counts so the most frequent word weighs 1.0.
        maximum_freq = float(max(word_counts.values()))
        word_weight = {
            word: count / maximum_freq
            for word, count in word_counts.items()
        }

        sentence_scores = {}
        for sent in nltk.sent_tokenize(article_text):
            # The length test does not depend on the word, so do it once per
            # sentence instead of once per token.
            if len(sent.split(' ')) >= self.MAX_SENTENCE_WORDS:
                continue
            for word in nltk.word_tokenize(sent.lower()):
                if word in word_weight:
                    sentence_scores[sent] = (
                        sentence_scores.get(sent, 0.0) + word_weight[word]
                    )

        summary_sents = heapq.nlargest(
            n, sentence_scores, key=sentence_scores.get
        )
        return ' '.join(summary_sents)
# Read an article from standard input and print its summary.
# Python 3 renamed raw_input() to input(); fall back so the script runs on
# both major versions instead of failing with NameError on Python 3.
try:
    _read_line = raw_input
except NameError:
    _read_line = input

article_text = _read_line('Enter The Input Article Text: ')
S = Summary()
print(S.summary(article_text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment