Summarize a web page given its URL, by extracting its highest-scoring sentences with NLTK
import bs4 as bs
import urllib.request
import re
import heapq
import nltk
# Frequency-based extractive summarizer, adapted from:
# https://stackabuse.com/text-summarization-with-nltk-in-python/
# NLTK data troubleshooting:
# https://stackoverflow.com/questions/4867197/failed-loading-english-pickle-with-nltk-data-load
# Dependencies:
# `pip3 install nltk`
# `pip3 install beautifulsoup4`
# `pip3 install lxml`

# Download the tokenizer model and stopword list on first run.
nltk.download('punkt')
nltk.download('stopwords')
# Fetch the page and pull the text out of its <p> tags.
scraped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Christmas')
article = scraped_data.read()
parsed_article = bs.BeautifulSoup(article, 'lxml')
paragraphs = parsed_article.find_all('p')

article_text = ""
for p in paragraphs:
    article_text += p.text
# Remove citation markers like [1] and collapse runs of whitespace.
article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)
article_text = re.sub(r'\s+', ' ', article_text)

# A letters-only copy used for word counting; article_text keeps its
# punctuation so it can still be split into sentences.
formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text)
formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)

sentence_list = nltk.sent_tokenize(article_text)
stopwords = nltk.corpus.stopwords.words('english')
# Count how often each non-stopword appears, then normalize the
# counts against the most frequent word.
word_frequencies = {}
for word in nltk.word_tokenize(formatted_article_text):
    if word not in stopwords:
        if word not in word_frequencies:
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

maximum_frequency = max(word_frequencies.values())
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word] / maximum_frequency
# Score each sentence under 30 words by summing the frequencies
# of the words it contains.
sentence_scores = {}
for sent in sentence_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies:
            if len(sent.split(' ')) < 30:
                if sent not in sentence_scores:
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]
# The summary is the seven highest-scoring sentences.
summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)
summary = ' '.join(summary_sentences)
print(summary)
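
# A possible refactor: wrapping the pipeline in a function makes it reusable
# for other pages. This is a minimal sketch using the same imports as the
# script above; the summarize_url name and num_sentences parameter are
# illustrative, not part of the original gist, and unlike the script it
# lowercases words before counting so the frequency lookup is case-consistent.

def summarize_url(url, num_sentences=7):
    """Return an extractive summary of the page at `url` by scoring
    sentences with normalized word frequencies, as in the script above."""
    html = urllib.request.urlopen(url).read()
    soup = bs.BeautifulSoup(html, 'lxml')
    text = ' '.join(p.text for p in soup.find_all('p'))
    # Strip citation markers and collapse whitespace.
    text = re.sub(r'\s+', ' ', re.sub(r'\[[0-9]*\]', ' ', text))

    # Letters-only, lowercased copy for counting word frequencies.
    letters_only = re.sub(r'\s+', ' ', re.sub('[^a-zA-Z]', ' ', text)).lower()
    stop = set(nltk.corpus.stopwords.words('english'))

    freq = {}
    for word in nltk.word_tokenize(letters_only):
        if word not in stop:
            freq[word] = freq.get(word, 0) + 1
    top = max(freq.values())

    scores = {}
    for sent in nltk.sent_tokenize(text):
        if len(sent.split(' ')) < 30:
            for word in nltk.word_tokenize(sent.lower()):
                if word in freq:
                    scores[sent] = scores.get(sent, 0) + freq[word] / top

    best = heapq.nlargest(num_sentences, scores, key=scores.get)
    return ' '.join(best)

# Example usage:
# print(summarize_url('https://en.wikipedia.org/wiki/Christmas'))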