Last active
November 29, 2020 04:53
-
-
Save shinysu/d8466aa3907d146b8c34a5f4444cc10a to your computer and use it in GitHub Desktop.
Web Page Analyzer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
bot.py : Given an url, analyses the webpage and gives the following information about the page | |
1. Number of sentences in the webpage | |
2. Number of words in the page | |
3. Number of unique words in the page | |
4. 5 most frequent words in the page | |
""" | |
import PySimpleGUI as sg | |
from webutils import get_content_from_url, parse_html_by_tags | |
from utils import get_statistics | |
# Window layout: one input row (URL field + submit button) above a
# read-only multiline box that receives the analysis results.
_font = ("Arial", 14)
layout = [
    [sg.Text("Enter URL", font=_font),
     sg.InputText("", key="url", font=_font),
     sg.Button("Get Data", font=_font, key='get', bind_return_key=True)],
    [sg.Multiline(key="output", font=_font, size=(60, 15), disabled=True)],
]
def get_details(url):
    """
    Fetch the web page at ``url``, extract its paragraph text and show
    the line/word/unique-word counts plus the most frequent words.

    :param url: url of the web page
    """
    page = get_content_from_url(url)
    paragraph_texts = parse_html_by_tags(page, 'p')
    display_statistics(get_statistics(paragraph_texts))
def display_statistics(statistics):
    """
    Display the computed page statistics in the GUI output box.

    :param statistics: dict with keys 'line_count', 'words_count',
        'unique_words' and 'top_words' (a list of (word, count) pairs)
    """
    out = window['output']
    out.Update('')  # clear results from any previous query
    out.print("The web page consists of the following information:\n")
    out.print(statistics['line_count'], "sentences")
    out.print(statistics['words_count'], "words")
    out.print(statistics['unique_words'], "unique words")
    out.print("\nThe top words are\n")
    for word, count in statistics['top_words']:
        # Bug fix: `count` was unpacked but never shown; display the
        # frequency alongside each word as the docstring promises.
        out.print(word, count)
if __name__ == '__main__':
    window = sg.Window('WebPageAnalyzer', layout)
    # Event loop: run until the window is closed, analysing a new URL
    # each time the 'get' button (or Enter) fires.
    running = True
    while running:
        event, values = window.Read()
        if event == sg.WINDOW_CLOSED:
            running = False
        elif event == 'get':
            get_details(values['url'])
    window.Close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import string | |
from collections import Counter | |
# Number of most-frequent words reported by get_statistics.
top_n = 5
def get_statistics(data):
    """
    Analyse the webpage text and summarise it.

    :param data: iterable of paragraph strings extracted from the page
    :return: dict with the sentence count, word count, unique-word count
        and the ``top_n`` most frequent words with their frequencies
    """
    sentences = get_lines(data)
    all_words, distinct_words = get_words(sentences)
    return {
        'line_count': len(sentences),
        'words_count': len(all_words),
        'unique_words': len(distinct_words),
        'top_words': get_top_words(all_words, top_n),
    }
def clean_string(inp_str):
    """
    Normalise a line of text into a list of lowercase words.

    Strips ASCII punctuation, replaces curly quotes and the en dash with
    spaces, lowercases the text and splits it on whitespace.

    :param inp_str: the input line that is to be cleaned
    :return: a list of words in the input string
    """
    # Bug fix: str.maketrans requires its first two arguments to be the
    # same length. The original passed a single space for three source
    # characters ('“', '–', '”'), which raised ValueError on every call.
    table = str.maketrans('“–”', '   ', string.punctuation)
    # split() with no argument already ignores leading/trailing
    # whitespace, so no separate strip() is needed.
    return inp_str.translate(table).lower().split()
def get_lines(text):
    """
    Split paragraph texts into sentences on '.', '!' and '?'.

    :param text: iterable of paragraph strings
    :return: a list of non-empty sentence fragments
    """
    lines = []
    for paragraph in text:
        # Bug fix: re.split leaves empty strings around terminal
        # punctuation (e.g. 'Hi.' -> ['Hi', '']), which inflated the
        # sentence count; keep only segments with visible content.
        segments = re.split(r'[.!?]+', paragraph)
        lines.extend(seg for seg in segments if seg.strip())
    return lines
def get_words(lines):
    """
    Collect the words occurring in the given sentences.

    :param lines: iterable of sentence strings
    :return:
        1. list of all words (with repeats, in order of appearance)
        2. list of the distinct words
    """
    all_words = []
    for sentence in lines:
        all_words += clean_string(sentence)
    return all_words, list(set(all_words))
def get_top_words(words, top_n):
    """
    Return the ``top_n`` most frequent non-stopword words.

    :param words: list of words
    :param top_n: how many of the most frequent words to return
    :return: list of (word, frequency) pairs, most frequent first
    """
    excluded = set(get_stop_words())
    frequencies = Counter(w for w in words if w not in excluded)
    return frequencies.most_common(top_n)
def get_stop_words(filename='stopwords.txt'):
    """
    Read the stop-word list from a text file, one word per line.

    :param filename: path of the stop-word file; defaults to
        'stopwords.txt' so existing callers are unaffected
    :return: list of stopwords
    :raises FileNotFoundError: if the file does not exist
    """
    # Explicit encoding: open()'s default is platform-dependent and
    # could mis-decode non-ASCII stop words on some systems.
    with open(filename, "r", encoding="utf-8") as fp:
        return [line.rstrip('\n') for line in fp]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
def get_content_from_url(url, timeout=10):
    """
    Fetch the raw HTML content of a web page.

    :param url: url of the webpage; 'http://' is prepended when no
        scheme is present
    :param timeout: seconds to wait for the server before giving up
        (default 10); backward-compatible addition — the original call
        had no timeout, so a stalled server would hang the GUI forever
    :return: HTML content of the page as bytes, or None on any error
    """
    try:
        # Normalise scheme-less input such as 'example.com'.
        url = url if url.startswith('http') else ('http://' + url)
        return requests.get(url, timeout=timeout).content
    except requests.ConnectionError as e:
        print("Connection Error: Could not connect to the server or not found.\n")
        print(str(e))
    except requests.Timeout as e:
        print("Timeout Error")
        print(str(e))
    except requests.RequestException as e:
        print("General Error")
        print(str(e))
    except KeyboardInterrupt:
        print("Keyboard interrupt")
def parse_html_by_tags(html, tag):
    """
    Parse an HTML document and collect the text of every matching tag.

    :param html: the html content that is to be parsed (may be None,
        e.g. when the download failed)
    :param tag: the tag name whose text content is wanted
    :return: list of text strings, one per matching element (empty list
        when ``html`` is falsy)
    """
    if not html:
        return []
    soup = BeautifulSoup(html, "html.parser")
    return [element.text for element in soup.find_all(tag)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment