Last active
February 12, 2022 13:26
-
-
Save shinysu/0ff8d1cdf6173f2ab4ee3ec0746a5116 to your computer and use it in GitHub Desktop.
webPageAnalyzer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import PySimpleGUI as sg | |
from utils import get_statistics | |
# Every widget in the window uses the same font, so it is spelled out once.
_FONT = ('Arial', '16')

# Window layout: one row with the URL prompt, entry field and trigger button,
# followed by one row with the multi-line area that receives the results.
layout = [
    [
        sg.Text("Enter the URL: ", font=_FONT),
        sg.Input("", font=_FONT, size=(40, 1), key='url'),
        sg.Button("Get Data", font=_FONT, key='get'),
    ],
    [
        sg.Multiline("", font=_FONT, size=(70, 15), key='output'),
    ],
]
def display_analytics():
    """Analyze the URL typed into the 'url' field and show the result.

    Reads the module-level ``values`` mapping and writes to the 'output'
    element of the module-level ``window``; both are created by the event
    loop in the ``__main__`` section.

    ``get_statistics`` is expected to return a dict shaped like
    ``{'lines': int, 'words': int, 'unique_words': int,
    'most_common_words': [(word, count), ...]}``.
    """
    url = values['url'].strip()
    if not url:
        window['output'].print("Please enter a URL.")
        return

    try:
        statistics = get_statistics(url)
    except Exception as err:
        # UI boundary: a malformed URL or a network failure must not take
        # down the event loop, so report it in the output area instead.
        window['output'].print("Could not analyze the page:", err)
        return

    if statistics is None:
        window['output'].print("No statistics are available for this page.")
        return

    display_values(statistics)
def display_values(statistics):
    """Append the contents of *statistics* to the window's 'output' element.

    *statistics* must provide the keys 'lines', 'words', 'unique_words' and
    'most_common_words'; the last one is iterated as (word, count) pairs.
    Uses the module-level ``window``.
    """
    output = window['output']
    output.print("The web page contains the following information\n")

    # One line per count, in display order: (key in statistics, label shown).
    count_rows = (
        ('lines', "sentences"),
        ('words', "words"),
        ('unique_words', "unique words"),
    )
    for key, label in count_rows:
        output.print(statistics[key], label)

    output.print("The most common words are:")
    for entry in statistics['most_common_words']:
        word, count = entry
        output.print(word, "-", count)
if __name__ == '__main__':
    # `window` and `values` are deliberately module-level names:
    # display_analytics() and display_values() read them as globals.
    window = sg.Window("WebPageAnalyzer", layout)

    while True:
        event, values = window.Read()
        if event == sg.WINDOW_CLOSED:
            # The user closed the window: leave the event loop.
            break
        if event == 'get':
            display_analytics()

    window.Close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
from webutils import get_html_content, parse_html_page | |
import re | |
from collections import Counter | |
def get_statistics(url):
    """Download the page at *url* and summarize the text of its paragraphs.

    Returns a dict with these keys:
        'lines'             -- number of sentences found
        'words'             -- total number of words
        'unique_words'      -- number of distinct words
        'most_common_words' -- result of get_most_common_words(), or an
                               empty list when that yields nothing
    """
    content = get_html_content(url)
    data = parse_html_page(content)
    lines = get_lines(data)
    words = get_words(lines)
    unique_words = set(words)
    most_common_words = get_most_common_words(words)
    # The GUI indexes into this dict, so the values must actually be returned
    # rather than just computed.
    return {
        'lines': len(lines),
        'words': len(words),
        'unique_words': len(unique_words),
        'most_common_words': most_common_words or [],
    }
def get_lines(data):
    """Split each paragraph in *data* into sentences.

    A sentence ends at any run of '.', '!' or '?'. Surrounding whitespace is
    removed and fragments that are empty or whitespace-only are dropped, so
    a trailing space after the final period does not count as a sentence.

    Returns a flat list of sentence strings in document order.
    """
    lines = []
    for para in data:
        for fragment in re.split(r'[.!?]+', para):
            sentence = fragment.strip()
            if sentence:
                lines.append(sentence)
    return lines
def get_words(lines):
    """Return every whitespace-separated word in *lines*, punctuation removed.

    Each line is cleaned with remove_punctuation() before it is split; the
    words are returned as one flat list in their original order.
    """
    collected = []
    for sentence in lines:
        collected += remove_punctuation(sentence).split()
    return collected
def remove_punctuation(line):
    """Return *line* without ASCII punctuation and without outer whitespace.

    The characters removed are exactly those in ``string.punctuation``.
    """
    # A translation table whose third argument lists characters to delete.
    delete_punctuation = str.maketrans("", "", string.punctuation)
    return line.translate(delete_punctuation).strip()
def get_most_common_words(words, stopwords=None):
    """Return the five most frequent words that are not stop words.

    Args:
        words: iterable of word strings.
        stopwords: optional iterable of words to ignore. When omitted, the
            list is loaded with get_stop_words().

    Returns:
        A list of up to five (word, count) tuples, most frequent first.
    """
    if stopwords is None:
        stopwords = get_stop_words()
    # A set makes each membership test O(1) instead of scanning a list.
    excluded = set(stopwords)
    counts = Counter(word for word in words if word not in excluded)
    # Return the result (instead of printing it) so callers can display it.
    return counts.most_common(5)
def get_stop_words(path=None):
    """Read the stop-word list, one word per line, and return it as a list.

    Args:
        path: location of the stop-word file. When omitted, the original
            hard-coded location is used; that path exists only on the
            original author's machine, so pass *path* anywhere else.

    Returns:
        A list of stop words with surrounding whitespace removed; blank
        lines are skipped.
    """
    if path is None:
        path = '/Users/shinysuresh/Documents/KCGPC_FirstYear/Batch2-PySimpleGUI/PySimpleGUI-Programs/webscrapping_demo/stopwords.txt'
    with open(path, 'r') as fp:
        # Strip all surrounding whitespace (not just '\n') so a stray space
        # after a word does not stop it from matching.
        stripped = (line.strip() for line in fp)
        return [word for word in stripped if word]
if __name__ == "__main__":
    # Manual smoke test: analyze a sample article and show what was computed.
    url = "https://realpython.com/beautiful-soup-web-scraper-python/"
    statistics = get_statistics(url)
    if statistics is not None:
        print(statistics)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
def get_html_content(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the raw response body.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely, which would
            freeze a GUI caller.

    Returns:
        The response body as bytes (``Response.content``).
    """
    page = requests.get(url, timeout=timeout)
    return page.content
def parse_html_page(content):
    """Return the text of every <p> element found in *content*.

    *content* is parsed with BeautifulSoup's 'html.parser'; the result is a
    list with one string per paragraph, in document order.
    """
    soup = BeautifulSoup(content, 'html.parser')
    return [paragraph.text for paragraph in soup.find_all('p')]
if __name__ == "__main__":
    # Manual smoke test: download a sample article and parse its paragraphs.
    url = "https://realpython.com/beautiful-soup-web-scraper-python/"
    parse_html_page(get_html_content(url))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment