shinysu · February 12, 2022 13:26
diff --git a/bot.py b/bot.py
 import PySimpleGUI as sg
 from utils import get_statistics

 # Window layout: a row holding the URL prompt, the text input (key 'url') and
 # the "Get Data" button (key 'get'), followed by a multi-line area (key
 # 'output') that the display functions print into.
 _FONT = ('Arial','16')
 _input_row = [
    sg.Text("Enter the URL: ", font=_FONT),
    sg.Input("", font=_FONT, size=(40,1), key='url'),
    sg.Button("Get Data", font=_FONT, key='get'),
 ]
 _output_row = [sg.Multiline("", font=_FONT, size=(70, 15), key='output')]
 layout = [_input_row, _output_row]

 def display_analytics():
    """Fetch statistics for the URL typed into the window and display them.

    Reads the 'url' entry from the module-level ``values`` mapping (assigned
    by the event loop's ``window.Read()`` call), passes it to
    ``get_statistics`` and hands the result to ``display_values``.
    """
    # Result shape noted by the original author:
    # {'lines': len(lines), 'words': len(words), 'unique_words': len(unique_words),
    #  'most_common_words': most_common_words}
    display_values(get_statistics(values['url']))


 def display_values(statistics):
    """Print the page statistics into the window's 'output' element.

    ``statistics`` is indexed with the keys 'lines', 'words', 'unique_words'
    and 'most_common_words'; the last entry is iterated as (word, count)
    pairs. Uses the module-level ``window``.
    """
    emit = window['output'].print
    emit("The web page contains the following information\n")
    labelled_counts = (
        ('lines', "sentences"),
        ('words', "words"),
        ('unique_words', "unique words"),
    )
    for key, label in labelled_counts:
        emit(statistics[key], label)
    emit("The most common words are:")
    for term, occurrences in statistics['most_common_words']:
        emit(term, "-", occurrences)


 if __name__ == '__main__':
    # Script entry point: build the window and run the event loop until the
    # user closes it.
    window = sg.Window("WebPageAnalyzer", layout)
    try:
        while True:
            # `values` must remain a module-level name: display_analytics()
            # reads the URL from it.
            button, values = window.Read()
            if button == sg.WINDOW_CLOSED:
                break
            if button == 'get':
                display_analytics()
    finally:
        # Release the window even when a handler raises (e.g. a failed
        # download), instead of skipping the Close() call.
        window.Close()
diff --git a/utils.py b/utils.py
 import string
 from webutils import get_html_content, parse_html_page
 import re
 from collections import Counter

 def get_statistics(url):
    """Download the page at *url* and summarise its paragraph text.

    Args:
        url: Address passed to ``get_html_content``.

    Returns:
        dict with the keys
        'lines' (number of sentence fragments from ``get_lines``),
        'words' (number of words from ``get_words``),
        'unique_words' (number of distinct words) and
        'most_common_words' (result of ``get_most_common_words``).
    """
    content = get_html_content(url)
    data = parse_html_page(content)
    lines = get_lines(data)
    words = get_words(lines)
    unique_words = set(words)
    # Normalise a missing result to an empty list so callers can always
    # iterate over 'most_common_words'.
    most_common_words = get_most_common_words(words) or []
    return {
        'lines': len(lines),
        'words': len(words),
        'unique_words': len(unique_words),
        'most_common_words': most_common_words,
    }


 def get_lines(data):
    """Split every paragraph in *data* into sentence fragments.

    Each paragraph is split on runs of '.', '!' or '?'. Fragments that are
    empty or contain only whitespace (for example the remainder after a
    trailing "! ") are dropped so they are not counted as sentences.
    Kept fragments are returned unmodified, including surrounding whitespace.

    Args:
        data: Iterable of paragraph strings.

    Returns:
        list of fragment strings, in paragraph order.
    """
    return [
        fragment
        for para in data
        for fragment in re.split(r'[.!?]+', para)
        if fragment.strip()
    ]


 def get_words(lines):
    """Return all whitespace-separated words found in *lines*.

    Every line is first passed through ``remove_punctuation`` and then
    split on whitespace; the words of all lines are returned in one list.
    """
    return [
        token
        for line in lines
        for token in remove_punctuation(line).split()
    ]


 def remove_punctuation(line):
    """Return *line* with every ``string.punctuation`` character deleted and
    leading/trailing whitespace stripped."""
    drop_table = str.maketrans("", "", string.punctuation)
    return line.translate(drop_table).strip()


 def get_most_common_words(words):
    """Return the five most frequent words that are not stop words.

    Stop words come from ``get_stop_words()``; the comparison is an exact
    string match.

    Args:
        words: Iterable of word strings.

    Returns:
        list of (word, count) tuples as produced by
        ``Counter.most_common(5)``. The list is also printed, as the
        function did before it returned a value.
    """
    # A set gives constant-time membership tests instead of scanning a list
    # for every word.
    stopwords = set(get_stop_words())
    most_common = Counter(
        word for word in words if word not in stopwords
    ).most_common(5)
    print(most_common)
    return most_common
    


 def get_stop_words(path=None):
    """Read a stop-word list that holds one word per line.

    Args:
        path: File to read. When omitted, the original hard-coded location
            is used, so existing callers behave as before.

    Returns:
        list of the file's lines with newline characters stripped from
        both ends.
    """
    if path is None:
        path = '/Users/shinysuresh/Documents/KCGPC_FirstYear/Batch2-PySimpleGUI/PySimpleGUI-Programs/webscrapping_demo/stopwords.txt'
    with open(path, 'r') as fp:
        return [word.strip('\n') for word in fp]


 if __name__ == "__main__":
    # Manual run: compute the statistics for a sample article.
    get_statistics("https://realpython.com/beautiful-soup-web-scraper-python/")

diff --git a/webutils.py b/webutils.py
 import requests
 from bs4 import BeautifulSoup

 def get_html_content(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response body.

    Args:
        url: Address to download.
        timeout: Seconds handed to ``requests.get``. Without a timeout a
            stalled server would block the caller indefinitely.

    Returns:
        The ``content`` attribute of the response.

    Raises:
        requests.exceptions.RequestException: propagated from
            ``requests.get`` (including timeouts).
    """
    page = requests.get(url, timeout=timeout)
    return page.content


 def parse_html_page(content):
    """Return the text of every <p> element found in *content*.

    *content* is parsed with BeautifulSoup's 'html.parser'; the result is a
    list with one string per paragraph, in the order ``find_all`` yields them.
    """
    soup = BeautifulSoup(content, 'html.parser')
    return [paragraph.text for paragraph in soup.find_all('p')]


 if __name__ == "__main__":
    # Manual run: download a sample article and parse its paragraphs.
    url = "https://realpython.com/beautiful-soup-web-scraper-python/"
    parse_html_page(get_html_content(url))
	import PySimpleGUI as sg
	from utils import get_statistics

	# Window layout: a row holding the URL prompt, the text input (key 'url') and
	# the "Get Data" button (key 'get'), followed by a multi-line area (key
	# 'output') that the display functions print into.
	_FONT = ('Arial','16')
	_input_row = [
		sg.Text("Enter the URL: ", font=_FONT),
		sg.Input("", font=_FONT, size=(40,1), key='url'),
		sg.Button("Get Data", font=_FONT, key='get'),
	]
	_output_row = [sg.Multiline("", font=_FONT, size=(70, 15), key='output')]
	layout = [_input_row, _output_row]

	def display_analytics():
		"""Fetch statistics for the URL typed into the window and display them.

		Reads the 'url' entry from the module-level ``values`` mapping (assigned
		by the event loop's ``window.Read()`` call), passes it to
		``get_statistics`` and hands the result to ``display_values``.
		"""
		url = values['url']
		statistics = get_statistics(url)
		# Result shape noted by the original author:
		# {'lines': len(lines), 'words': len(words), 'unique_words': len(unique_words),
		#  'most_common_words': most_common_words}
		display_values(statistics)


	def display_values(statistics):
		"""Print the page statistics into the window's 'output' element.

		``statistics`` is indexed with the keys 'lines', 'words', 'unique_words'
		and 'most_common_words'; the last entry is iterated as (word, count)
		pairs. Uses the module-level ``window``.
		"""
		output = window['output']
		output.print("The web page contains the following information\n")
		output.print(statistics['lines'], "sentences")
		output.print(statistics['words'], "words")
		output.print(statistics['unique_words'], "unique words")
		output.print("The most common words are:")
		for word, count in statistics['most_common_words']:
			output.print(word, "-", count)


	if __name__ == '__main__':
		# Script entry point: build the window and run the event loop until the
		# user closes it.
		window = sg.Window("WebPageAnalyzer", layout)
		try:
			while True:
				# `values` must remain a module-level name: display_analytics()
				# reads the URL from it.
				button, values = window.Read()
				if button == sg.WINDOW_CLOSED:
					break
				if button == 'get':
					display_analytics()
		finally:
			# Release the window even when a handler raises, instead of
			# skipping the Close() call.
			window.Close()
	import string
	from webutils import get_html_content, parse_html_page
	import re
	from collections import Counter

	def get_statistics(url):
		"""Download the page at *url* and summarise its paragraph text.

		Args:
			url: Address passed to ``get_html_content``.

		Returns:
			dict with the keys
			'lines' (number of sentence fragments from ``get_lines``),
			'words' (number of words from ``get_words``),
			'unique_words' (number of distinct words) and
			'most_common_words' (result of ``get_most_common_words``).
		"""
		content = get_html_content(url)
		data = parse_html_page(content)
		lines = get_lines(data)
		words = get_words(lines)
		unique_words = set(words)
		# Normalise a missing result to an empty list so callers can always
		# iterate over 'most_common_words'.
		most_common_words = get_most_common_words(words) or []
		return {
			'lines': len(lines),
			'words': len(words),
			'unique_words': len(unique_words),
			'most_common_words': most_common_words,
		}


	def get_lines(data):
		"""Split every paragraph in *data* into sentence fragments.

		Each paragraph is split on runs of '.', '!' or '?'. Fragments that are
		empty or contain only whitespace are dropped so they are not counted
		as sentences. Kept fragments are returned unmodified.

		Args:
			data: Iterable of paragraph strings.

		Returns:
			list of fragment strings, in paragraph order.
		"""
		return [
			fragment
			for para in data
			for fragment in re.split(r'[.!?]+', para)
			if fragment.strip()
		]


	def get_words(lines):
		"""Return all whitespace-separated words found in *lines*.

		Every line is first passed through ``remove_punctuation`` and then
		split on whitespace; the words of all lines are returned in one list.
		"""
		words = []
		for line in lines:
			cleaned_line = remove_punctuation(line)
			words.extend(cleaned_line.split())
		return words


	def remove_punctuation(line):
		"""Return *line* with every ``string.punctuation`` character deleted and
		leading/trailing whitespace stripped."""
		st = str.maketrans("", "", string.punctuation)
		cleaned_line = line.translate(st).strip()
		return cleaned_line


	def get_most_common_words(words):
		"""Return the five most frequent words that are not stop words.

		Stop words come from ``get_stop_words()``; the comparison is an exact
		string match.

		Args:
			words: Iterable of word strings.

		Returns:
			list of (word, count) tuples as produced by
			``Counter.most_common(5)``. The list is also printed, as the
			function did before it returned a value.
		"""
		# A set gives constant-time membership tests instead of scanning a
		# list for every word.
		stopwords = set(get_stop_words())
		most_common = Counter(
			word for word in words if word not in stopwords
		).most_common(5)
		print(most_common)
		return most_common



	def get_stop_words(path=None):
		"""Read a stop-word list that holds one word per line.

		Args:
			path: File to read. When omitted, the original hard-coded location
				is used, so existing callers behave as before.

		Returns:
			list of the file's lines with newline characters stripped from
			both ends.
		"""
		if path is None:
			path = '/Users/shinysuresh/Documents/KCGPC_FirstYear/Batch2-PySimpleGUI/PySimpleGUI-Programs/webscrapping_demo/stopwords.txt'
		with open(path, 'r') as fp:
			return [word.strip('\n') for word in fp]


	if __name__ == "__main__":
		# Manual run: compute the statistics for a sample article.
		url = "https://realpython.com/beautiful-soup-web-scraper-python/"
		get_statistics(url)
	import requests
	from bs4 import BeautifulSoup

	def get_html_content(url, timeout=30):
		"""Fetch *url* with an HTTP GET and return the response body.

		Args:
			url: Address to download.
			timeout: Seconds handed to ``requests.get``. Without a timeout a
				stalled server would block the caller indefinitely.

		Returns:
			The ``content`` attribute of the response.

		Raises:
			requests.exceptions.RequestException: propagated from
				``requests.get`` (including timeouts).
		"""
		page = requests.get(url, timeout=timeout)
		return page.content


	def parse_html_page(content):
		"""Return the text of every <p> element found in *content*.

		*content* is parsed with BeautifulSoup's 'html.parser'; the result is a
		list with one string per paragraph, in the order ``find_all`` yields them.
		"""
		data = []
		soup = BeautifulSoup(content, 'html.parser')
		para_data = soup.find_all('p')
		for para in para_data:
			data.append(para.text)
		return data


	if __name__ == "__main__":
		# Manual run: download a sample article and parse its paragraphs.
		url = "https://realpython.com/beautiful-soup-web-scraper-python/"
		content = get_html_content(url)
		parse_html_page(content)