Last active
November 29, 2020 04:53
-
-
Save shinysu/d8466aa3907d146b8c34a5f4444cc10a to your computer and use it in GitHub Desktop.
Web Page Analyzer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
bot.py : Given an url, analyses the webpage and gives the following information about the page | |
1. Number of sentences in the webpage | |
2. Number of words in the page | |
3. Number of unique words in the page | |
4. 5 most frequent words in the page | |
""" | |
import PySimpleGUI as sg | |
from webutils import get_content_from_url, parse_html_by_tags | |
from utils import get_statistics | |
# Window layout: one input row (URL field + submit button) above a
# read-only multiline box that receives the analysis results.
_font = ("Arial", 14)
layout = [
    [sg.Text("Enter URL", font=_font),
     sg.InputText("", key="url", font=_font),
     sg.Button("Get Data", font=_font, key='get', bind_return_key=True)],
    [sg.Multiline(key="output", font=_font, size=(60, 15), disabled=True)],
]
def get_details(url):
    """
    Fetch the web page at ``url``, extract its paragraph text and show
    the line/word/unique-word counts plus the most frequent words.

    :param url: url of the web page
    """
    page = get_content_from_url(url)
    paragraph_texts = parse_html_by_tags(page, 'p')
    display_statistics(get_statistics(paragraph_texts))
def display_statistics(statistics):
    """
    Display the computed page statistics in the GUI output box.

    :param statistics: dict with keys 'line_count', 'words_count',
        'unique_words' and 'top_words' (a list of (word, count) pairs)
    """
    out = window['output']
    out.Update('')  # clear results from any previous query
    out.print("The web page consists of the following information:\n")
    out.print(statistics['line_count'], "sentences")
    out.print(statistics['words_count'], "words")
    out.print(statistics['unique_words'], "unique words")
    out.print("\nThe top words are\n")
    for word, count in statistics['top_words']:
        # Bug fix: `count` was unpacked but never shown; display the
        # frequency alongside each word as the docstring promises.
        out.print(word, count)
if __name__ == '__main__':
    window = sg.Window('WebPageAnalyzer', layout)
    # Event loop: run until the window is closed, analysing a new URL
    # each time the 'get' button (or Enter) fires.
    running = True
    while running:
        event, values = window.Read()
        if event == sg.WINDOW_CLOSED:
            running = False
        elif event == 'get':
            get_details(values['url'])
    window.Close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import string | |
from collections import Counter | |
# Number of most-frequent words reported by get_statistics.
top_n = 5
def get_statistics(data):
    """
    Analyse the webpage text and summarise it.

    :param data: iterable of paragraph strings extracted from the page
    :return: dict with the sentence count, word count, unique-word count
        and the ``top_n`` most frequent words with their frequencies
    """
    sentences = get_lines(data)
    all_words, distinct_words = get_words(sentences)
    return {
        'line_count': len(sentences),
        'words_count': len(all_words),
        'unique_words': len(distinct_words),
        'top_words': get_top_words(all_words, top_n),
    }
def clean_string(inp_str):
    """
    Normalise a line of text into a list of lowercase words.

    Strips ASCII punctuation, replaces curly quotes and the en dash with
    spaces, lowercases the text and splits it on whitespace.

    :param inp_str: the input line that is to be cleaned
    :return: a list of words in the input string
    """
    # Bug fix: str.maketrans requires its first two arguments to be the
    # same length. The original passed a single space for three source
    # characters ('“', '–', '”'), which raised ValueError on every call.
    table = str.maketrans('“–”', '   ', string.punctuation)
    # split() with no argument already ignores leading/trailing
    # whitespace, so no separate strip() is needed.
    return inp_str.translate(table).lower().split()
def get_lines(text):
    """
    Split paragraph texts into sentences on '.', '!' and '?'.

    :param text: iterable of paragraph strings
    :return: a list of non-empty sentence fragments
    """
    lines = []
    for paragraph in text:
        # Bug fix: re.split leaves empty strings around terminal
        # punctuation (e.g. 'Hi.' -> ['Hi', '']), which inflated the
        # sentence count; keep only segments with visible content.
        segments = re.split(r'[.!?]+', paragraph)
        lines.extend(seg for seg in segments if seg.strip())
    return lines
def get_words(lines):
    """
    Collect the words occurring in the given sentences.

    :param lines: iterable of sentence strings
    :return:
        1. list of all words (with repeats, in order of appearance)
        2. list of the distinct words
    """
    all_words = []
    for sentence in lines:
        all_words += clean_string(sentence)
    return all_words, list(set(all_words))
def get_top_words(words, top_n):
    """
    Return the ``top_n`` most frequent non-stopword words.

    :param words: list of words
    :param top_n: how many of the most frequent words to return
    :return: list of (word, frequency) pairs, most frequent first
    """
    excluded = set(get_stop_words())
    frequencies = Counter(w for w in words if w not in excluded)
    return frequencies.most_common(top_n)
def get_stop_words(filename='stopwords.txt'):
    """
    Read the stop-word list from a text file, one word per line.

    :param filename: path of the stop-word file; defaults to
        'stopwords.txt' so existing callers are unaffected
    :return: list of stopwords
    :raises FileNotFoundError: if the file does not exist
    """
    # Explicit encoding: open()'s default is platform-dependent and
    # could mis-decode non-ASCII stop words on some systems.
    with open(filename, "r", encoding="utf-8") as fp:
        return [line.rstrip('\n') for line in fp]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
def get_content_from_url(url, timeout=10):
    """
    Fetch the raw HTML content of a web page.

    :param url: url of the webpage; 'http://' is prepended when no
        scheme is present
    :param timeout: seconds to wait for the server before giving up
        (default 10); backward-compatible addition — the original call
        had no timeout, so a stalled server would hang the GUI forever
    :return: HTML content of the page as bytes, or None on any error
    """
    try:
        # Normalise scheme-less input such as 'example.com'.
        url = url if url.startswith('http') else ('http://' + url)
        return requests.get(url, timeout=timeout).content
    except requests.ConnectionError as e:
        print("Connection Error: Could not connect to the server or not found.\n")
        print(str(e))
    except requests.Timeout as e:
        print("Timeout Error")
        print(str(e))
    except requests.RequestException as e:
        print("General Error")
        print(str(e))
    except KeyboardInterrupt:
        print("Keyboard interrupt")
def parse_html_by_tags(html, tag):
    """
    Parse an HTML document and collect the text of every matching tag.

    :param html: the html content that is to be parsed (may be None,
        e.g. when the download failed)
    :param tag: the tag name whose text content is wanted
    :return: list of text strings, one per matching element (empty list
        when ``html`` is falsy)
    """
    if not html:
        return []
    soup = BeautifulSoup(html, "html.parser")
    return [element.text for element in soup.find_all(tag)]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment