neverstew · August 14, 2023 19:33
diff --git a/main.py b/main.py
 from collections import Counter
 import re
 import nltk
 from nltk.corpus import stopwords
 import re

 import ssl

 # Import-time setup: swap the ssl module's default HTTPS context factory for
 # the unverified one (when this Python build exposes it), then fetch the NLTK
 # stop-word corpus.
 # NOTE(review): this turns off certificate verification for every HTTPS
 # request made through the default context in this process, not only for the
 # download below. Presumably a workaround for certificate errors during
 # nltk.download -- confirm it is still needed.
 try:
    # Private ssl helper; its absence surfaces as AttributeError.
    _create_unverified_https_context = ssl._create_unverified_context
 except AttributeError:
    # Helper not available: leave the ssl defaults untouched.
    pass
 else:
    ssl._create_default_https_context = _create_unverified_https_context
 # Runs on every import/execution of this module.
 nltk.download('stopwords')

 def load_text_file(file_path):
    """Read a UTF-8 text file and strip chat-export noise from it.

    The patterns look like they target a WhatsApp-style chat export
    (TODO confirm against the real corpus). Three things are removed:

    * a leading ``DD/MM/YYYY, HH:MM - Sender:`` prefix on any line, where the
      sender consists of word characters, spaces and hyphens;
    * the literal placeholder ``<Media omitted>``;
    * links, with or without an ``http(s)://`` scheme, including any port,
      path, query string or fragment that follows the host name.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file content as one string with the spans above deleted.
        Surrounding whitespace and newlines are left in place.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # re.M makes ``^`` match at the start of every line, not just the file.
    text = re.sub(r"^\d{2}\/\d{2}\/\d{4}, \d{2}:\d{2} - [\w -]+:", "", text, flags=re.M)
    text = re.sub(r"<Media omitted>", "", text)
    # The trailing optional groups swallow ":port" and "/path?query#fragment".
    # Without them only the host name was removed and the rest of the URL
    # leaked into the text, where it would be counted as ordinary words.
    text = re.sub(
        r"(?:https?://)?(?:[\w]+\.)(?:\.?[\w]{2,})+(?::\d+)?(?:[/?#]\S*)?",
        "",
        text,
    )
    return text

 def find_most_common_words(text, n):
    """Return the ``n`` most frequent words of ``text``, skipping stop words.

    The text is lower-cased and split into runs of word characters; every
    token found in NLTK's English stop-word list is dropped before counting.

    Args:
        text: The string to analyse.
        n: How many entries to return (passed to ``Counter.most_common``).

    Returns:
        A list of ``(word, count)`` tuples, most frequent first.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    # A set gives constant-time membership tests while filtering.
    ignored = set(stopwords.words('english'))
    tally = Counter(token for token in tokens if token not in ignored)
    return tally.most_common(n)

 if __name__ == "__main__":
    # Script entry point: clean "corpus.txt" and print its most frequent
    # non-stop-words, one "word: count" pair per line.
    # One value drives both the query and the heading so they cannot drift.
    top_n = 50
    text = load_text_file("corpus.txt")
    common_words = find_most_common_words(text, top_n)

    print(f"\nTop {top_n} most common words (excluding stop words):")
    for word, count in common_words:
        print(f"{word}: {count}")
	from collections import Counter
	import re
	import nltk
	from nltk.corpus import stopwords
	import re

	import ssl

	# Import-time setup: swap the ssl module's default HTTPS context factory for
	# the unverified one (when this Python build exposes it), then fetch the NLTK
	# stop-word corpus. The block structure of the try/except/else is restored
	# here; with every line at the same indent it was not valid Python.
	# NOTE(review): this turns off certificate verification for every HTTPS
	# request made through the default context in this process, not only for the
	# download below. Presumably a workaround for certificate errors during
	# nltk.download -- confirm it is still needed.
	try:
	    # Private ssl helper; its absence surfaces as AttributeError.
	    _create_unverified_https_context = ssl._create_unverified_context
	except AttributeError:
	    # Helper not available: leave the ssl defaults untouched.
	    pass
	else:
	    ssl._create_default_https_context = _create_unverified_https_context
	# Runs on every import/execution of this module.
	nltk.download('stopwords')

	def load_text_file(file_path):
	    """Read a UTF-8 text file and strip chat-export noise from it.

	    The patterns look like they target a WhatsApp-style chat export
	    (TODO confirm against the real corpus). Three things are removed:

	    * a leading ``DD/MM/YYYY, HH:MM - Sender:`` prefix on any line, where
	      the sender consists of word characters, spaces and hyphens;
	    * the literal placeholder ``<Media omitted>``;
	    * links, with or without an ``http(s)://`` scheme, including any port,
	      path, query string or fragment that follows the host name.

	    Args:
	        file_path: Path of the file to read.

	    Returns:
	        The file content as one string with the spans above deleted.
	        Surrounding whitespace and newlines are left in place.

	    Raises:
	        OSError: If the file cannot be opened.
	        UnicodeDecodeError: If the file is not valid UTF-8.
	    """
	    with open(file_path, 'r', encoding='utf-8') as file:
	        text = file.read()
	    # re.M makes ``^`` match at the start of every line, not just the file.
	    text = re.sub(r"^\d{2}\/\d{2}\/\d{4}, \d{2}:\d{2} - [\w -]+:", "", text, flags=re.M)
	    text = re.sub(r"<Media omitted>", "", text)
	    # The trailing optional groups swallow ":port" and "/path?query#fragment".
	    # Without them only the host name was removed and the rest of the URL
	    # leaked into the text, where it would be counted as ordinary words.
	    text = re.sub(
	        r"(?:https?://)?(?:[\w]+\.)(?:\.?[\w]{2,})+(?::\d+)?(?:[/?#]\S*)?",
	        "",
	        text,
	    )
	    return text

	def find_most_common_words(text, n):
	    """Return the ``n`` most frequent words of ``text``, skipping stop words.

	    The text is lower-cased and split into runs of word characters; every
	    token found in NLTK's English stop-word list is dropped before counting.

	    Args:
	        text: The string to analyse.
	        n: How many entries to return (passed to ``Counter.most_common``).

	    Returns:
	        A list of ``(word, count)`` tuples, most frequent first.
	    """
	    words = re.findall(r'\b\w+\b', text.lower())
	    # A set gives constant-time membership tests while filtering.
	    stop_words = set(stopwords.words('english'))
	    word_counts = Counter(word for word in words if word not in stop_words)
	    return word_counts.most_common(n)

	if __name__ == "__main__":
	    # Script entry point: clean "corpus.txt" and print its most frequent
	    # non-stop-words, one "word: count" pair per line.
	    # One value drives both the query and the heading so they cannot drift.
	    top_n = 50
	    text = load_text_file("corpus.txt")
	    common_words = find_most_common_words(text, top_n)

	    print(f"\nTop {top_n} most common words (excluding stop words):")
	    for word, count in common_words:
	        print(f"{word}: {count}")