Last active
June 22, 2021 07:41
-
-
Save cjoshmartin/a21a4f957d5032773f99525ad6c90dfa to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download this file and make sure Python 3.8 or newer is installed.
# The input files 'alice_in_wonderland.txt' and '1-1000.txt' must be in the working directory.
# Run with: python main.py
from pathlib import Path | |
import collections | |
import re | |
def top_n_words(input_file: str, common_words: str, n: int) -> None:
    """Print the ``n`` most frequent words in a text, ignoring common words.

    The text is split on whitespace, each token is lower-cased and stripped
    of leading/trailing punctuation, and tokens found in the common-word
    list are skipped. The remaining words are counted and printed as a
    two-column table, highest count first. Words with equal counts appear in
    the order they were first seen in the text.

    Args:
        input_file: Path of the text file to analyse.
        common_words: Path of a file listing one common word per line;
            these words are excluded from the count.
        n: Maximum number of rows to print. Zero or a negative value prints
            only the table header.

    Returns:
        None. The table is written to standard output.

    Raises:
        OSError: If either file cannot be read.
    """
    # Normalise case and surrounding whitespace so the entries match the
    # lower-cased tokens looked up below; blank lines are ignored.
    common_words_data = {
        line.strip().lower()
        for line in Path(common_words).read_text().splitlines()
        if line.strip()
    }

    # Matches punctuation/underscores at either end of a token, so that
    # "cat," and "(cat" are counted together with "cat".
    edge_punctuation = re.compile(r"^[\W_]+|[\W_]+$")

    word_counts = collections.Counter()
    # split() without arguments splits on any run of whitespace, so words on
    # adjacent lines stay separate and no empty tokens are produced.
    for token in Path(input_file).read_text().split():
        word = edge_punctuation.sub("", token).lower()
        # A token made up solely of punctuation collapses to "" - skip it.
        if word and word not in common_words_data:
            word_counts[word] += 1

    print(f"{'Count':<6} {'Word':<6}")
    print(f"{'===':<6} {'===':<6}")
    # most_common() orders by descending count, which is what makes the
    # output the *top* n rather than the first n words encountered.
    for word, count in word_counts.most_common(n):
        print(f"{count:<6} {word:<6}")
if __name__ == "__main__":
    # Script entry point: report the five most frequent words of the sample
    # text that do not appear in the common-word list.
    top_n_words(
        input_file='alice_in_wonderland.txt',
        common_words='1-1000.txt',
        n=5,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment