Last active
June 9, 2020 22:06
-
-
Save rileylev/ef3a893651793f6a22283d3bb800db6e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from hash_map import HashMap | |
""" | |
This is the regular expression used to capture words. It could probably be endlessly | |
tweaked to catch more words, but this provides a standard we can test against, so don't | |
modify it for your assignment submission. | |
""" | |
rgx = re.compile("(\w[\w']*\w|\w)") | |
def hash_function_2(key): | |
""" | |
This is a hash function that can be used for the hashmap. | |
""" | |
var_hash = 0 | |
var_index = 0 | |
for i in key: | |
var_hash = var_hash + (var_index + 1) * ord(i) | |
var_index = var_index + 1 | |
return var_hash | |
def top_words(source, number): | |
""" | |
Takes a plain text file and counts the number of occurrences of case insensitive words. | |
Returns the top `number` of words in a list of tuples of the form (word, count). | |
Args: | |
source: the file name containing the text | |
number: the number of top results to return (e.g. 5 would return the 5 most common words) | |
Returns: | |
A list of tuples of the form (word, count), sorted by most common word. (e.g. [("a", 23), ("the", 20), ("it", 10)]) | |
""" | |
keys = set() | |
ht = HashMap(2500, hash_function_2) | |
# This block of code will read a file one word as a time and | |
# put the word in `w`. It should be left as starter code. | |
with open(source) as f: | |
for line in f: | |
words = rgx.findall(line) | |
for w in words: | |
# WRITTEN | |
w = w.lower() | |
if ht.contains_key(w): | |
assert(w in keys) | |
ht.put(w, ht.get(w)) | |
else: | |
keys.add(w) | |
ht.put(w, 1) | |
sorted_words = sorted(keys, reverse=True, | |
key= lambda word: ht.get(word)) | |
high_scores = sorted_words[:number] | |
print(top_words("alice.txt", 10)) # COMMENT THIS OUT WHEN SUBMITTING TO GRADESCOPE |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment