Created
March 8, 2016 06:07
-
-
Save MarksCode/ce2da23d834e35655a20 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
######## Begin code which needs to be modified ########## | |
# Implementation 1 | |
class MyChainDict(object): | |
def __init__(self): | |
self.dictSize = 10000 | |
self.mydict = [] | |
for i in range(self.dictSize): | |
self.mydict.append([]) | |
return | |
def insert(self, key, value=None): | |
bucketNum = self.hashFunc(key) | |
doesContain = self.contains(key) | |
if not doesContain: | |
self.mydict[bucketNum].append([key, value]) | |
return | |
for x in self.mydict[bucketNum]: | |
if (key in x): | |
x[1] = value | |
return | |
def contains(self, key): | |
bucketNum = self.hashFunc(key); | |
doesContain = False | |
for x in self.mydict[bucketNum]: | |
if (key in x): | |
doesContain = True | |
return (doesContain) | |
def value(self, key): | |
bucketNum = self.hashFunc(key); | |
doesContain = self.contains(key) | |
if not doesContain: | |
return False | |
else: | |
for x in self.mydict[bucketNum]: | |
if (key in x): | |
return x[1] | |
def delete(self, key): | |
bucketNum = self.hashFunc(key) | |
index = 0 | |
for x in self.mydict[bucketNum]: | |
if (key == x[0]): | |
self.mydict[bucketNum].pop(index) | |
else: | |
index = index + 1 | |
return | |
def get_key_values(self): | |
myArray = [] | |
for x in self.mydict: | |
for y in x: | |
myArray.append((y[0], y[1])) | |
return myArray | |
def dump(self): | |
for i in self.mydict: | |
for x in i: | |
print (x) | |
def hashFunc(self, key): | |
return hash(key)%self.dictSize; | |
# Implementation 2 | |
class MyOpenLinearDict(object): | |
def __init__(self): | |
self.dictSize = 500000 | |
self.mydict = [] | |
for i in range(self.dictSize): | |
self.mydict.append(["q", -1]) | |
return | |
def insert(self, key, value=None): | |
iterations = 0 | |
bucketNum = self.hashFunc(key) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q" and iterations <= self.dictSize-1): | |
bucketNum = bucketNum + 1 | |
iterations = iterations + 1 | |
if (bucketNum > self.dictSize-1): | |
bucketNum = 0 | |
self.mydict[bucketNum] = [key, value] | |
def contains(self, key): | |
iterations = 0 | |
bucketNum = self.hashFunc(key) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q" and iterations <= self.dictSize-1): | |
bucketNum = bucketNum + 1 | |
iterations = iterations + 1 | |
if (bucketNum > self.dictSize-1): | |
bucketNum = 0 | |
if (self.mydict[bucketNum][0] == key): | |
return True | |
else: | |
return False | |
def value(self, key): | |
bucketNum = self.hashFunc(key) | |
iterations = 0 | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q" and iterations <= self.dictSize-1): | |
bucketNum = bucketNum + 1 | |
iterations = iterations + 1 | |
if (bucketNum > self.dictSize-1): | |
bucketNum = 0 | |
if (self.mydict[bucketNum][0] == key): | |
return self.mydict[bucketNum][1] | |
def delete(self, key): | |
bucketNum = self.hashFunc(key) | |
iterations = 0 | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q" and iterations <= self.dictSize-1): | |
bucketNum = bucketNum + 1 | |
iterations = iterations + 1 | |
if (bucketNum > self.dictSize-1): | |
bucketNum = 0 | |
if (self.mydict[bucketNum][0] == key): | |
self.mydict[bucketNum] = ["Deleted", -1] | |
return | |
def get_key_values(self): | |
myArray = [] | |
for x in self.mydict: | |
if (x[0] != "q"): | |
myArray.append((x[0], x[1])) | |
return myArray | |
def dump(self): | |
for x in self.mydict: | |
print (x[0], x[1]) | |
def hashFunc(self, key): | |
return hash(key)%self.dictSize | |
# Implementation 3 | |
class MyOpenQuadDict(object): | |
def __init__(self): | |
self.dictSize = 500000 | |
self.mydict = [] | |
for i in range(self.dictSize): | |
self.mydict.append(["q", -1]) | |
return | |
def insert(self, key, value=None): | |
iterations = 0 | |
bucketNum = self.hashFunc(key, iterations) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q"): | |
iterations = iterations + 1 | |
bucketNum = self.hashFunc(key, iterations) | |
self.mydict[bucketNum] = [key, value] | |
def contains(self, key): | |
iterations = 0 | |
bucketNum = self.hashFunc(key, iterations) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q"): | |
iterations = iterations + 1 | |
bucketNum = self.hashFunc(key, iterations) | |
if (self.mydict[bucketNum][0] == key): | |
return True | |
else: | |
return False | |
def value(self, key): | |
iterations = 0 | |
bucketNum = self.hashFunc(key, iterations) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q"): | |
iterations = iterations + 1 | |
bucketNum = self.hashFunc(key, iterations) | |
if (self.mydict[bucketNum][0] == key): | |
return self.mydict[bucketNum][1] | |
def delete(self, key): | |
iterations = 0 | |
bucketNum = self.hashFunc(key, iterations) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] != "q"): | |
iterations = iterations + 1 | |
bucketNum = self.hashFunc(key, iterations) | |
if (self.mydict[bucketNum][0] == key): | |
self.mydict[bucketNum] = ["Deleted", -1] | |
return | |
def get_key_values(self): | |
myArray = [] | |
for x in self.mydict: | |
if (x[0] != "q"): | |
myArray.append((x[0], x[1])) | |
return myArray | |
def dump(self): | |
for x in self.mydict: | |
if (x[0] != "q"): | |
print ((x[0], x[1])) | |
def hashFunc(self, key, iteration): | |
return (hash(key)+10*iteration+10*iteration^2)%self.dictSize | |
class MySet(object): | |
def __init__(self): | |
self.myset = MyOpenQuadDict() | |
return | |
def insert(self, key): | |
self.myset.insert(key) | |
return | |
def contains(self, key): | |
return self.myset.contains(key) | |
def dump(self): | |
self.myset.dump() | |
######## End code which needs to be modified ########## | |
# Store the set of stop words in a set object | |
stop_words_file = "stopwords.txt" | |
stop_words = MySet() | |
with open(stop_words_file) as f: | |
for l in f: | |
stop_words.insert(l.strip()) | |
# Download file from https://snap.stanford.edu/data/finefoods.txt.gz | |
# Remember to gunzip before using | |
review_file = "foods.txt" | |
review_words = [] | |
for i in range(5): | |
review_words.append(MyOpenQuadDict()) | |
with open(review_file, encoding='LATIN-1') as f: | |
lines = f.readlines() | |
idx = 0 | |
num_lines = len(lines) - 2 | |
while idx < num_lines: | |
while not lines[idx].startswith("review/score"): | |
idx = idx + 1 | |
# Jump through hoops to satisfy the Unicode encodings | |
l = lines[idx].encode('ascii', 'ignore').decode('ascii') | |
l = l.strip().split(":")[1].strip() | |
# Extract rating | |
rating = int(float(l)) | |
while not lines[idx].startswith("review/text"): | |
idx = idx + 1 | |
l = lines[idx].encode('ascii', 'ignore').decode('ascii').strip().lower() | |
text = l.split(":")[1].strip() | |
text = text.translate(str.maketrans("","", string.punctuation)) | |
# Extract words in the text | |
words = text.split(" ") | |
# words = [x.strip(string.punctuation) for x in text.split(" ")] | |
for w in words: | |
if len(w) and not stop_words.contains(w): | |
if review_words[rating - 1].contains(w): | |
count = review_words[rating - 1].value(w) + 1 | |
else: | |
count = 1 | |
review_words[rating - 1].insert(w, count) | |
# Print out the top words for each of the five ratings | |
for i in range(5): | |
freq_words = sorted(review_words[i].get_key_values(), key=lambda tup: tup[1], reverse=True) | |
print(i+1, freq_words[0:20]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment