Created
March 11, 2016 01:40
-
-
Save MarksCode/322d032b6530e2563e26 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string | |
import numpy as np | |
######## Begin code which needs to be modified ########## | |
# Implementation 1 | |
class MyChainDict(object): | |
def __init__(self): | |
self.dictSize = 1000 | |
self.mydict = [] | |
for i in range(self.dictSize): | |
self.mydict.append([]) | |
self.itemsInserted = 0 | |
return | |
def insert(self, key, value=None): | |
if (self.itemsInserted > 0.5*self.dictSize): | |
self.rehash() | |
bucketNum = self.hashFunc(key) | |
doesContain = self.contains(key) | |
if not doesContain: | |
self.mydict[bucketNum].append([key, value]) | |
self.itemsInserted = self.itemsInserted + 1 | |
return | |
else: | |
for x in self.mydict[bucketNum]: | |
if (key in x): | |
x[1] = value | |
return | |
def rehash(self): | |
myList = [] | |
for i in range(self.dictSize): | |
for y in self.mydict[i]: | |
myList.append(y) | |
self.mydict[i][:] = [] | |
for y in range(self.dictSize): | |
self.mydict.append([]) | |
self.itemsInserted = 0 | |
self.dictSize = self.dictSize * 2 | |
for x in myList: | |
self.insert(x[0], x[1]) | |
def contains(self, key): | |
bucketNum = self.hashFunc(key); | |
doesContain = False | |
for x in self.mydict[bucketNum]: | |
if (key in x): | |
doesContain = True | |
return (doesContain) | |
def value(self, key): | |
bucketNum = self.hashFunc(key); | |
doesContain = self.contains(key) | |
if not doesContain: | |
return False | |
else: | |
for x in self.mydict[bucketNum]: | |
if (key in x): | |
return x[1] | |
def delete(self, key): | |
bucketNum = self.hashFunc(key) | |
index = 0 | |
for x in self.mydict[bucketNum]: | |
if (key == x[0]): | |
self.mydict[bucketNum].pop(index) | |
else: | |
index = index + 1 | |
return | |
def get_key_values(self): | |
myArray = [] | |
for x in self.mydict: | |
for y in x: | |
myArray.append((y[0], y[1])) | |
return myArray | |
def dump(self): | |
for i in self.mydict: | |
for x in i: | |
print (x) | |
def hashFunc(self, key): | |
return hash(key)%self.dictSize; | |
import numpy as np | |
# Implementation 2 | |
class MyOpenLinearDict(object): | |
def __init__(self): | |
self.dictSize = 50000 | |
self.mydict = np.empty((self.dictSize,2), dtype=object) | |
self.elementsInserted = 0 | |
return | |
def insert(self, key, value=None): | |
if (self.elementsInserted > 0.5*self.dictSize): | |
self.rehash() | |
bucketNum = self.hashFunc(key) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None): | |
bucketNum = bucketNum + 1 | |
if (bucketNum > self.dictSize-1): | |
bucketNum = 0 | |
self.mydict[bucketNum][0] = key | |
self.mydict[bucketNum][1] = value | |
self.elementsInserted = self.elementsInserted + 1 | |
def rehash(self): | |
myArray = np.empty((self.elementsInserted,2), dtype=object) | |
elems = 0 | |
for i in range(self.dictSize): | |
if (self.mydict[i][0] is not None and self.mydict[i][0] != "Deletedq"): | |
myArray[elems][0] = self.mydict[i][0] | |
myArray[elems][1] = self.mydict[i][1] | |
elems += 1 | |
newArray = np.empty((self.dictSize*2,2), dtype=object) | |
self.dictSize = 2*self.dictSize | |
self.mydict = newArray | |
self.elementsInserted = 0 | |
for y in range(elems): | |
self.insert(myArray[y][0], myArray[y][1]) | |
def contains(self, key): | |
iterations = 0 | |
bucketNum = self.hashFunc(key) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None): | |
bucketNum = bucketNum + 1 | |
iterations = iterations + 1 | |
if (bucketNum > self.dictSize-1): | |
bucketNum = 0 | |
if (self.mydict[bucketNum][0] == key): | |
return True | |
else: | |
return False | |
def value(self, key): | |
bucketNum = self.hashFunc(key) | |
iterations = 0 | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None): | |
bucketNum = bucketNum + 1 | |
iterations = iterations + 1 | |
if (bucketNum > self.dictSize-1): | |
bucketNum = 0 | |
if (self.mydict[bucketNum][0] == key): | |
return self.mydict[bucketNum][1] | |
def delete(self, key): | |
bucketNum = self.hashFunc(key) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None): | |
bucketNum = bucketNum + 1 | |
if (bucketNum > self.dictSize-1): | |
bucketNum = 0 | |
if (self.mydict[bucketNum][0] == key): | |
self.mydict[bucketNum] = ["Deletedq", -1] | |
return | |
def get_key_values(self): | |
myArray = [] | |
for x in range(self.dictSize): | |
if (self.mydict[x][0] is not None and self.mydict[x][0] != "Deletedq"): | |
myArray.append((self.mydict[x][0], self.mydict[x][1])) | |
return myArray | |
def dump(self): | |
for x in self.mydict: | |
if (x[0] is not None): | |
print (x[0], x[1]) | |
def hashFunc(self, key): | |
return hash(key)%self.dictSize | |
# Implementation 3 | |
class MyOpenQuadDict(object): | |
def __init__(self): | |
self.dictSize = 50000 | |
self.mydict = np.empty((self.dictSize,2), dtype=object) | |
self.elementsInserted = 0 | |
return | |
def insert(self, key, value=None): | |
iteration = 0 | |
if (self.elementsInserted > 0.5*self.dictSize): | |
self.rehash() | |
bucketNum = self.hashFunc(key, iteration) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None): | |
iteration = iteration + 1 | |
bucketNum = self.hashFunc(key, iteration) | |
self.mydict[bucketNum][0] = key | |
self.mydict[bucketNum][1] = value | |
self.elementsInserted = self.elementsInserted + 1 | |
def rehash(self): | |
myArray = np.empty((self.elementsInserted,2), dtype=object) | |
elems = 0 | |
for i in range(self.dictSize): | |
if (self.mydict[i][0] is not None and self.mydict[i][0] != "Deletedq"): | |
myArray[elems][0] = self.mydict[i][0] | |
myArray[elems][1] = self.mydict[i][1] | |
elems += 1 | |
newArray = np.empty((self.dictSize*2,2), dtype=object) | |
self.dictSize = 2*self.dictSize | |
self.mydict = newArray | |
self.elementsInserted = 0 | |
for y in range(elems): | |
self.insert(myArray[y][0], myArray[y][1]) | |
def contains(self, key): | |
iterations = 0 | |
bucketNum = self.hashFunc(key, iterations) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None): | |
iterations = iterations + 1 | |
bucketNum = self.hashFunc(key, iterations) | |
if (self.mydict[bucketNum][0] == key): | |
return True | |
else: | |
return False | |
def value(self, key): | |
iterations = 0 | |
bucketNum = self.hashFunc(key, iterations) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None): | |
iterations = iterations + 1 | |
bucketNum = self.hashFunc(key, iterations) | |
if (self.mydict[bucketNum][0] == key): | |
return self.mydict[bucketNum][1] | |
def delete(self, key): | |
iteration = 0 | |
bucketNum = self.hashFunc(key, iteration) | |
while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None): | |
iteration = iteration + 1 | |
bucketNum = self.hashFunc(key, iteration) | |
if (self.mydict[bucketNum][0] == key): | |
self.mydict[bucketNum][0] = "Deletedq" | |
self.mydict[bucketNum][1] = -1 | |
self.elementsInserted = self.elementsInserted - 1 | |
return | |
def get_key_values(self): | |
myArray = [] | |
for x in range(self.dictSize): | |
if (self.mydict[x][0] is not None and self.mydict[x][0] != "Deletedq"): | |
myArray.append((self.mydict[x][0], self.mydict[x][1])) | |
return myArray | |
def dump(self): | |
for x in self.mydict: | |
if (x[0] is not None): | |
print (x[0], int(x[1])) | |
def hashFunc(self, key, iteration): | |
return (hash(key)%self.dictSize + 5*iteration + 5*iteration**2)%self.dictSize | |
class MySet(object): | |
def __init__(self): | |
self.myset = MyChainDict() | |
return | |
def insert(self, key): | |
self.myset.insert(key) | |
return | |
def contains(self, key): | |
return self.myset.contains(key) | |
def dump(self): | |
self.myset.dump() | |
######## End code which needs to be modified ########## | |
# Store the set of stop words in a set object | |
stop_words_file = "stopwords.txt" | |
stop_words = MySet() | |
with open(stop_words_file) as f: | |
for l in f: | |
stop_words.insert(l.strip()) | |
# Download file from https://snap.stanford.edu/data/finefoods.txt.gz | |
# Remember to gunzip before using | |
review_file = "foods.txt" | |
review_words = [] | |
for i in range(5): | |
review_words.append(MyChainDict()) | |
with open(review_file, encoding='LATIN-1') as f: | |
lines = f.readlines() | |
idx = 0 | |
num_lines = len(lines) - 2 | |
while idx < num_lines: | |
while not lines[idx].startswith("review/score"): | |
idx = idx + 1 | |
# Jump through hoops to satisfy the Unicode encodings | |
l = lines[idx].encode('ascii', 'ignore').decode('ascii') | |
l = l.strip().split(":")[1].strip() | |
# Extract rating | |
rating = int(float(l)) | |
while not lines[idx].startswith("review/text"): | |
idx = idx + 1 | |
l = lines[idx].encode('ascii', 'ignore').decode('ascii').strip().lower() | |
text = l.split(":")[1].strip() | |
text = text.translate(str.maketrans("","", string.punctuation)) | |
# Extract words in the text | |
words = text.split(" ") | |
# words = [x.strip(string.punctuation) for x in text.split(" ")] | |
for w in words: | |
if len(w) and not stop_words.contains(w): | |
if review_words[rating - 1].contains(w): | |
count = review_words[rating - 1].value(w) + 1 | |
else: | |
count = 1 | |
review_words[rating - 1].insert(w, count) | |
# Print out the top words for each of the five ratings | |
for i in range(5): | |
freq_words = sorted(review_words[i].get_key_values(), key=lambda tup: tup[1], reverse=True) | |
print(i+1, freq_words[0:20]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment