MarksCode · March 11, 2016 01:40
diff --git a/stuff.py b/stuff.py
 import string
 import numpy as np
 ######## Begin code which needs to be modified ##########

 # Implementation 1
 class MyChainDict(object):
 	def __init__(self):
 		self.dictSize = 1000
 		self.mydict = []
 		for i in range(self.dictSize):
 			self.mydict.append([])
 		self.itemsInserted = 0
 		return
 	
 	def insert(self, key, value=None):
 		if (self.itemsInserted > 0.5*self.dictSize):
 			self.rehash()
 		bucketNum = self.hashFunc(key)
 		doesContain = self.contains(key)
 		if not doesContain:
 			self.mydict[bucketNum].append([key, value])
 			self.itemsInserted = self.itemsInserted + 1
 			return
 		else: 
 			for x in self.mydict[bucketNum]:
 				if (key in x):
 					x[1] = value
 					return
 					
 	def rehash(self):
 		myList = []
 		for i in range(self.dictSize):
 			for y in self.mydict[i]:
 				myList.append(y)
 			self.mydict[i][:] = []
 		for y in range(self.dictSize):
 			self.mydict.append([])
 		self.itemsInserted = 0
 		self.dictSize = self.dictSize * 2
 		for x in myList:
 			self.insert(x[0], x[1])
 		

 	def contains(self, key):
 		bucketNum = self.hashFunc(key);
 		doesContain = False
 		for x in self.mydict[bucketNum]:
 			if (key in x):
 				doesContain = True
 		return (doesContain)

 	def value(self, key):
 		bucketNum = self.hashFunc(key);
 		doesContain = self.contains(key)
 		if not doesContain:
 			return False
 		else:
 			for x in self.mydict[bucketNum]:
 				if (key in x):
 					return x[1]
 			
 	def delete(self, key):
 		bucketNum = self.hashFunc(key)
 		index = 0
 		for x in self.mydict[bucketNum]:
 			if (key == x[0]):
 				self.mydict[bucketNum].pop(index)
 			else:
 				index = index + 1
 		return

 	def get_key_values(self):
 		myArray = []
 		for x in self.mydict:
 				for y in x:
 					myArray.append((y[0], y[1]))
 		return myArray
 	
 	def dump(self):
 		for i in self.mydict:
 			for x in i:
 				print (x)
 		
 			
 	def hashFunc(self, key):
 		return hash(key)%self.dictSize;
 			
 import numpy as np

 # Implementation 2
 class MyOpenLinearDict(object):
 	def __init__(self):
 		self.dictSize = 50000
 		self.mydict = np.empty((self.dictSize,2), dtype=object)
 		self.elementsInserted = 0
 		return
 	
 	def insert(self, key, value=None):
 		if (self.elementsInserted > 0.5*self.dictSize):
 			self.rehash()
 		bucketNum = self.hashFunc(key)
 		while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
 			bucketNum = bucketNum + 1
 			if (bucketNum > self.dictSize-1):
 				bucketNum = 0
 		self.mydict[bucketNum][0] = key
 		self.mydict[bucketNum][1] = value
 		self.elementsInserted = self.elementsInserted + 1
 	
 	def rehash(self):
 		myArray = np.empty((self.elementsInserted,2), dtype=object)
 		elems = 0
 		for i in range(self.dictSize):
 			if (self.mydict[i][0] is not None and self.mydict[i][0] != "Deletedq"):
 				myArray[elems][0] = self.mydict[i][0] 
 				myArray[elems][1] = self.mydict[i][1]
 				elems += 1
 		newArray = np.empty((self.dictSize*2,2), dtype=object)
 		self.dictSize = 2*self.dictSize
 		self.mydict = newArray
 		self.elementsInserted = 0
 		for y in range(elems):
 			self.insert(myArray[y][0], myArray[y][1])

 	def contains(self, key):
 		iterations = 0
 		bucketNum = self.hashFunc(key)
 		while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
 			bucketNum = bucketNum + 1
 			iterations = iterations + 1
 			if (bucketNum > self.dictSize-1):
 				bucketNum = 0
 		if (self.mydict[bucketNum][0] == key):
 			return True
 		else:
 			return False

 	def value(self, key):
 		bucketNum = self.hashFunc(key)
 		iterations = 0
 		while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
 			bucketNum = bucketNum + 1
 			iterations = iterations + 1
 			if (bucketNum > self.dictSize-1):
 				bucketNum = 0
 		if (self.mydict[bucketNum][0] == key):
 			return self.mydict[bucketNum][1]

 	def delete(self, key):
 		bucketNum = self.hashFunc(key)
 		while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
 			bucketNum = bucketNum + 1
 			if (bucketNum > self.dictSize-1):
 				bucketNum = 0
 		if (self.mydict[bucketNum][0] == key):
 			self.mydict[bucketNum] = ["Deletedq", -1]
 		return

 	def get_key_values(self):
 		myArray = []
 		for x in range(self.dictSize):
 			if (self.mydict[x][0] is not None and self.mydict[x][0] != "Deletedq"):
 				myArray.append((self.mydict[x][0], self.mydict[x][1]))
 		return myArray
 	
 	def dump(self):
 		for x in self.mydict:
 			if (x[0] is not None):
 				print (x[0], x[1])
 	
 	def hashFunc(self, key):
 		return hash(key)%self.dictSize
 	
 # Implementation 3
 class MyOpenQuadDict(object):
 	def __init__(self):
 		self.dictSize = 50000
 		self.mydict = np.empty((self.dictSize,2), dtype=object)
 		self.elementsInserted = 0
 		return
 		
 	def insert(self, key, value=None):
 		iteration = 0
 		if (self.elementsInserted > 0.5*self.dictSize):
 			self.rehash()
 		bucketNum = self.hashFunc(key, iteration)
 		while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
 			iteration = iteration + 1
 			bucketNum = self.hashFunc(key, iteration)
 		self.mydict[bucketNum][0] = key
 		self.mydict[bucketNum][1] = value
 		self.elementsInserted = self.elementsInserted + 1
 		
 	def rehash(self):
 		myArray = np.empty((self.elementsInserted,2), dtype=object)
 		elems = 0
 		for i in range(self.dictSize):
 			if (self.mydict[i][0] is not None and self.mydict[i][0] != "Deletedq"):
 				myArray[elems][0] = self.mydict[i][0] 
 				myArray[elems][1] = self.mydict[i][1]
 				elems += 1
 		newArray = np.empty((self.dictSize*2,2), dtype=object)
 		self.dictSize = 2*self.dictSize
 		self.mydict = newArray
 		self.elementsInserted = 0
 		for y in range(elems):
 			self.insert(myArray[y][0], myArray[y][1])

 	def contains(self, key):
 		iterations = 0
 		bucketNum = self.hashFunc(key, iterations)
 		while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
 			iterations = iterations + 1
 			bucketNum = self.hashFunc(key, iterations)
 		if (self.mydict[bucketNum][0] == key):
 			return True
 		else:
 			return False

 	def value(self, key):
 		iterations = 0
 		bucketNum = self.hashFunc(key, iterations)
 		while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
 			iterations = iterations + 1
 			bucketNum = self.hashFunc(key, iterations)
 		if (self.mydict[bucketNum][0] == key):
 			return self.mydict[bucketNum][1]

 	def delete(self, key):
 		iteration = 0
 		bucketNum = self.hashFunc(key, iteration)
 		while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
 			iteration = iteration + 1
 			bucketNum = self.hashFunc(key, iteration)
 		if (self.mydict[bucketNum][0] == key):
 			self.mydict[bucketNum][0] = "Deletedq"
 			self.mydict[bucketNum][1] = -1
 			self.elementsInserted = self.elementsInserted - 1
 		return

 	def get_key_values(self):
 		myArray = []
 		for x in range(self.dictSize):
 			if (self.mydict[x][0] is not None and self.mydict[x][0] != "Deletedq"):
 				myArray.append((self.mydict[x][0], self.mydict[x][1]))
 		return myArray
 		
 	def dump(self):
 		for x in self.mydict:
 			if (x[0] is not None):
 				print (x[0], int(x[1]))
 		
 	def hashFunc(self, key, iteration):
 		return (hash(key)%self.dictSize + 5*iteration + 5*iteration**2)%self.dictSize
 		
 			
 class MySet(object):
 	def __init__(self):
 		self.myset = MyChainDict()
 		return

 	def insert(self, key):
 		self.myset.insert(key)
 		return
 	
 	def contains(self, key):
 		return self.myset.contains(key)

 	def dump(self):
 		self.myset.dump()

 ######## End code which needs to be modified ##########


 # Store the set of stop words in a set object
 stop_words_file = "stopwords.txt"
 stop_words = MySet()

 with open(stop_words_file) as f:
 	for l in f:
 		stop_words.insert(l.strip())


 # Download file from https://snap.stanford.edu/data/finefoods.txt.gz        
 # Remember to gunzip before using
 review_file = "foods.txt"

 review_words = []
 for i in range(5):
 	review_words.append(MyChainDict())
 	
 with open(review_file, encoding='LATIN-1') as f:
 	lines = f.readlines()
 	idx = 0
 	num_lines = len(lines) - 2
 	while idx < num_lines:
 		while not lines[idx].startswith("review/score"):
 			idx = idx + 1

 		# Jump through hoops to satisfy the Unicode encodings 
 		l = lines[idx].encode('ascii', 'ignore').decode('ascii')
 		l = l.strip().split(":")[1].strip()

 		# Extract rating
 		rating = int(float(l))
 		while not lines[idx].startswith("review/text"):
 			idx = idx + 1
 			
 		l = lines[idx].encode('ascii', 'ignore').decode('ascii').strip().lower()
 		text = l.split(":")[1].strip()
 		text = text.translate(str.maketrans("","", string.punctuation))

 		# Extract words in the text 
 		words = text.split(" ")
 		# words = [x.strip(string.punctuation) for x in text.split(" ")]
 		for w in words:
 			if len(w) and not stop_words.contains(w):
 				if review_words[rating - 1].contains(w):
 					count = review_words[rating - 1].value(w) + 1
 				else:
 					count = 1
 				review_words[rating - 1].insert(w, count)

 # Print out the top words for each of the five ratings 

 for i in range(5):
 	freq_words = sorted(review_words[i].get_key_values(), key=lambda tup: tup[1], reverse=True)
 	print(i+1, freq_words[0:20])
	import string
	import numpy as np
	######## Begin code which needs to be modified ##########

	# Implementation 1
	class MyChainDict(object):
	def __init__(self):
	self.dictSize = 1000
	self.mydict = []
	for i in range(self.dictSize):
	self.mydict.append([])
	self.itemsInserted = 0
	return

	def insert(self, key, value=None):
	if (self.itemsInserted > 0.5*self.dictSize):
	self.rehash()
	bucketNum = self.hashFunc(key)
	doesContain = self.contains(key)
	if not doesContain:
	self.mydict[bucketNum].append([key, value])
	self.itemsInserted = self.itemsInserted + 1
	return
	else:
	for x in self.mydict[bucketNum]:
	if (key in x):
	x[1] = value
	return

	def rehash(self):
	myList = []
	for i in range(self.dictSize):
	for y in self.mydict[i]:
	myList.append(y)
	self.mydict[i][:] = []
	for y in range(self.dictSize):
	self.mydict.append([])
	self.itemsInserted = 0
	self.dictSize = self.dictSize * 2
	for x in myList:
	self.insert(x[0], x[1])


	def contains(self, key):
	bucketNum = self.hashFunc(key);
	doesContain = False
	for x in self.mydict[bucketNum]:
	if (key in x):
	doesContain = True
	return (doesContain)

	def value(self, key):
	bucketNum = self.hashFunc(key);
	doesContain = self.contains(key)
	if not doesContain:
	return False
	else:
	for x in self.mydict[bucketNum]:
	if (key in x):
	return x[1]

	def delete(self, key):
	bucketNum = self.hashFunc(key)
	index = 0
	for x in self.mydict[bucketNum]:
	if (key == x[0]):
	self.mydict[bucketNum].pop(index)
	else:
	index = index + 1
	return

	def get_key_values(self):
	myArray = []
	for x in self.mydict:
	for y in x:
	myArray.append((y[0], y[1]))
	return myArray

	def dump(self):
	for i in self.mydict:
	for x in i:
	print (x)


	def hashFunc(self, key):
	return hash(key)%self.dictSize;

	import numpy as np

	# Implementation 2
	class MyOpenLinearDict(object):
	def __init__(self):
	self.dictSize = 50000
	self.mydict = np.empty((self.dictSize,2), dtype=object)
	self.elementsInserted = 0
	return

	def insert(self, key, value=None):
	if (self.elementsInserted > 0.5*self.dictSize):
	self.rehash()
	bucketNum = self.hashFunc(key)
	while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
	bucketNum = bucketNum + 1
	if (bucketNum > self.dictSize-1):
	bucketNum = 0
	self.mydict[bucketNum][0] = key
	self.mydict[bucketNum][1] = value
	self.elementsInserted = self.elementsInserted + 1

	def rehash(self):
	myArray = np.empty((self.elementsInserted,2), dtype=object)
	elems = 0
	for i in range(self.dictSize):
	if (self.mydict[i][0] is not None and self.mydict[i][0] != "Deletedq"):
	myArray[elems][0] = self.mydict[i][0]
	myArray[elems][1] = self.mydict[i][1]
	elems += 1
	newArray = np.empty((self.dictSize*2,2), dtype=object)
	self.dictSize = 2*self.dictSize
	self.mydict = newArray
	self.elementsInserted = 0
	for y in range(elems):
	self.insert(myArray[y][0], myArray[y][1])

	def contains(self, key):
	iterations = 0
	bucketNum = self.hashFunc(key)
	while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
	bucketNum = bucketNum + 1
	iterations = iterations + 1
	if (bucketNum > self.dictSize-1):
	bucketNum = 0
	if (self.mydict[bucketNum][0] == key):
	return True
	else:
	return False

	def value(self, key):
	bucketNum = self.hashFunc(key)
	iterations = 0
	while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
	bucketNum = bucketNum + 1
	iterations = iterations + 1
	if (bucketNum > self.dictSize-1):
	bucketNum = 0
	if (self.mydict[bucketNum][0] == key):
	return self.mydict[bucketNum][1]

	def delete(self, key):
	bucketNum = self.hashFunc(key)
	while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
	bucketNum = bucketNum + 1
	if (bucketNum > self.dictSize-1):
	bucketNum = 0
	if (self.mydict[bucketNum][0] == key):
	self.mydict[bucketNum] = ["Deletedq", -1]
	return

	def get_key_values(self):
	myArray = []
	for x in range(self.dictSize):
	if (self.mydict[x][0] is not None and self.mydict[x][0] != "Deletedq"):
	myArray.append((self.mydict[x][0], self.mydict[x][1]))
	return myArray

	def dump(self):
	for x in self.mydict:
	if (x[0] is not None):
	print (x[0], x[1])

	def hashFunc(self, key):
	return hash(key)%self.dictSize

	# Implementation 3
	class MyOpenQuadDict(object):
	def __init__(self):
	self.dictSize = 50000
	self.mydict = np.empty((self.dictSize,2), dtype=object)
	self.elementsInserted = 0
	return

	def insert(self, key, value=None):
	iteration = 0
	if (self.elementsInserted > 0.5*self.dictSize):
	self.rehash()
	bucketNum = self.hashFunc(key, iteration)
	while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
	iteration = iteration + 1
	bucketNum = self.hashFunc(key, iteration)
	self.mydict[bucketNum][0] = key
	self.mydict[bucketNum][1] = value
	self.elementsInserted = self.elementsInserted + 1

	def rehash(self):
	myArray = np.empty((self.elementsInserted,2), dtype=object)
	elems = 0
	for i in range(self.dictSize):
	if (self.mydict[i][0] is not None and self.mydict[i][0] != "Deletedq"):
	myArray[elems][0] = self.mydict[i][0]
	myArray[elems][1] = self.mydict[i][1]
	elems += 1
	newArray = np.empty((self.dictSize*2,2), dtype=object)
	self.dictSize = 2*self.dictSize
	self.mydict = newArray
	self.elementsInserted = 0
	for y in range(elems):
	self.insert(myArray[y][0], myArray[y][1])

	def contains(self, key):
	iterations = 0
	bucketNum = self.hashFunc(key, iterations)
	while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
	iterations = iterations + 1
	bucketNum = self.hashFunc(key, iterations)
	if (self.mydict[bucketNum][0] == key):
	return True
	else:
	return False

	def value(self, key):
	iterations = 0
	bucketNum = self.hashFunc(key, iterations)
	while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
	iterations = iterations + 1
	bucketNum = self.hashFunc(key, iterations)
	if (self.mydict[bucketNum][0] == key):
	return self.mydict[bucketNum][1]

	def delete(self, key):
	iteration = 0
	bucketNum = self.hashFunc(key, iteration)
	while (self.mydict[bucketNum][0] != key and self.mydict[bucketNum][0] is not None):
	iteration = iteration + 1
	bucketNum = self.hashFunc(key, iteration)
	if (self.mydict[bucketNum][0] == key):
	self.mydict[bucketNum][0] = "Deletedq"
	self.mydict[bucketNum][1] = -1
	self.elementsInserted = self.elementsInserted - 1
	return

	def get_key_values(self):
	myArray = []
	for x in range(self.dictSize):
	if (self.mydict[x][0] is not None and self.mydict[x][0] != "Deletedq"):
	myArray.append((self.mydict[x][0], self.mydict[x][1]))
	return myArray

	def dump(self):
	for x in self.mydict:
	if (x[0] is not None):
	print (x[0], int(x[1]))

	def hashFunc(self, key, iteration):
	return (hash(key)%self.dictSize + 5iteration + 5iteration**2)%self.dictSize


	class MySet(object):
	def __init__(self):
	self.myset = MyChainDict()
	return

	def insert(self, key):
	self.myset.insert(key)
	return

	def contains(self, key):
	return self.myset.contains(key)

	def dump(self):
	self.myset.dump()

	######## End code which needs to be modified ##########


	# Store the set of stop words in a set object
	stop_words_file = "stopwords.txt"
	stop_words = MySet()

	with open(stop_words_file) as f:
	for l in f:
	stop_words.insert(l.strip())


	# Download file from https://snap.stanford.edu/data/finefoods.txt.gz
	# Remember to gunzip before using
	review_file = "foods.txt"

	review_words = []
	for i in range(5):
	review_words.append(MyChainDict())

	with open(review_file, encoding='LATIN-1') as f:
	lines = f.readlines()
	idx = 0
	num_lines = len(lines) - 2
	while idx < num_lines:
	while not lines[idx].startswith("review/score"):
	idx = idx + 1

	# Jump through hoops to satisfy the Unicode encodings
	l = lines[idx].encode('ascii', 'ignore').decode('ascii')
	l = l.strip().split(":")[1].strip()

	# Extract rating
	rating = int(float(l))
	while not lines[idx].startswith("review/text"):
	idx = idx + 1

	l = lines[idx].encode('ascii', 'ignore').decode('ascii').strip().lower()
	text = l.split(":")[1].strip()
	text = text.translate(str.maketrans("","", string.punctuation))

	# Extract words in the text
	words = text.split(" ")
	# words = [x.strip(string.punctuation) for x in text.split(" ")]
	for w in words:
	if len(w) and not stop_words.contains(w):
	if review_words[rating - 1].contains(w):
	count = review_words[rating - 1].value(w) + 1
	else:
	count = 1
	review_words[rating - 1].insert(w, count)

	# Print out the top words for each of the five ratings

	for i in range(5):
	freq_words = sorted(review_words[i].get_key_values(), key=lambda tup: tup[1], reverse=True)
	print(i+1, freq_words[0:20])