Created
March 5, 2020 06:49
-
-
Save 270ajay/54f54725a9e78a74aa29515de2be4961 to your computer and use it in GitHub Desktop.
hash tables (how python dict and set works)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random | |
'''Course: https://www.coursera.org/learn/data-structures#about''' | |
def isPrime(num): | |
'''src: https://codereview.stackexchange.com/questions/71212/find-smallest-prime-number-greater-than-given-n''' | |
if num%2 == 0 or num%3 == 0: | |
return False | |
rootNum = int(num ** 0.5) | |
for i in range(5, rootNum+1, 6): | |
if num%i == 0: | |
return False | |
for i in range(7, rootNum+1, 6): | |
if num%i == 0: | |
return False | |
return True | |
def getPrimeGreaterThan(num): | |
num+=1 | |
while True: | |
if isPrime(num): | |
return num | |
num+=1 | |
class HashMap: | |
'''dict in python; HashMap in java; unordered_map in c++''' | |
loadFactor = 0.9 | |
defaultMaxIntegerKey = int(1e+10) | |
def __init__(self, maxIntegerKey=None): | |
self.size = 2 | |
self.array = [] | |
for i in range(self.size): | |
self.array.append([]) | |
self.maxIntegerKey = maxIntegerKey if maxIntegerKey != None else HashMap.defaultMaxIntegerKey | |
self.primeNum = getPrimeGreaterThan(self.maxIntegerKey) | |
self.hashFuncCoeffA = random.randrange(1, self.primeNum - 1) | |
self.hasFuncCoeffB = random.randrange(0, self.primeNum - 1) | |
self.numOfKeys = 0 | |
def polyHash(self, stringObj): | |
hashValue = 0 | |
for i in range(len(stringObj) - 1, -1, -1): | |
hashValue = (((hashValue * self.hashFuncCoeffA) + ord(stringObj[i])) % self.primeNum) | |
return hashValue | |
def hashFunction(self, obj): | |
if isinstance(obj, int): | |
return ((((self.hashFuncCoeffA * obj) + self.hasFuncCoeffB) % self.primeNum) % self.size) | |
elif isinstance(obj, str): | |
hashValueOfStr = self.polyHash(obj) | |
return ((((self.hashFuncCoeffA * hashValueOfStr) + self.hasFuncCoeffB) % self.primeNum) % self.size) | |
else: | |
raise Exception("Only takes integer or string as keys") | |
def hasKey(self, obj): | |
array = self.array[self.hashFunction(obj)] | |
for objValueTuple in array: | |
if objValueTuple[0] == obj: | |
return True | |
return False | |
def get(self, obj): | |
array = self.array[self.hashFunction(obj)] | |
for objValueList in array: | |
if objValueList[0] == obj: | |
return objValueList[1] | |
raise Exception("Object not present") | |
def set(self, obj, value): | |
if self.numOfKeys/self.size > HashMap.loadFactor: | |
self.rehash() | |
array = self.array[self.hashFunction(obj)] | |
for objValueList in array: | |
if objValueList[0] == obj: | |
objValueList[1] = value | |
return | |
array.append([obj, value]) | |
self.numOfKeys += 1 | |
def rehash(self): | |
new2DArray = [] | |
for i in range(self.size * 2): | |
new2DArray.append([]) | |
self.getNewRandomHashFuncCoeff() | |
self.size *= 2 | |
for array in self.array: | |
if array: | |
for objValueList in array: | |
newArray = new2DArray[self.hashFunction(objValueList[0])] | |
newArray.append(objValueList) | |
self.array = new2DArray | |
def getNewRandomHashFuncCoeff(self): | |
self.hashFuncCoeffA = random.randrange(1, self.primeNum - 1) | |
self.hasFuncCoeffB = random.randrange(0, self.primeNum - 1) | |
def __str__(self): | |
string = "{" | |
for array in self.array: | |
if array: | |
for objValueList in array: | |
string += " "+ str(objValueList[0]) + ": "+ str(objValueList[1])+"," | |
string+= ' }' | |
return string | |
class HashSet: | |
'''set in python, HashSet in java, unordered_set in c++''' | |
loadFactor = 0.9 | |
defaultMaxIntegerKey = int(1e+10) | |
def __init__(self, maxIntegerKey=None): | |
self.size = 2 | |
self.array = [] | |
for i in range(self.size): | |
self.array.append([]) | |
self.maxIntegerKey = maxIntegerKey if maxIntegerKey != None else HashSet.defaultMaxIntegerKey | |
self.primeNum = getPrimeGreaterThan(self.maxIntegerKey) | |
self.hashFuncCoeffA = random.randrange(1, self.primeNum - 1) | |
self.hasFuncCoeffB = random.randrange(0, self.primeNum - 1) | |
self.numOfObjects = 0 | |
def polyHash(self, stringObj): | |
hashValue = 0 | |
for i in range(len(stringObj) - 1, -1, -1): | |
hashValue = (((hashValue * self.hashFuncCoeffA) + ord(stringObj[i])) % self.primeNum) | |
return hashValue | |
def hashFunction(self, obj): | |
if isinstance(obj, int): | |
return ((((self.hashFuncCoeffA * obj) + self.hasFuncCoeffB) % self.primeNum) % self.size) | |
elif isinstance(obj, str): | |
hashValueOfStr = self.polyHash(obj) | |
return ((((self.hashFuncCoeffA * hashValueOfStr) + self.hasFuncCoeffB) % self.primeNum) % self.size) | |
else: | |
raise Exception("Only takes integer or string as keys") | |
def find(self, obj): | |
array = self.array[self.hashFunction(obj)] | |
for obj1 in array: | |
if obj1 == obj: | |
return True | |
return False | |
def remove(self, obj): | |
if not self.find(obj): | |
return | |
array = self.array[self.hashFunction(obj)] | |
array.remove(obj) | |
def add(self, obj): | |
if self.numOfObjects/self.size > HashSet.loadFactor: | |
self.rehash() | |
array = self.array[self.hashFunction(obj)] | |
for obj2 in array: | |
if obj2 == obj: | |
return | |
array.append(obj) | |
self.numOfObjects += 1 | |
def rehash(self): | |
new2DArray = [] | |
for i in range(self.size * 2): | |
new2DArray.append([]) | |
self.getNewRandomHashFuncCoeff() | |
self.size *= 2 | |
for array in self.array: | |
if array: | |
for obj in array: | |
newArray = new2DArray[self.hashFunction(obj)] | |
newArray.append(obj) | |
self.array = new2DArray | |
def getNewRandomHashFuncCoeff(self): | |
self.hashFuncCoeffA = random.randrange(1, self.primeNum - 1) | |
self.hasFuncCoeffB = random.randrange(0, self.primeNum - 1) | |
def __str__(self): | |
string = "{" | |
for array in self.array: | |
if array: | |
for obj in array: | |
string += " "+ str(obj) +"," | |
string+= ' }' | |
return string | |
def areEqualStrings(string1, string2): | |
lengthOfString1 = len(string1) | |
if lengthOfString1 != len(string2): | |
return False | |
for i in range(lengthOfString1): | |
if string1[i] != string2[i]: | |
return False | |
return True | |
def findPatternNaive(text, pattern): | |
'''returns the index/indices of the pattern in text''' | |
result = [] | |
lengthOfPattern = len(pattern) | |
for i in range(len(text) - lengthOfPattern + 1): | |
if areEqualStrings(text[i:i+lengthOfPattern], pattern): | |
result.append(i) | |
return result | |
def polyHash(string, polyHashCoeff, primeNum): | |
'''returns the hash value of string''' | |
hashValue = 0 | |
for i in range(len(string) - 1, -1, -1): | |
hashValue = (((hashValue * polyHashCoeff) + ord(string[i])) % primeNum) | |
return hashValue | |
def rabinKarp(text, pattern): | |
'''returns the index/indices of the pattern in text. It is not faster than findPatternNaive''' | |
lengthOfPattern = len(pattern) | |
lengthOfText = len(text) | |
primeNum = getPrimeGreaterThan(10 * lengthOfText * lengthOfPattern) | |
polyHashCoeff = random.randrange(1, primeNum - 1) | |
result = [] | |
patternHash = polyHash(pattern, polyHashCoeff, primeNum) | |
for i in range(lengthOfText - lengthOfPattern + 1): | |
textHash = polyHash(text[i:i+lengthOfPattern], polyHashCoeff, primeNum) | |
if patternHash != textHash: | |
continue | |
if areEqualStrings(text[i:i+lengthOfPattern], pattern): | |
result.append(i) | |
return result | |
def precomputeHashes(text, lengthOfPattern, polyHashCoeff, primeNum): | |
'''this function is used in rabinKarpImproved function below''' | |
lengthOfText = len(text) | |
hashArray = [None] * (lengthOfText - lengthOfPattern + 1) | |
lastSubStringOfText = text[lengthOfText - lengthOfPattern : lengthOfText] | |
hashArray[lengthOfText - lengthOfPattern] = polyHash(lastSubStringOfText, polyHashCoeff, primeNum) | |
y = 1 | |
for _ in range(lengthOfPattern): | |
y = (y * polyHashCoeff) % primeNum | |
for i in range(lengthOfText - lengthOfPattern - 1, -1, -1): | |
hashArray[i] = ((polyHashCoeff * hashArray[i + 1]) + ord(text[i]) - (y * ord(text[i+lengthOfPattern]))) % primeNum | |
return hashArray | |
def rabinKarpImproved(text, pattern): | |
'''returns the index/indices of the pattern in text. It is faster than findPatternNaive''' | |
lengthOfPattern = len(pattern) | |
lengthOfText = len(text) | |
primeNum = getPrimeGreaterThan(10 * lengthOfText * lengthOfPattern) | |
polyHashCoeff = random.randrange(1, primeNum - 1) | |
result = [] | |
patternHash = polyHash(pattern, polyHashCoeff, primeNum) | |
hashArray = precomputeHashes(text, lengthOfPattern, polyHashCoeff, primeNum) | |
for i in range(lengthOfText - lengthOfPattern + 1): | |
if patternHash != hashArray[i]: | |
continue | |
if areEqualStrings(text[i:i+lengthOfPattern], pattern): | |
result.append(i) | |
return result | |
if __name__ == '__main__': | |
hashMap = HashMap(maxIntegerKey=100) | |
hashMap.set(10, 120) | |
hashMap.set(2, 30) | |
hashMap.set(14, 300) | |
hashMap.set(25, 10) | |
hashMap.set('ajay', 10) | |
print(hashMap) | |
print(hashMap.get('ajay')) | |
hashSet = HashSet(maxIntegerKey=100) | |
hashSet.add('ajay') | |
hashSet.add('ajay') | |
hashSet.add(1) | |
hashSet.add(3) | |
print(hashSet) | |
print("\nNaive approach: thisisanexampletocheck contains word example from indices:", findPatternNaive('thisisanexampletocheck', 'example')) | |
print("Rabin karp algorithm: thisisanexampletocheck contains word example from indices:", rabinKarp('thisisanexampletocheck', 'example')) | |
print("Rabin karp improved: thisisanexampletocheck contains word example from indices:", rabinKarpImproved('thisisanexampletocheck', 'example')) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment