Skip to content

Instantly share code, notes, and snippets.

@cjdd3b
Last active August 29, 2015 14:09
Show Gist options
  • Save cjdd3b/1df202c1926787ac4af8 to your computer and use it in GitHub Desktop.
Save cjdd3b/1df202c1926787ac4af8 to your computer and use it in GitHub Desktop.
import random
class MinHasher(object):
    """Build MinHash signatures from a family of random linear hash functions.

    Each hash function has the form ``h(x) = (a * x + b) % universe_size``,
    where ``a`` and ``b`` are drawn at random when the instance is created.
    A signature is the list of per-function minimum hash values over the
    items of a collection.
    """

    def __init__(self, n, universe_size, seed=None):
        """Create ``n`` random hash functions.

        Args:
            n: Number of hash functions, i.e. the length of each signature.
            universe_size: Modulus of the hash functions; must be >= 1.
            seed: Optional seed making the drawn hash functions reproducible.

        Raises:
            ValueError: If ``n > 0`` and ``universe_size`` is smaller than 1.
        """
        # A private generator seeded with ``seed`` produces the same draws as
        # ``random.seed(seed)`` would, without resetting the process-wide
        # generator as a side effect of constructing a MinHasher.
        self._rng = random.Random(seed) if seed is not None else random
        self.hash_functions = [
            self._create_random_hash_function(universe_size) for _ in range(n)
        ]

    def _create_random_hash_function(self, universe_size):
        """Return a random function ``x -> (a * x + b) % universe_size``.

        Raises:
            ValueError: If ``universe_size`` is smaller than 1.
        """
        if universe_size < 1:
            raise ValueError(
                "universe_size must be a positive integer, got %r"
                % (universe_size,)
            )
        # Fall back to the module-level generator when __init__ did not run
        # (e.g. a subclass that overrides it without calling the base class).
        rng = getattr(self, "_rng", random)
        a = rng.randint(0, universe_size)
        # a == 0 and a == universe_size are both congruent to 0, which would
        # make the function constant (it would ignore x). Redraw in that case
        # only, so every other draw is unchanged. With a modulus of 1 every
        # value is 0 anyway, so there is nothing to redraw.
        while universe_size > 1 and a % universe_size == 0:
            a = rng.randint(0, universe_size)
        b = rng.randint(0, universe_size)
        return lambda x: (a * x + b) % universe_size

    def generate_signature(self, s):
        """Return the list of minhash values of ``s``, one per hash function.

        Args:
            s: Non-empty iterable of integers.

        Raises:
            ValueError: If ``s`` is empty and there is at least one hash
                function.
        """
        # A one-shot iterator would be exhausted after the first hash
        # function, so materialize it before the repeated passes.
        if iter(s) is s:
            s = list(s)
        return [self.calculate_minhash(func, s) for func in self.hash_functions]

    def calculate_minhash(self, hash_function, s):
        """Return the smallest value of ``hash_function`` over the items of ``s``.

        Raises:
            ValueError: If ``s`` contains no items (the minimum is undefined).
        """
        minhash = None
        for item in s:
            value = hash_function(item)
            if minhash is None or value < minhash:
                minhash = value
        if minhash is None:
            raise ValueError("cannot compute a minhash of an empty collection")
        return int(minhash)
def jaccard(s1, s2):
    """Return the Jaccard similarity ``|A & B| / |A | B|`` of two collections.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Args:
        s1: Iterable of hashable items.
        s2: Iterable of hashable items.

    Returns:
        A float in ``[0.0, 1.0]``. Two empty collections are treated as
        identical and yield ``1.0`` rather than dividing by zero.
    """
    x = set(s1)
    y = set(s2)
    union = x | y
    if not union:
        return 1.0
    return float(len(x & y)) / len(union)
def similarity(s1, s2):
    """Return the fraction of positions at which two signatures agree.

    Args:
        s1: Sequence of hash values.
        s2: Sequence of hash values, the same length as ``s1``.

    Returns:
        ``matches / len(s1)`` as a float.

    Raises:
        ValueError: If the signatures differ in length or are empty. A
            position-wise comparison is only meaningful for equal-length,
            non-empty signatures.
    """
    size = len(s1)
    if size != len(s2):
        raise ValueError(
            "signatures must have the same length, got %d and %d"
            % (size, len(s2))
        )
    if size == 0:
        raise ValueError("signatures must not be empty")
    matches = sum(1 for h1, h2 in zip(s1, s2) if h1 == h2)
    return matches / float(size)
if __name__ == '__main__':
    # Demo: compare the exact Jaccard similarity of two overlapping integer
    # ranges with the estimate obtained from 10-value MinHash signatures.
    minhasher = MinHasher(10, 12549826247007890, 1234567)
    u1 = range(1, 10000)
    u2 = range(5001, 20000)
    s1 = minhasher.generate_signature(u1)
    s2 = minhasher.generate_signature(u2)
    # Single-argument, parenthesized print behaves the same as a Python 2
    # print statement and as the Python 3 print function.
    print('J(u1, u2) = %s' % jaccard(u1, u2))
    print('sim(u1, u2) = %s' % similarity(s1, s2))
    print(s1)
    print(s2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment