Last active
December 14, 2015 01:19
-
-
Save mynameisfiber/5005535 to your computer and use it in GitHub Desktop.
This is a quick kmaxhash implemintation to test out the usefulness of this datastructure for calculating jaccard metrics. It is quick because I wrote it quickly :)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python2.7 | |
# | |
# This is a quick kmaxhash implemintation to test out the usefulness of this | |
# datastructure for calculating jaccard metrics. It is quick because I wrote | |
# it quickly :) | |
# micha gorelick, [email protected] | |
# | |
import mmh3 | |
import heapq | |
from itertools import (chain, ifilterfalse) | |
MAX_32BIT_INT = 2**31 - 1 | |
def unique_everseen(iterable, key=None): | |
"List unique elements, preserving order. Remember all elements ever seen." | |
# unique_everseen('AAAABBBCCDAABBB') --> A B C D | |
# unique_everseen('ABBCcAD', str.lower) --> A B C D | |
seen = set() | |
seen_add = seen.add | |
if key is None: | |
for element in ifilterfalse(seen.__contains__, iterable): | |
seen_add(element) | |
yield element | |
else: | |
for element in iterable: | |
k = key(element) | |
if k not in seen: | |
seen_add(k) | |
yield element | |
class QuickKMaxHash: | |
def __init__(self, items=[], k=20, hasher=mmh3.hash): | |
self.kmin = [] | |
self.k = k | |
self.hasher = hasher | |
self.update(items) | |
def _idx(self, key): | |
return self.hasher(str(key)) | |
def add(self, key): | |
idx = self._idx(key) | |
if len(self.kmin) < self.k: | |
heapq.heappush(self.kmin, idx) | |
else: | |
if idx > self.kmin[0]: | |
heapq.heappushpop(self.kmin, idx) | |
def update(self, keys): | |
for key in keys: | |
self.add(key) | |
def _aux_union(self, other): | |
X = set(chain(self.kmin, other.kmin)) | |
n = 0 | |
small, large_list = sorted((self.kmin, other.kmin), key=len) | |
large_set = set(large_list) | |
for item in small: | |
if item in X and item in large_set: | |
n += 1 | |
return n, X | |
def union(self, other, newk=0): | |
newk = newk or min(self.k, other.k) | |
self.kmin = heapq.nlargest(newk, unique_everseen(chain(self.kmin, other.kmin))) | |
heapq.heapify(self.kmin) | |
def jaccard(self, other, k=0): | |
k = min(self.k, other.k) | |
n, _ = self._aux_union(other) | |
return n / (1.0 * k) | |
def cardinality_intersection(self, other, k=0): | |
k = min(self.k, other.k) | |
n, X = self._aux_union(other) | |
cardX = self._cardhelp(min(X), len(X)) | |
return n / (1.0 * k) * cardX | |
def cardinality_union_(self, other, k=0): | |
_, X = self._aux_union(other) | |
cardX = self._cardhelp(min(X), len(X)) | |
return cardX | |
def _cardhelp(self, kmin, k): | |
return 2.0 * (k - 1.0) * MAX_32BIT_INT / (MAX_32BIT_INT - kmin) | |
def cardinality(self): | |
if len(self.kmin) < self.k: | |
return len(self.kmin) | |
return self._cardhelp(self.kmin[0], self.k) | |
def __len__(self): | |
return int(self.cardinality()) | |
def __add__(self, other): | |
assert other.k == self.k | |
nt = QuickKMaxHash(k = self.k) | |
nt.kmin = self.kmin | |
nt.union(other) | |
return nt | |
def test_constructor(): | |
t1 = QuickKMaxHash(range(50)) | |
t2 = QuickKMaxHash(range(50), k=50) | |
t1 = QuickKMaxHash(k=10) | |
t1 = QuickKMaxHash() | |
def test_add(): | |
t1 = QuickKMaxHash(k=1) | |
t1.add("TEST1") | |
assert t1.kmin != [] | |
assert len(t1.kmin) == 1 | |
t1.add("TEST2") | |
assert t1.kmin != [] | |
assert len(t1.kmin) == 1 | |
def test_update(): | |
t1 = QuickKMaxHash(k=15) | |
t1.update(range(10)) | |
assert len(t1.kmin) == 10 | |
def test_jaccard(): | |
# k == 400 => relative error of ~0.05 | |
t1 = QuickKMaxHash(range(500), k=400) | |
t2 = QuickKMaxHash(range(100, 500), k=400) | |
j_kmin = t1.jaccard(t2) | |
j_real = 4. / 5. | |
error = 1 / (t1.k)**0.5 | |
assert abs(1 - j_kmin / j_real) <= error | |
def test_union(): | |
t1 = QuickKMaxHash(range(10), k=5) | |
t2 = QuickKMaxHash(range(10), k=5) | |
t3 = QuickKMaxHash(range(20), k=5) | |
t1.union(t2) | |
assert set(t1.kmin) == set(t2.kmin) | |
t1.union(t3) | |
assert set(t1.kmin) == set(t3.kmin) | |
t4 = QuickKMaxHash(range(40,50), k=5) | |
t5 = t4 + t1 | |
assert set(t1.kmin) != set(t5.kmin) | |
assert set(t4.kmin) != set(t5.kmin) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment