Last active
December 23, 2015 15:39
-
-
Save chiangbing/6656619 to your computer and use it in GitHub Desktop.
A code snippet that solve Exercise 3.3.1(b) of the book *Mining of Massive Datasets*.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A code snippet that solve Exercise 3.3.1(b) of *Mining of Massive Datasets*. | |
def permute(items): | |
"""Iterate all permutations of a list of items.""" | |
length = len(items) | |
iternum = reduce(lambda x, y: x * y, range(1, length + 1)) | |
for i in range(iternum): | |
digs = fac_base_digits(i) | |
digs = digs[len(digs) - length:] | |
items_copy = items[:] | |
yield tuple(items_copy.pop(d) for d in digs) | |
def combine(items, num): | |
"""Iterate all combinations.""" | |
items = items[:] | |
if len(items) == num: | |
yield tuple(items) | |
elif num == 1: | |
for i in items: | |
yield tuple([i]) | |
else: | |
while (len(items) >= num): | |
for c in combine(items[1:], num - 1): | |
yield tuple(items[0:1]) + c | |
items.pop(0) | |
DIVS = [5040, 720, 120, 24, 6, 2, 1, 1] | |
def fac_base_digits(n): | |
"""Get factorial base digits.""" | |
digits = [] | |
for div in DIVS: | |
digits.append(n / div) | |
n = n % div | |
return digits | |
def compute_hash(item_set): | |
cn = len(item_set[0]) | |
rn = len(item_set) | |
hash_res = [-1] * cn | |
for c in range(cn): | |
for r in range(rn): | |
if item_set[r][c] == 1: | |
hash_res[c] = r | |
break | |
return hash_res | |
if __name__ == '__main__': | |
# a item set, rows are items, columns are transactions | |
item_set = [ | |
# S1, S2, S3, S4 | |
[1, 0, 0, 1], # a | |
[0, 0, 1, 0], # b | |
[0, 1, 0, 1], # c | |
[1, 0, 1, 1], # d | |
[0, 0, 1, 0] # e | |
] | |
# store hash equal count for each pair | |
eqls = {} | |
for s in permute(item_set): | |
hash_res = compute_hash(s) | |
for c in combine(range(len(hash_res)), 2): | |
eqls.setdefault(c, 0) | |
if hash_res[c[0]] == hash_res[c[1]]: | |
eqls[c] += 1 | |
# print result | |
for pair, eql in eqls.iteritems(): | |
print("SIM(S%d, S%d) = %.2f" % (pair[0] + 1, pair[1] + 1, eql / 120.0)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment