Skip to content

Instantly share code, notes, and snippets.

@chiangbing
Last active December 23, 2015 15:39
Show Gist options
  • Save chiangbing/6656619 to your computer and use it in GitHub Desktop.
Save chiangbing/6656619 to your computer and use it in GitHub Desktop.
A code snippet that solve Exercise 3.3.1(b) of the book *Mining of Massive Datasets*.
# A code snippet that solve Exercise 3.3.1(b) of *Mining of Massive Datasets*.
def permute(items):
"""Iterate all permutations of a list of items."""
length = len(items)
iternum = reduce(lambda x, y: x * y, range(1, length + 1))
for i in range(iternum):
digs = fac_base_digits(i)
digs = digs[len(digs) - length:]
items_copy = items[:]
yield tuple(items_copy.pop(d) for d in digs)
def combine(items, num):
"""Iterate all combinations."""
items = items[:]
if len(items) == num:
yield tuple(items)
elif num == 1:
for i in items:
yield tuple([i])
else:
while (len(items) >= num):
for c in combine(items[1:], num - 1):
yield tuple(items[0:1]) + c
items.pop(0)
DIVS = [5040, 720, 120, 24, 6, 2, 1, 1]
def fac_base_digits(n):
"""Get factorial base digits."""
digits = []
for div in DIVS:
digits.append(n / div)
n = n % div
return digits
def compute_hash(item_set):
cn = len(item_set[0])
rn = len(item_set)
hash_res = [-1] * cn
for c in range(cn):
for r in range(rn):
if item_set[r][c] == 1:
hash_res[c] = r
break
return hash_res
if __name__ == '__main__':
# a item set, rows are items, columns are transactions
item_set = [
# S1, S2, S3, S4
[1, 0, 0, 1], # a
[0, 0, 1, 0], # b
[0, 1, 0, 1], # c
[1, 0, 1, 1], # d
[0, 0, 1, 0] # e
]
# store hash equal count for each pair
eqls = {}
for s in permute(item_set):
hash_res = compute_hash(s)
for c in combine(range(len(hash_res)), 2):
eqls.setdefault(c, 0)
if hash_res[c[0]] == hash_res[c[1]]:
eqls[c] += 1
# print result
for pair, eql in eqls.iteritems():
print("SIM(S%d, S%d) = %.2f" % (pair[0] + 1, pair[1] + 1, eql / 120.0))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment