-
-
Save marcelcaraciolo/1423287 to your computer and use it in GitHub Desktop.
| #-*- coding:utf-8 - *- | |
| def load_dataset(): | |
| "Load the sample dataset." | |
| return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]] | |
| def createC1(dataset): | |
| "Create a list of candidate item sets of size one." | |
| c1 = [] | |
| for transaction in dataset: | |
| for item in transaction: | |
| if not [item] in c1: | |
| c1.append([item]) | |
| c1.sort() | |
| #frozenset because it will be a ket of a dictionary. | |
| return map(frozenset, c1) | |
| def scanD(dataset, candidates, min_support): | |
| "Returns all candidates that meets a minimum support level" | |
| sscnt = {} | |
| for tid in dataset: | |
| for can in candidates: | |
| if can.issubset(tid): | |
| sscnt.setdefault(can, 0) | |
| sscnt[can] += 1 | |
| num_items = float(len(dataset)) | |
| retlist = [] | |
| support_data = {} | |
| for key in sscnt: | |
| support = sscnt[key] / num_items | |
| if support >= min_support: | |
| retlist.insert(0, key) | |
| support_data[key] = support | |
| return retlist, support_data | |
| def aprioriGen(freq_sets, k): | |
| "Generate the joint transactions from candidate sets" | |
| retList = [] | |
| lenLk = len(freq_sets) | |
| for i in range(lenLk): | |
| for j in range(i + 1, lenLk): | |
| L1 = list(freq_sets[i])[:k - 2] | |
| L2 = list(freq_sets[j])[:k - 2] | |
| L1.sort() | |
| L2.sort() | |
| if L1 == L2: | |
| retList.append(freq_sets[i] | freq_sets[j]) | |
| return retList | |
| def apriori(dataset, minsupport=0.5): | |
| "Generate a list of candidate item sets" | |
| C1 = createC1(dataset) | |
| D = map(set, dataset) | |
| L1, support_data = scanD(D, C1, minsupport) | |
| L = [L1] | |
| k = 2 | |
| while (len(L[k - 2]) > 0): | |
| Ck = aprioriGen(L[k - 2], k) | |
| Lk, supK = scanD(D, Ck, minsupport) | |
| support_data.update(supK) | |
| L.append(Lk) | |
| k += 1 | |
| return L, support_data |
sccnt = defaultdict(int), which yields 0, or default(lambda : 0), would be a little cleaner and more idiomatic.
Overall though, awesome article and gist
Why is it necessary to set k:=2 (line 62) and then use everywhere k - 2 (lines 63, 64, 47, 48? Why not just set k:=0?
artreven,
k=2 because he wants to get sets which contains 2 elements. k-2 because you need at least 2 elements in a set. You can also put k=3 in line 62.But then you will miss the sets which contains only 2 elements.
Hi Artreven,
Sorry, k=3 at line 62 won't work because the line 61 says explicitly L has only one element.
some issues running this code:
line 26: 'can' is initialized as a list and lists have no subsets. must be converted to set to run.
@Artevan: the reason he sets k as 2 is because he later sets the incremental L variable as Lk. Since L1 and C1 are initialized before the while loop, the number crunching starts at Lk where k=2. It's not a functional choice, only for reasons of bringing the code in line with the theory.

from Data Mining: Concepts and Techniques by Jiawei, Han et al.
I think line 27 should be between line 25 and line 26. Because each item value in sscnt should be setdefalut to 0 even if it does not appear in any dataset.