-
-
Save marcelcaraciolo/1423287 to your computer and use it in GitHub Desktop.
#-*- coding:utf-8 - *- | |
def load_dataset(): | |
"Load the sample dataset." | |
return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]] | |
def createC1(dataset): | |
"Create a list of candidate item sets of size one." | |
c1 = [] | |
for transaction in dataset: | |
for item in transaction: | |
if not [item] in c1: | |
c1.append([item]) | |
c1.sort() | |
#frozenset because it will be a ket of a dictionary. | |
return map(frozenset, c1) | |
def scanD(dataset, candidates, min_support): | |
"Returns all candidates that meets a minimum support level" | |
sscnt = {} | |
for tid in dataset: | |
for can in candidates: | |
if can.issubset(tid): | |
sscnt.setdefault(can, 0) | |
sscnt[can] += 1 | |
num_items = float(len(dataset)) | |
retlist = [] | |
support_data = {} | |
for key in sscnt: | |
support = sscnt[key] / num_items | |
if support >= min_support: | |
retlist.insert(0, key) | |
support_data[key] = support | |
return retlist, support_data | |
def aprioriGen(freq_sets, k): | |
"Generate the joint transactions from candidate sets" | |
retList = [] | |
lenLk = len(freq_sets) | |
for i in range(lenLk): | |
for j in range(i + 1, lenLk): | |
L1 = list(freq_sets[i])[:k - 2] | |
L2 = list(freq_sets[j])[:k - 2] | |
L1.sort() | |
L2.sort() | |
if L1 == L2: | |
retList.append(freq_sets[i] | freq_sets[j]) | |
return retList | |
def apriori(dataset, minsupport=0.5): | |
"Generate a list of candidate item sets" | |
C1 = createC1(dataset) | |
D = map(set, dataset) | |
L1, support_data = scanD(D, C1, minsupport) | |
L = [L1] | |
k = 2 | |
while (len(L[k - 2]) > 0): | |
Ck = aprioriGen(L[k - 2], k) | |
Lk, supK = scanD(D, Ck, minsupport) | |
support_data.update(supK) | |
L.append(Lk) | |
k += 1 | |
return L, support_data |
Why is it necessary to set k:=2 (line 62) and then use everywhere k - 2 (lines 63, 64, 47, 48? Why not just set k:=0?
artreven,
k=2 because he wants to get sets which contains 2 elements. k-2 because you need at least 2 elements in a set. You can also put k=3 in line 62.But then you will miss the sets which contains only 2 elements.
Hi Artreven,
Sorry, k=3 at line 62 won't work because the line 61 says explicitly L has only one element.
some issues running this code:
line 26: 'can' is initialized as a list and lists have no subsets. must be converted to set to run.
@Artevan: the reason he sets k as 2 is because he later sets the incremental L variable as Lk. Since L1 and C1 are initialized before the while loop, the number crunching starts at Lk where k=2. It's not a functional choice, only for reasons of bringing the code in line with the theory.
from Data Mining: Concepts and Techniques by Jiawei, Han et al.
sccnt = defaultdict(int), which yields 0, or default(lambda : 0), would be a little cleaner and more idiomatic.
Overall though, awesome article and gist