Apriori_with_hconfidence
from pprint import pprint
from collections import defaultdict

import numpy as np
def apriori(dataset, min_support=0.5, min_hconf=0.5, verbose=False):
    """Implements the Apriori algorithm with an h-confidence constraint.

    The Apriori algorithm iteratively generates new candidate k-itemsets
    from the frequent (k-1)-itemsets found in the previous iteration.
    Candidates are kept only if they meet both the minimum support and the
    minimum h-confidence thresholds.

    Parameters
    ----------
    dataset : list
        The dataset (a list of transactions) from which to generate
        candidate itemsets.
    min_support : float
        The minimum support threshold. Defaults to 0.5.
    min_hconf : float
        The minimum h-confidence threshold. Defaults to 0.5.
    verbose : bool
        If True, print the candidate and frequent itemsets. Defaults to False.

    Returns
    -------
    F : list
        The list of frequent itemsets, grouped by cardinality.
    support_data : dict
        The support of every itemset that passed both thresholds.

    References
    ----------
    .. [1] R. Agrawal, R. Srikant, "Fast Algorithms for Mining Association
           Rules", 1994.
    """
    C1 = create_candidates(dataset)  # generate candidate 1-itemsets
    D = list(map(set, dataset))
    support_data = {}

    # prune candidate 1-itemsets
    F1 = support_prune(D, C1, min_support, min_hconf,
                       support_data, verbose=verbose)
    F = [F1]  # list of frequent itemsets; initialized to frequent 1-itemsets
    k = 2  # the itemset cardinality
    while len(F[k - 2]) > 0:
        Ck = apriori_gen(F[k - 2], k)  # generate candidate k-itemsets
        # prune candidate itemsets
        Fk = support_prune(D, Ck, min_support, min_hconf,
                           support_data, verbose=verbose)
        # add the pruned candidate itemsets to the list of frequent itemsets
        F.append(Fk)
        k += 1

    if verbose:
        # Print a list of all the frequent itemsets.
        for kset in F:
            for item in kset:
                print(item, round(support_data[item], 3))

    return F, support_data
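

# Illustrative sketch (not in the original gist): run apriori on a small,
# hand-written transaction list; the values below are made up for the example.
# With min_support=0.5 an itemset must occur in at least half of the
# transactions, and with min_hconf=0.5 its support must be at least half the
# support of its most frequent single item.
def _example_apriori():
    transactions = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
    F, support_data = apriori(transactions, min_support=0.5, min_hconf=0.5)
    return F, support_data
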
def create_candidates(dataset, verbose=False):
    """Creates the candidate 1-itemsets from a list of transactions.

    Parameters
    ----------
    dataset : list
        The dataset (a list of transactions) from which to generate candidate
        itemsets.
    verbose : bool
        If True, print the candidate items. Defaults to False.

    Returns
    -------
    c1 : set
        The set of candidate 1-itemsets. Each candidate is a frozenset (an
        immutable, hashable set), so it can later be used as a dictionary key
        in support_data.
    """
    c1 = set()  # all distinct items in the database of transactions
    for transaction in dataset:
        c1.update([frozenset([i]) for i in transaction])

    if verbose:
        # Print all the candidate items.
        pprint(c1)

    return c1
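

# Illustrative sketch (not in the original gist): candidate 1-itemsets are
# frozensets so they can serve as dictionary keys in support_data. For the
# made-up transactions below the result is
# {frozenset({1}), frozenset({2}), frozenset({3})}.
def _example_create_candidates():
    return create_candidates([[1, 3], [2, 3]])
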
def support_prune(dataset, candidates, min_support, min_hconf,
                  support_data, verbose=False):
    """Returns the candidates that meet the support and h-confidence thresholds.

    By the apriori principle, if an itemset is frequent, then all of its
    subsets must also be frequent. As a result, we can perform support-based
    pruning to systematically control the exponential growth of candidate
    itemsets. Itemsets that do not meet the minimum support level, or whose
    h-confidence (support divided by the largest support of any single item
    in the itemset) falls below the minimum h-confidence, are pruned.

    Parameters
    ----------
    dataset : list
        The dataset (a list of transactions, each a set) against which the
        candidates are counted.
    candidates : iterable
        The candidate itemsets, each a frozenset.
    min_support : float
        The minimum support threshold.
    min_hconf : float
        The minimum h-confidence threshold.
    support_data : dict
        The support data for all itemsets kept so far; updated in place.
    verbose : bool
        If True, print the kept itemsets and the support counts.

    Returns
    -------
    retlist : list
        The list of frequent itemsets.
    """
    # update support counts
    sscnt = defaultdict(int)  # support count per candidate itemset
    for tid in dataset:
        for cand in candidates:
            if cand.issubset(tid):
                sscnt[cand] += 1

    # total number of transactions in the dataset
    n_transactions = len(dataset)
    retlist = []  # list of unpruned itemsets
    for cand, n_supp in sscnt.items():
        # Calculate the support of itemset cand.
        support = n_supp / n_transactions
        if len(cand) > 1:
            # h-confidence: support of the itemset relative to its most
            # frequent single item.
            hconf = support / max(support_data[frozenset([item])] for item in cand)
        else:
            hconf = 1

        if support >= min_support and hconf >= min_hconf:
            retlist.insert(0, cand)
            support_data[cand] = support

    # Print the kept itemsets and the raw support counts.
    if verbose:
        pprint(retlist)
        for cand, cnt in sscnt.items():
            # support_data only holds itemsets that passed both thresholds,
            # so use .get() to avoid a KeyError for pruned candidates.
            print(cand, cnt, support_data.get(cand))

    return retlist
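

# Illustrative sketch (not in the original gist): h-confidence of an itemset
# is its support divided by the largest support of any of its single items.
# For the made-up transactions below, support({2, 3}) = 2/4 = 0.5 and
# support({2}) = support({3}) = 3/4, so hconf({2, 3}) = 0.5 / 0.75 ~ 0.67,
# which survives a min_hconf of 0.5.
def _example_support_prune():
    transactions = [set(t) for t in [[1, 3], [2, 3], [1, 2, 3], [2]]]
    support_data = {}
    f1 = support_prune(transactions, create_candidates(transactions),
                       0.5, 0.5, support_data)
    c2 = apriori_gen(f1, 2)
    return support_prune(transactions, c2, 0.5, 0.5, support_data)
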
def apriori_gen(freq_sets, k):
    """Generates candidate itemsets (via the F_{k-1} x F_{k-1} method).

    This operation generates new candidate k-itemsets from the frequent
    (k-1)-itemsets found in the previous iteration. A pair of frequent
    (k-1)-itemsets is merged only if their first k-2 items (in sorted
    order) are identical.

    Parameters
    ----------
    freq_sets : list
        The list of frequent (k-1)-itemsets.
    k : integer
        The cardinality of the candidate itemsets being generated.

    Returns
    -------
    retlist : list
        The list of merged candidate k-itemsets.
    """
    retlist = []  # list of merged candidate itemsets
    n_freq_sets = len(freq_sets)  # number of frequent (k-1)-itemsets
    for i in range(n_freq_sets):
        # Sort explicitly: frozensets do not preserve ordering, so the
        # first k-2 items must be taken from the sorted item list.
        F1 = sorted(freq_sets[i])[: k - 2]  # first k-2 items of freq_sets[i]
        for j in range(i + 1, n_freq_sets):
            F2 = sorted(freq_sets[j])[: k - 2]  # first k-2 items of freq_sets[j]
            if F1 == F2:  # if the first k-2 items are identical
                # Merge the frequent itemsets.
                retlist.append(freq_sets[i] | freq_sets[j])

    return retlist
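

# Illustrative sketch (not in the original gist): the F_{k-1} x F_{k-1} merge
# joins two frequent (k-1)-itemsets only when their first k-2 sorted items
# agree. For the made-up 2-itemsets below, {1, 2} and {1, 3} share the prefix
# [1] and merge into {1, 2, 3}; {3, 4} does not merge with either.
def _example_apriori_gen():
    f2 = [frozenset({1, 2}), frozenset({1, 3}), frozenset({3, 4})]
    return apriori_gen(f2, 3)  # [frozenset({1, 2, 3})]
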
def main():
    # Synthetic dataset: 1000 transactions of 10 integer items drawn from an
    # exponential distribution, so small item values are common and large
    # ones are rare.
    dataset = np.random.exponential(scale=10, size=(1000, 10)).astype(np.int64)
    # print(dataset)
    F, support_data = apriori(dataset, min_support=0.0, min_hconf=0.5, verbose=False)
    pprint(F[1:])  # frequent itemsets with two or more items
    print(sum(len(itemsets) for itemsets in F))  # total number of frequent itemsets


if __name__ == '__main__':
    main()