Last active
April 18, 2023 10:45
-
-
Save htnminh/b9d0cf7db88b3c28a4a06658b8237d35 to your computer and use it in GitHub Desktop.
apriori VDT 2 - 31 Hoang Tran Nhat Minh
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint
from collections import Counter

import pandas as pd
# Absolute support threshold passed to get_frequent_itemsets by the main
# program: an itemset is kept when at least this many transactions contain it.
MIN_SUPPORT = 12
# Location of the groceries CSV that the main program loads with pandas.
PATH_TO_CSV = r"C:\Users\nhatm\OneDrive - Hanoi University of Science and Technology\Desktop\groceries.csv"
def get_frequent_itemsets(transactions, min_support):
    """Return every itemset contained in at least ``min_support`` transactions.

    The search is level-wise (Apriori): frequent single items are found
    first, then candidates of size ``k`` are built by joining the frequent
    itemsets of size ``k - 1``, counted against the transactions, and
    filtered, until a level produces nothing frequent.

    Args:
        transactions: Iterable of transactions, each an iterable of hashable
            items. It is copied up front, so a generator is accepted, and an
            item repeated inside one transaction counts only once.
        min_support: Absolute threshold; an itemset is kept when the number
            of transactions containing it is ``>= min_support``.

    Returns:
        A set of frozensets, one per frequent itemset of any size.
    """
    # The data is scanned once per level, so materialise it; converting each
    # transaction to a set also keeps duplicated items from inflating support.
    baskets = [frozenset(transaction) for transaction in transactions]

    # Level 1: count each item and keep the frequent ones
    item_counts = Counter(item for basket in baskets for item in basket)
    frequent_items = {
        frozenset([item])
        for item, count in item_counts.items()
        if count >= min_support
    }

    itemsets = set(frequent_items)
    k = 2
    while True:
        # Join step: unions of two frequent (k-1)-itemsets that have size k
        joined = {
            itemset1 | itemset2
            for itemset1 in itemsets
            for itemset2 in itemsets
            if len(itemset1 | itemset2) == k
        }
        # Prune step: a k-itemset can only be frequent if each of its
        # (k-1)-subsets is frequent, so drop the rest before the costly scan.
        candidate_itemsets = {
            candidate
            for candidate in joined
            if all(candidate - {item} in itemsets for item in candidate)
        }
        if not candidate_itemsets:
            break

        # Count the occurrence of each candidate itemset; starting every
        # candidate at 0 keeps never-seen candidates in the tally.
        itemset_counts = dict.fromkeys(candidate_itemsets, 0)
        for basket in baskets:
            for itemset in candidate_itemsets:
                if itemset <= basket:
                    itemset_counts[itemset] += 1

        # Discard infrequent itemsets
        frequent_itemsets = {
            itemset
            for itemset, count in itemset_counts.items()
            if count >= min_support
        }
        if not frequent_itemsets:
            break

        # Add the frequent itemsets to the output and move to the next level
        frequent_items.update(frequent_itemsets)
        itemsets = frequent_itemsets
        k += 1
    return frequent_items
# Smoke test on a small hand-made dataset: print every itemset that occurs
# in at least 3 of the 6 sample transactions, followed by a blank line.
pprint.pprint(
    get_frequent_itemsets(
        transactions=[
            {'A', 'B', 'D', 'E'},
            {'B', 'C', 'E'},
            {'A', 'B', 'D', 'E'},
            {'A', 'B', 'C', 'E'},
            {'A', 'B', 'C', 'D', 'E'},
            {'B', 'C', 'D'},
        ],
        min_support=3,
    )
)
print()
# Main program: load the CSV, preview it, and mine it at MIN_SUPPORT.
df = pd.read_csv(PATH_TO_CSV, header=0, index_col=False)
print(df.head())

# Build one set per row. The value in column 0 is used as the number of item
# columns to take, so the slice covers columns 1..count of that row.
transactions = [
    set(df.iloc[row, 1: df.iloc[row, 0] + 1])
    for row in range(df.shape[0])
]
pprint.pprint(transactions[0:5])
print()

print(f'For min_support={MIN_SUPPORT}:')
pprint.pprint(get_frequent_itemsets(transactions, MIN_SUPPORT))
"""Output: | |
{frozenset({'A', 'E', 'B'}), | |
frozenset({'E', 'B', 'C'}), | |
frozenset({'D'}), | |
frozenset({'A'}), | |
frozenset({'A', 'D'}), | |
frozenset({'B'}), | |
frozenset({'A', 'B'}), | |
frozenset({'B', 'D'}), | |
frozenset({'A', 'D', 'B'}), | |
frozenset({'E'}), | |
frozenset({'E', 'C'}), | |
frozenset({'E', 'B'}), | |
frozenset({'E', 'D'}), | |
frozenset({'E', 'B', 'D'}), | |
frozenset({'C'}), | |
frozenset({'B', 'C'}), | |
frozenset({'E', 'A'}), | |
frozenset({'A', 'E', 'D'}), | |
frozenset({'A', 'B', 'E', 'D'})} | |
Item(s) Item 1 Item 2 Item 3 Item 4 Item 5 Item 6 Item 7 Item 8 Item 9 ... Item 23 Item 24 Item 25 Item 26 Item 27 Item 28 Item 29 Item 30 Item 31 Item 32 | |
0 4 citrus fruit semi-finished bread margarine ready soups NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN | |
1 3 tropical fruit yogurt coffee NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN | |
2 1 whole milk NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN | |
3 4 pip fruit yogurt cream cheese meat spreads NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN | |
4 4 other vegetables whole milk condensed milk long life bakery product NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN | |
[5 rows x 33 columns] | |
[{'citrus fruit', 'semi-finished bread', 'margarine', 'ready soups'}, | |
{'tropical fruit', 'yogurt', 'coffee'}, | |
{'whole milk'}, | |
{'meat spreads', 'yogurt', 'cream cheese', 'pip fruit'}, | |
{'condensed milk', | |
'long life bakery product', | |
'other vegetables', | |
'whole milk'}] | |
For min_support=12: | |
{frozenset({'rolls/buns', 'whole milk'}), | |
frozenset({'citrus fruit'}), | |
frozenset({'curd'}), | |
frozenset({'shopping bags'}), | |
frozenset({'bottled water'}), | |
frozenset({'whole milk'}), | |
frozenset({'fruit/vegetable juice'}), | |
frozenset({'yogurt'}), | |
frozenset({'coffee'}), | |
frozenset({'frankfurter'}), | |
frozenset({'root vegetables'}), | |
frozenset({'tropical fruit'}), | |
frozenset({'soda'}), | |
frozenset({'sugar'}), | |
frozenset({'newspapers'}), | |
frozenset({'rolls/buns'}), | |
frozenset({'pastry'}), | |
frozenset({'canned beer'}), | |
frozenset({'bottled beer'}), | |
frozenset({'sausage'}), | |
frozenset({'other vegetables'}), | |
frozenset({'whole milk', 'other vegetables'})} | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment