Skip to content

Instantly share code, notes, and snippets.

@cnmoro
Created April 30, 2025 19:42
Show Gist options
  • Save cnmoro/be96473243f413c214f30058080a5752 to your computer and use it in GitHub Desktop.
Save cnmoro/be96473243f413c214f30058080a5752 to your computer and use it in GitHub Desktop.
Sequence Mining Analysis
import pandas as pd
def _transition_percentages(source, target):
    """Cross-tabulate two aligned Series and row-normalise the counts to percent.

    Each row of the returned frame is one ``source`` value; each column is one
    ``target`` value; each cell is the share (0-100) of that row's pairs that
    have the given ``target`` value. Pairs where either side is missing are
    ignored by ``pd.crosstab``.
    """
    counts = pd.crosstab(source, target)
    # crosstab includes every declared category of a Categorical input even
    # when it was never observed, producing all-zero rows/columns. Drop them
    # so they neither show up in the output nor cause a 0/0 division below.
    counts = counts.loc[:, (counts != 0).any(axis=0)]
    counts = counts.loc[(counts != 0).any(axis=1)]
    return counts.div(counts.sum(axis=1), axis=0).mul(100)


def _ranked_shares(percentages, category):
    """Return ``category``'s positive shares as ``[{label: "12.3456%"}, ...]``.

    Entries are ordered from the largest share to the smallest. An empty list
    is returned when ``category`` has no row in ``percentages``.
    """
    if category not in percentages.index:
        return []
    row = percentages.loc[category].dropna()
    row = row[row > 0].sort_values(ascending=False)
    return [{label: f"{share:.4f}%"} for label, share in row.items()]


def sequence_mining_analysis(df: pd.DataFrame, id_col, date_col, cat_col) -> list:
    """Summarise which categories directly precede and follow each category.

    Rows are ordered by ``id_col`` and then ``date_col``. Within each
    ``id_col`` group, every row is paired with the category of the row
    immediately before it. From those (previous, current) pairs two
    distributions are derived for each category:

    * ANTECEDENTS - of the times the category had a predecessor, the
      percentage contributed by each preceding category.
    * CONSEQUENTS - of the times the category had a successor, the
      percentage contributed by each following category.

    Args:
        df: Input frame; it is not modified.
        id_col: Column identifying the sequence each row belongs to.
        date_col: Datetime column used to order rows inside a sequence.
        cat_col: Column holding the category of each row.

    Returns:
        A list with one dict per distinct non-null category, in order of
        first appearance after sorting. Each dict has the keys
        ``"CATEGORY"``, ``"ANTECEDENTS"`` and ``"CONSEQUENTS"``; the latter
        two are lists of single-entry dicts ``{category: "NN.NNNN%"}``
        sorted by descending percentage (empty when there are none).

    Raises:
        TypeError: If ``date_col`` is not a datetime column.
    """
    if not pd.api.types.is_datetime64_any_dtype(df[date_col]):
        raise TypeError(f"Column '{date_col}' must be datetime.")

    ordered = df[[id_col, date_col, cat_col]].sort_values([id_col, date_col])

    # Keep the current/previous categories as standalone Series instead of
    # writing helper columns into the frame: a caller column that happens to
    # be called "prev" or "curr" must not be overwritten.
    curr = ordered[cat_col]
    # The shift happens before null categories are removed, so a row whose
    # category is missing still breaks the chain between its neighbours.
    prev = ordered.groupby(id_col)[cat_col].shift()

    has_category = curr.notna()
    curr = curr[has_category].rename("curr")
    prev = prev[has_category].rename("prev")

    antecedent_pct = _transition_percentages(curr, prev)
    consequent_pct = _transition_percentages(prev, curr)

    return [
        {
            "CATEGORY": category,
            "ANTECEDENTS": _ranked_shares(antecedent_pct, category),
            "CONSEQUENTS": _ranked_shares(consequent_pct, category),
        }
        for category in curr.unique()
    ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment