Last active
June 7, 2020 07:02
-
-
Save richiefrost/f86bcf615a1fd838b8a69fe290f5b54a to your computer and use it in GitHub Desktop.
RFM Clustering example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
def score_bins(values: pd.Series, bins: int = 4, reverse: bool = False) -> pd.Series:
    """Return an integer bin score (0 .. bins-1) for each entry of *values*.

    ``pd.cut`` splits the observed value range into ``bins`` equal-width
    intervals (not equal-population ones).  ``labels=False`` makes it return
    the integer bin index instead of a categorical of ``Interval`` objects,
    which is what lets the scores be reversed and summed afterwards.

    Args:
        values: Numeric column to discretise.
        bins: Number of equal-width bins (an integer).
        reverse: When true, flip the scale so the lowest raw values get the
            highest score.

    Returns:
        A Series of bin indices aligned with *values*.
    """
    codes = pd.cut(values, bins=bins, labels=False)
    if reverse:
        # Works for any bin count, unlike a hard-coded {3: 0, 2: 1, ...} map.
        codes = (bins - 1) - codes
    return codes


def add_rfm_scores(df: pd.DataFrame, bins: int = 4) -> pd.DataFrame:
    """Add Frequency, Recency, MonetaryValue and ValueScore columns to *df*.

    Reads the ``RequestsPerMonth``, ``DaysSinceLastRequest`` and
    ``OrderTotal`` columns.  *df* is modified in place and also returned.
    """
    df['Frequency'] = score_bins(df['RequestsPerMonth'], bins=bins)
    # Since lower recency is better, the order of the bins is reversed.
    df['Recency'] = score_bins(df['DaysSinceLastRequest'], bins=bins, reverse=True)
    df['MonetaryValue'] = score_bins(df['OrderTotal'], bins=bins)
    # Calculate value score based on these metrics.
    df['ValueScore'] = df[['Frequency', 'Recency', 'MonetaryValue']].sum(axis=1)
    return df


def rank_clusters_by_value(labels: pd.Series, scores: pd.Series) -> pd.Series:
    """Relabel cluster ids so that 0 is the lowest-value cluster.

    Cluster ids coming out of K-means are arbitrary.  Each cluster's 1-D
    centroid is taken as the mean of *scores* within it, and every id is
    replaced by the rank of its centroid (ascending).

    Args:
        labels: Cluster id per row.
        scores: Value score per row, sharing the index of *labels*.

    Returns:
        A Series of value-ordered cluster indices aligned with *labels*.
    """
    # Keep the group index: it carries the real cluster ids, so the mapping
    # stays correct even if some id never appears in *labels*.
    centroids = scores.groupby(labels).mean()
    ordered_ids = centroids.sort_values(kind='mergesort').index
    rank_of = {cluster_id: rank for rank, cluster_id in enumerate(ordered_ids)}
    # map() is a plain per-element lookup, so swapping ids (0 <-> 2) is safe.
    return labels.map(rank_of)


def add_value_clusters(df: pd.DataFrame, n_clusters: int = 3, random_state=None) -> pd.DataFrame:
    """Cluster rows on ``ValueScore`` and store the result in ``ValueCluster``.

    With the default three clusters the labels read as Low (0), Medium (1)
    and High (2) value.  *df* is modified in place and also returned.

    Args:
        df: Frame that already has a ``ValueScore`` column.
        n_clusters: Number of K-means clusters.
        random_state: Forwarded to ``KMeans``; set it for reproducible runs.
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    raw_labels = pd.Series(model.fit_predict(df[['ValueScore']]), index=df.index)
    # Replace random cluster indices with value-ordered indices.
    df['ValueCluster'] = rank_clusters_by_value(raw_labels, df['ValueScore'])
    return df


if __name__ == '__main__':
    # Guarded so that importing this module does not touch the filesystem;
    # running it as a script still leaves the scored frame in `df`.
    df = pd.read_csv('user_history.csv')
    df = add_rfm_scores(df)
    df = add_value_clusters(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment