Skip to content

Instantly share code, notes, and snippets.

@alitrack
Forked from glamp/customer-segmentation.py
Created May 3, 2017 08:37
Show Gist options
  • Save alitrack/c2be103b7f967f98e7b577111abf7d44 to your computer and use it in GitHub Desktop.
Save alitrack/c2be103b7f967f98e7b577111abf7d44 to your computer and use it in GitHub Desktop.
Analysis for customer segmentation blog post
import pandas as pd
# Source workbook: http://blog.yhathq.com/static/misc/data/WineKMC.xlsx
# The first sheet is loaded as the offers table and the second sheet as the
# transactions table.
# NOTE: the keyword is `sheet_name`; the older `sheetname` spelling is rejected
# by current pandas releases.
df_offers = pd.read_excel("./WineKMC.xlsx", sheet_name=0)
# Replace the spreadsheet headers with short, code-friendly column names.
df_offers.columns = ["offer_id", "campaign", "varietal", "min_qty", "discount", "origin", "past_peak"]
df_offers.head()
df_transactions = pd.read_excel("./WineKMC.xlsx", sheet_name=1)
df_transactions.columns = ["customer_name", "offer_id"]
# Constant marker column; it is the value aggregated when the transactions
# are pivoted into a customer x offer matrix.
df_transactions['n'] = 1
df_transactions.head()
# Combine offers with transactions (inner join on their shared column(s)).
df = df_offers.merge(df_transactions)
# Build a customer x offer table from the marker column 'n'. pivot_table
# aggregates with the mean by default, and 'n' is always 1, so a cell is 1
# when the customer responded to that offer and NaN otherwise. The NaNs are
# then replaced with 0 and customer_name is moved from the index into a
# regular column.
matrix = (
    df.pivot_table(values='n', index=['customer_name'], columns=['offer_id'])
    .fillna(0)
    .reset_index()
)
# Everything after the leading customer_name column is an offer indicator.
x_cols = matrix.columns[1:]
from sklearn.cluster import KMeans
# Fix the seed so that cluster labels are the same on every run; without it
# the numeric label assigned to each group is arbitrary, and any later step
# that refers to a specific label (e.g. cluster 4) would not be reproducible.
cluster = KMeans(n_clusters=5, random_state=0)
# slice matrix so we only include the 0/1 indicator columns in the clustering
matrix['cluster'] = cluster.fit_predict(matrix[x_cols])
# Number of customers assigned to each cluster.
matrix.cluster.value_counts()
from ggplot import *
# Bar chart: one bar per cluster label, bar height = customers in the cluster.
(
    ggplot(matrix, aes(x='factor(cluster)'))
    + geom_bar()
    + xlab("Cluster")
    + ylab("Customers\n(# in cluster)")
)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
# Fit the projection once and take both coordinates from the same result.
# Fitting separately for each coordinate repeats the work and would let the
# two columns come from different fits if the solver were not deterministic.
coords = pca.fit_transform(matrix[x_cols])
matrix['x'] = coords[:, 0]
matrix['y'] = coords[:, 1]
# NOTE(review): customer_name is already a regular column at this point, so
# this reset only adds an 'index' column of row numbers; nothing in the
# visible script reads it.
matrix = matrix.reset_index()
# Per-customer cluster label plus 2-D coordinates, used for plotting.
customer_clusters = matrix[['customer_name', 'cluster', 'x', 'y']]
customer_clusters.head()
# Attach each customer's cluster label and 2-D coordinates to their
# transactions, then join the offer details onto every transaction row.
df = df_offers.merge(df_transactions.merge(customer_clusters))
from ggplot import *
# Scatter plot of the projected points, coloured by cluster label.
(
    ggplot(df, aes(x='x', y='y', color='cluster'))
    + geom_point(size=75)
    + ggtitle("Customers Grouped by Cluster")
)
# Project the fitted KMeans centroids with the same PCA used for the
# customers so that both can be drawn on the same axes.
cluster_centers = pd.DataFrame(
    pca.transform(cluster.cluster_centers_),
    columns=['x', 'y'],
)
# Row i of cluster_centers_ is the centroid of cluster i.
cluster_centers['cluster'] = range(len(cluster_centers))
# Same scatter plot as before, with the centroids overlaid as larger points.
(
    ggplot(df, aes(x='x', y='y', color='cluster'))
    + geom_point(size=75)
    + geom_point(cluster_centers, size=500)
    + ggtitle("Customers Grouped by Cluster")
)
# Flag the rows belonging to cluster 4 so it can be compared with the rest.
df['is_4'] = df['cluster'] == 4
# Varietal frequencies inside vs. outside cluster 4.
df.groupby("is_4")['varietal'].value_counts()
# Average minimum quantity and discount inside vs. outside cluster 4.
df.groupby("is_4")[['min_qty', 'discount']].mean()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment