alitrack · May 3, 2017 08:37
diff --git a/customer-segmentation.py b/customer-segmentation.py
 import pandas as pd
 # Source workbook: http://blog.yhathq.com/static/misc/data/WineKMC.xlsx
 # The first sheet is loaded as the offers table and the second as the
 # transactions table; both get explicit column names below.
 # NOTE: `sheet_name` is the supported keyword. The older `sheetname`
 # spelling was deprecated in pandas 0.21 and removed in later releases,
 # where it raises a TypeError.
 df_offers = pd.read_excel("./WineKMC.xlsx", sheet_name=0)
 df_offers.columns = ["offer_id", "campaign", "varietal", "min_qty", "discount", "origin", "past_peak"]
 df_offers.head()

 df_transactions = pd.read_excel("./WineKMC.xlsx", sheet_name=1)
 df_transactions.columns = ["customer_name", "offer_id"]
 # Constant marker column; it is the value that gets aggregated when the
 # transactions are pivoted into a customer-by-offer table.
 df_transactions['n'] = 1
 df_transactions.head()

 # Combine offers with transactions; the two tables share the offer_id column.
 df = df_offers.merge(df_transactions)
 # Build a customer-by-offer table from the 'n' marker column. Cells for
 # offers a customer never responded to come out as NaN.
 matrix = pd.pivot_table(df, values='n', index=['customer_name'], columns=['offer_id'])
 # Replace the NaN cells with 0, then move customer_name out of the index
 # so it becomes the first ordinary column.
 matrix = matrix.fillna(0)
 matrix = matrix.reset_index()
 # Every column after the first (customer_name) is an offer column.
 x_cols = matrix.columns[1:]

 from sklearn.cluster import KMeans

 # Seed the estimator so the label assigned to each group is reproducible
 # between runs: KMeans initialisation is random by default, and the later
 # `cluster==4` comparison relies on a specific label number.
 cluster = KMeans(n_clusters=5, random_state=0)
 # Slice matrix so only the per-offer indicator columns feed the clustering
 # (customer_name is excluded via x_cols).
 matrix['cluster'] = cluster.fit_predict(matrix[x_cols])
 matrix.cluster.value_counts()

 from ggplot import *
 # Bar chart: number of customers assigned to each cluster label.
 (ggplot(matrix, aes(x='factor(cluster)'))
  + geom_bar()
  + xlab("Cluster")
  + ylab("Customers\n(# in cluster)"))

 from sklearn.decomposition import PCA

 pca = PCA(n_components=2)
 # Fit the 2-component projection once and take both coordinates from that
 # single result. Calling fit_transform separately for x and y repeats the
 # decomposition and does not guarantee both columns come from the same
 # fitted model.
 matrix['x'], matrix['y'] = pca.fit_transform(matrix[x_cols]).T
 matrix = matrix.reset_index()

 # Per-customer summary: name, assigned cluster label and 2-D coordinates.
 customer_clusters = matrix[['customer_name', 'cluster', 'x', 'y']]
 customer_clusters.head()

 # Attach each customer's cluster label and PCA coordinates to their
 # transactions, then bring in the offer details for every transaction.
 df = df_offers.merge(df_transactions.merge(customer_clusters))

 from ggplot import *

 # Scatter of the merged rows at their PCA x/y position, coloured by cluster.
 (ggplot(df, aes(x='x', y='y', color='cluster'))
  + geom_point(size=75)
  + ggtitle("Customers Grouped by Cluster"))
    
 # Project the fitted KMeans centroids through the same PCA so they share
 # the scatter plot's x/y coordinates, and number them in row order.
 cluster_centers = pd.DataFrame(
     pca.transform(cluster.cluster_centers_),
     columns=['x', 'y'],
 )
 cluster_centers['cluster'] = range(len(cluster_centers))

 # Customer scatter again, this time with the centroids overlaid as larger points.
 (ggplot(df, aes(x='x', y='y', color='cluster'))
  + geom_point(size=75)
  + geom_point(cluster_centers, size=500)
  + ggtitle("Customers Grouped by Cluster"))


 # Flag rows whose cluster label is 4, then compare flagged vs. unflagged
 # rows: varietal frequencies, and mean min_qty / discount.
 df['is_4'] = df['cluster'].eq(4)
 df.groupby("is_4")['varietal'].value_counts()
 df.groupby("is_4")[['min_qty', 'discount']].agg('mean')
	import pandas as pd
	# Source workbook: http://blog.yhathq.com/static/misc/data/WineKMC.xlsx
	# The first sheet is loaded as the offers table and the second as the
	# transactions table; both get explicit column names below.
	# NOTE: `sheet_name` is the supported keyword. The older `sheetname`
	# spelling was deprecated in pandas 0.21 and removed in later releases,
	# where it raises a TypeError.
	df_offers = pd.read_excel("./WineKMC.xlsx", sheet_name=0)
	df_offers.columns = ["offer_id", "campaign", "varietal", "min_qty", "discount", "origin", "past_peak"]
	df_offers.head()

	df_transactions = pd.read_excel("./WineKMC.xlsx", sheet_name=1)
	df_transactions.columns = ["customer_name", "offer_id"]
	# Constant marker column; it is the value that gets aggregated when the
	# transactions are pivoted into a customer-by-offer table.
	df_transactions['n'] = 1
	df_transactions.head()

	# Combine offers with transactions; the two tables share the offer_id column.
	df = df_offers.merge(df_transactions)
	# Build a customer-by-offer table from the 'n' marker column. Cells for
	# offers a customer never responded to come out as NaN.
	matrix = pd.pivot_table(df, values='n', index=['customer_name'], columns=['offer_id'])
	# Replace the NaN cells with 0, then move customer_name out of the index
	# so it becomes the first ordinary column.
	matrix = matrix.fillna(0)
	matrix = matrix.reset_index()
	# Every column after the first (customer_name) is an offer column.
	x_cols = matrix.columns[1:]

	from sklearn.cluster import KMeans

	# Seed the estimator so the label assigned to each group is reproducible
	# between runs: KMeans initialisation is random by default, and the later
	# `cluster==4` comparison relies on a specific label number.
	cluster = KMeans(n_clusters=5, random_state=0)
	# Slice matrix so only the per-offer indicator columns feed the clustering
	# (customer_name is excluded via x_cols).
	matrix['cluster'] = cluster.fit_predict(matrix[x_cols])
	matrix.cluster.value_counts()

	from ggplot import *
	# Bar chart: number of customers assigned to each cluster label.
	(ggplot(matrix, aes(x='factor(cluster)'))
		+ geom_bar()
		+ xlab("Cluster")
		+ ylab("Customers\n(# in cluster)"))

	from sklearn.decomposition import PCA

	pca = PCA(n_components=2)
	# Fit the 2-component projection once and take both coordinates from that
	# single result. Calling fit_transform separately for x and y repeats the
	# decomposition and does not guarantee both columns come from the same
	# fitted model.
	matrix['x'], matrix['y'] = pca.fit_transform(matrix[x_cols]).T
	matrix = matrix.reset_index()

	# Per-customer summary: name, assigned cluster label and 2-D coordinates.
	customer_clusters = matrix[['customer_name', 'cluster', 'x', 'y']]
	customer_clusters.head()

	# Attach each customer's cluster label and PCA coordinates to their
	# transactions, then bring in the offer details for every transaction.
	df = df_offers.merge(df_transactions.merge(customer_clusters))

	from ggplot import *

	# Scatter of the merged rows at their PCA x/y position, coloured by cluster.
	(ggplot(df, aes(x='x', y='y', color='cluster'))
		+ geom_point(size=75)
		+ ggtitle("Customers Grouped by Cluster"))

	# Project the fitted KMeans centroids through the same PCA so they share
	# the scatter plot's x/y coordinates, and number them in row order.
	cluster_centers = pd.DataFrame(
		pca.transform(cluster.cluster_centers_),
		columns=['x', 'y'],
	)
	cluster_centers['cluster'] = range(len(cluster_centers))

	# Customer scatter again, this time with the centroids overlaid as larger points.
	(ggplot(df, aes(x='x', y='y', color='cluster'))
		+ geom_point(size=75)
		+ geom_point(cluster_centers, size=500)
		+ ggtitle("Customers Grouped by Cluster"))


	# Flag rows whose cluster label is 4, then compare flagged vs. unflagged
	# rows: varietal frequencies, and mean min_qty / discount.
	df['is_4'] = df['cluster'].eq(4)
	df.groupby("is_4")['varietal'].value_counts()
	df.groupby("is_4")[['min_qty', 'discount']].agg('mean')