dc customer segmentation
# *-----------------------------------------------------------------
# | PROGRAM NAME: dc customer segmentation
# | DATE: 5/17/19
# | CREATED BY: MATT BOGARD
# | PROJECT FILE: /Users/amandabogard/Google Drive/Python Scripts
# *----------------------------------------------------------------
# | PURPOSE: code from Data Camp course
# *----------------------------------------------------------------
import pandas as pd
import datetime as dt
import numpy as np  # used below for the log transformations
# ----------------------------------------------
# Cohort Analysis
# ----------------------------------------------
# read data
online = pd.read_csv('/Users/amandabogard/Google Drive/Python Scripts/DC Customer Segmentation/chapter_1/online.csv')
online.head()
online.columns
online.info()
# Convert InvoiceDate from string to datetime
# Prepare a format string: time_format
time_format = '%Y-%m-%d %H:%M'
# Parse InvoiceDate using the format string
online['InvoiceDate'] = pd.to_datetime(online['InvoiceDate'], format=time_format)
online.info() # verify the dtype changed
### Assign daily acquisition cohort
# Define a function that truncates a timestamp to its date
def get_day(x): return dt.datetime(x.year, x.month, x.day)
# Create InvoiceDay column
online['InvoiceDay'] = online['InvoiceDate'].apply(get_day)
# Group by CustomerID and select the InvoiceDay value
grouping = online.groupby('CustomerID')['InvoiceDay']
# Assign each customer's earliest InvoiceDay as their CohortDay
online['CohortDay'] = grouping.transform('min')
### get cohort month
def get_month(x): return dt.datetime(x.year, x.month, 1) # define function to get month
online['InvoiceMonth'] = online['InvoiceDate'].apply(get_month) # create InvoiceMonth
grouping = online.groupby('CustomerID')['InvoiceMonth'] # create grouping by month
online['CohortMonth'] = grouping.transform('min') # get first month as CohortMonth
# View the top 5 rows
print(online.head())
### let's look at the data for customer 12747
subset = (online['CustomerID'] == 12747) # create subset logic
tmp = online[subset] # subset data
tmp = tmp[['CustomerID','InvoiceDay','CohortDay','InvoiceMonth','CohortMonth']] # look only at date columns
tmp = tmp.sort_values(['InvoiceDay']) # sort by invoice date
print(tmp) # verify that each person's cohort day is based on their 1st invoice
### Calculate time offset in days - part 1
def get_date_int(df, column):
    year = df[column].dt.year
    month = df[column].dt.month
    day = df[column].dt.day
    return year, month, day
# Get the integers for date parts from the `InvoiceDay` column
invoice_year, invoice_month, invoice_day = get_date_int(online, 'InvoiceDay')
# Get the integers for date parts from the `CohortDay` column
cohort_year, cohort_month, cohort_day = get_date_int(online, 'CohortDay')
### Calculate time offset in days - part 2
# Calculate difference in years
years_diff = invoice_year - cohort_year
# Calculate difference in months
months_diff = invoice_month - cohort_month
# Calculate difference in days
days_diff = invoice_day - cohort_day
# Combine the year/month/day differences into a day offset (approximating a month
# as 30 days and a year as 365); the +1 makes the acquisition day index 1
online['CohortIndex'] = years_diff * 365 + months_diff * 30 + days_diff + 1
print(online.head())
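### A minimal alternative sketch: the formula above approximates months as 30 days and
### years as 365; because InvoiceDay and CohortDay are real datetimes, an exact offset
### can be taken from the timedelta directly (CohortIndexExact is a hypothetical name)
online['CohortIndexExact'] = (online['InvoiceDay'] - online['CohortDay']).dt.days + 1
print(online[['CohortDay', 'InvoiceDay', 'CohortIndex', 'CohortIndexExact']].head())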
### check our test case
subset = (online['CustomerID'] == 12747) # create subset logic
tmp = online[subset] # subset data
tmp = tmp[['CustomerID','InvoiceDay','CohortDay','CohortIndex']] # look only at date columns
tmp = tmp.sort_values(['InvoiceDay']) # sort by invoice date
print(tmp) # verify the CohortIndex counts days since each person's 1st invoice
### Calculate retention rate from scratch
# create grouping object
grouping = online.groupby(['CohortMonth', 'CohortIndex'])
grouping.first() # check
# Count the number of unique customers in each (CohortMonth, CohortIndex) group
cohort_data = grouping['CustomerID'].apply(pd.Series.nunique).reset_index()
# Create a pivot
cohort_counts = cohort_data.pivot(index='CohortMonth', columns='CohortIndex', values='CustomerID')
# Select the first column (cohort size at acquisition) and store it as cohort_sizes
cohort_sizes = cohort_counts.iloc[:,0]
# Divide the cohort counts by cohort sizes along the rows
retention = cohort_counts.divide(cohort_sizes, axis=0)
### check
tmp = online[['CustomerID','CohortMonth','CohortIndex']] # copy & subset data
tmp_grp = tmp.groupby(['CohortMonth', 'CohortIndex']) # create grouping object
tmp_grp.first() # check
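### A hedged sketch: visualize the retention table as a heatmap, mirroring the
### average-spend heatmap below; fmt='.0%' renders the rates as percentages
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 8))
plt.title('Retention rates by monthly cohort')
sns.heatmap(data=retention, annot=True, fmt='.0%', cmap='BuGn')
plt.show()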
### Calculate average price
# Create a groupby object and pass the monthly cohort and cohort index as a list
grouping = online.groupby(['CohortMonth', 'CohortIndex'])
# Calculate the average of the unit price
cohort_data = grouping['UnitPrice'].mean()
# Reset the index of cohort_data
cohort_data = cohort_data.reset_index()
# Create a pivot (the course reuses the name average_quantity, though this holds average UnitPrice)
average_quantity = cohort_data.pivot(index='CohortMonth', columns='CohortIndex', values='UnitPrice')
print(average_quantity.round(1))
### Visualize the average price metric
import matplotlib.pyplot as plt
# Import seaborn package as sns
import seaborn as sns
# Initialize an 8 by 6 inches plot figure
plt.figure(figsize=(8,6))
# Add a title
plt.title('Average Spend by Monthly Cohorts')
# Create the heatmap
sns.heatmap(data=average_quantity, annot=True, cmap='Blues')
plt.show()
# ----------------------------------------------
# Recency, Frequency, Monetary Value analysis
# ----------------------------------------------
# NOTE: `data` below is the course's practice DataFrame (not loaded in this script)
# Create a spend quartile with 4 groups, labeled 1 through 4
spend_quartile = pd.qcut(data['Spend'], q=4, labels=range(1,5))
# Assign the quartile values to the Spend_Quartile column in data
data['Spend_Quartile'] = spend_quartile
# Print data with sorted Spend values
print(data.sort_values('Spend'))
### Calculate Recency quartiles (q=4)
# Store labels from 4 to 1 in decreasing order (more recent = higher score)
r_labels = list(range(4, 0, -1))
# Create recency quartiles with 4 groups and pass the previously created labels
recency_quartiles = pd.qcut(data['Recency_Days'], q=4, labels=r_labels)
# Assign the quartile values to the Recency_Quartile column in `data`
data['Recency_Quartile'] = recency_quartiles
# Print `data` with sorted Recency_Days values
print(data.sort_values('Recency_Days'))
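### A sketch of the decile version hinted at by the original heading (q=10): the same
### qcut pattern with ten groups and labels 10..1 (Recency_Decile is a hypothetical name)
r_labels_10 = list(range(10, 0, -1))
data['Recency_Decile'] = pd.qcut(data['Recency_Days'], q=10, labels=r_labels_10)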
### Calculate RFM values
# Calculate Recency, Frequency and Monetary values for the online dataset used
# before; this file holds the most recent 12 months of data. A TotalSum column
# is computed below by multiplying Quantity and UnitPrice:
# online['Quantity'] * online['UnitPrice']
# A snapshot_date variable is also created below and used to calculate recency.
# read data
online12m = pd.read_csv('/Users/amandabogard/Google Drive/Python Scripts/DC Customer Segmentation/chapter_2/online12M.csv')
online12m.info() # check data formats
# Convert InvoiceDate from string to datetime
# Prepare a format string: time_format
time_format = '%Y-%m-%d %H:%M'
online12m['InvoiceDate'] = pd.to_datetime(online12m['InvoiceDate'], format=time_format)
online12m.info() # check (the original inspected `online` here by mistake)
# create total sum metric
online12m['TotalSum'] = online12m['Quantity'] * online12m['UnitPrice']
# snapshot date = most recent date in this historical file + 1 day
snapshot_date = max(online12m.InvoiceDate) + dt.timedelta(days=1)
# (optional) also store the snapshot date as a column
online12m['snapshot_date'] = snapshot_date
# Calculate Recency, Frequency and Monetary value for each customer
datamart = online12m.groupby(['CustomerID']).agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,
    'InvoiceNo': 'count',
    'TotalSum': 'sum'})
# Rename the columns
datamart.rename(columns={'InvoiceDate': 'Recency',
                         'InvoiceNo': 'Frequency',
                         'TotalSum': 'MonetaryValue'}, inplace=True)
# Print top 5 rows
print(datamart.head())
### Calculate 3 groups for Recency and Frequency
# Create labels: Recency scores decrease as days-since-purchase grow; Frequency scores increase
r_labels = range(3, 0, -1); f_labels = range(1, 4)
# Assign the recency labels to three equal percentile groups
r_groups = pd.qcut(datamart['Recency'], q=3, labels=r_labels)
# Assign the frequency labels to three equal percentile groups
f_groups = pd.qcut(datamart['Frequency'], q=3, labels=f_labels)
# Create new columns R and F
datamart = datamart.assign(R = r_groups.values, F = f_groups.values)
### Calculate RFM Score
# Create labels for MonetaryValue
m_labels = range(1, 4)
# Assign these labels to three equal percentile groups
m_groups = pd.qcut(datamart['MonetaryValue'], q=3, labels=m_labels)
# Create new column M
datamart = datamart.assign(M = m_groups.values)
# Calculate RFM_Score (R, F, M are categorical; cast to int so the
# row-wise sum works across pandas versions)
datamart['RFM_Score'] = datamart[['R','F','M']].astype(int).sum(axis=1)
print(datamart['RFM_Score'].head())
### Concatenate RFM quartile values to RFM_Segment
# define function to concatenate RFM values
def join_rfm(x): return str(x['R']) + str(x['F']) + str(x['M'])
datamart['RFM_Segment'] = datamart.apply(join_rfm, axis=1)
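### An equivalent vectorized sketch: concatenate the columns directly instead of
### applying a Python function row by row (faster on large frames)
datamart['RFM_Segment'] = (datamart['R'].astype(str) + datamart['F'].astype(str)
                           + datamart['M'].astype(str))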
### explore segments
# look at the ten largest segments
datamart.groupby('RFM_Segment').size().sort_values(ascending=False)[:10]
# Select bottom RFM segment "111" and view top 5 rows
datamart[datamart['RFM_Segment']=='111'][:5]
# Summary metrics per RFM Score
datamart.groupby('RFM_Score').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': ['mean','count']}).round(1)
### Creating custom segments
# Define rfm_level function
# NOTE: these thresholds come from the course's quartile-based score (range 3-12);
# with three groups per metric, RFM_Score here ranges 3-9, so 'Top' is unreachable
# unless the threshold is lowered (e.g. >= 9)
def rfm_level(df):
    if df['RFM_Score'] >= 10:
        return 'Top'
    elif df['RFM_Score'] >= 6:  # upper bound check is redundant after the first branch
        return 'Middle'
    else:
        return 'Low'
# Create a new variable RFM_Level
datamart['RFM_Level'] = datamart.apply(rfm_level, axis=1)
# Print the header with top 5 rows to the console
print(datamart.head())
### Analyzing custom segments
# Calculate average values for each RFM_Level, and return the size of each segment
rfm_level_agg = datamart.groupby('RFM_Level').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    # 'count' returns the size of each segment
    'MonetaryValue': ['mean', 'count']
}).round(1)
# Print the aggregated dataset
print(rfm_level_agg)
#--------------------------------------------
# Data pre-processing for clustering
#--------------------------------------------
### Calculate statistics of variables
### examples
# Print the average values of the numeric variables (numeric_only skips the
# categorical R/F/M columns added above)
print(datamart.mean(numeric_only=True))
# Print the standard deviation of the numeric variables
print(datamart.std(numeric_only=True))
# Use `describe` to get key statistics of the dataset
print(datamart.describe())
### Detect skewed variables
# import seaborn as sns
# from matplotlib import pyplot as plt
sns.distplot(datamart['Recency'])
plt.show()
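### Note: distplot was deprecated in seaborn 0.11 and removed in later releases; on
### newer seaborn the equivalent plot is histplot (a sketch, assuming seaborn >= 0.11)
sns.histplot(datamart['Recency'], kde=True)
plt.show()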
### examples (var1..var3 are placeholder columns from the course's practice data)
# Plot distribution of var1
plt.subplot(3, 1, 1); sns.distplot(data['var1'])
# Plot distribution of var2
plt.subplot(3, 1, 2); sns.distplot(data['var2'])
# Plot distribution of var3
plt.subplot(3, 1, 3); sns.distplot(data['var3'])
# Show the plot
plt.show()
### manage skewness
# example:
# Apply log transformation to var2
data['var2_log'] = np.log(data['var2'])
# Apply log transformation to var3
data['var3_log'] = np.log(data['var3'])
# Create a subplot of the distribution of var2_log
plt.subplot(2, 1, 1); sns.distplot(data['var2_log'])
# Create a subplot of the distribution of var3_log
plt.subplot(2, 1, 2); sns.distplot(data['var3_log'])
# Show the plot
plt.show()
### Center and scale manually
# example:
# Center the data by subtracting average values from each entry
data_centered = data - data.mean()
# Scale the data by dividing each entry by standard deviation
data_scaled = data / data.std()
# Normalize the data by applying both centering and scaling
data_normalized = (data - data.mean()) / data.std()
# Print summary statistics to make sure average is zero and standard deviation is one
print(data_normalized.describe().round(2))
### Center and scale with StandardScaler()
# example:
from sklearn.preprocessing import StandardScaler  # import needed for this example
# Initialize a scaler
scaler = StandardScaler()
# Fit the scaler
scaler.fit(data)
# Scale and center the data
data_normalized = scaler.transform(data)
# Create a pandas DataFrame
data_normalized = pd.DataFrame(data_normalized, index=data.index, columns=data.columns)
# Print summary statistics
print(data_normalized.describe().round(2))
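### A consolidated sketch of the sequence the course stresses (unskew first, then
### center and scale); preprocess_rfm is a hypothetical helper and assumes all
### input values are strictly positive
def preprocess_rfm(df):
    log_df = np.log(df)                    # 1. unskew with a log transform
    scaler = StandardScaler().fit(log_df)  # 2. fit the scaler on the log data
    scaled = scaler.transform(log_df)      # 3. center and scale
    return pd.DataFrame(scaled, index=df.index, columns=df.columns)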
### Visualize RFM distributions
# read data (if the CSV stores CustomerID as a column, pass index_col='CustomerID'
# so the log transform and scaling below only touch Recency/Frequency/MonetaryValue)
datamart_rfm = pd.read_csv('/Users/amandabogard/Google Drive/Python Scripts/DC Customer Segmentation/chapter_3/rfm_datamart.csv')
datamart_rfm.info() # check data formats
datamart_rfm.describe()
# Plot recency distribution
plt.subplot(3, 1, 1); sns.distplot(datamart_rfm['Recency'])
# Plot frequency distribution
plt.subplot(3, 1, 2); sns.distplot(datamart_rfm['Frequency'])
# Plot monetary value distribution
plt.subplot(3, 1, 3); sns.distplot(datamart_rfm['MonetaryValue'])
# Show the plot
plt.show()
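### A caveat before the log transform below: np.log yields -inf/NaN for zero or
### negative values (e.g. customers whose refunds outweigh purchases), so it is worth
### asserting positivity first (a hedged sketch; filter such rows if the check fails)
assert (datamart_rfm > 0).all().all(), 'log transform requires strictly positive values'
# e.g.: datamart_rfm = datamart_rfm[datamart_rfm['MonetaryValue'] > 0]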
### Pre-process RFM data
import numpy as np
from sklearn.preprocessing import StandardScaler
# Unskew the data
datamart_log = np.log(datamart_rfm)
# Initialize a standard scaler and fit it
scaler = StandardScaler()
scaler.fit(datamart_log)
# Scale and center the data
datamart_normalized = scaler.transform(datamart_log)
# Create a pandas DataFrame
datamart_normalized = pd.DataFrame(data=datamart_normalized, index=datamart_rfm.index, columns=datamart_rfm.columns)
### Visualize the normalized variables
# Plot recency distribution
plt.subplot(3, 1, 1); sns.distplot(datamart_normalized['Recency'])
# Plot frequency distribution
plt.subplot(3, 1, 2); sns.distplot(datamart_normalized['Frequency'])
# Plot monetary value distribution
plt.subplot(3, 1, 3); sns.distplot(datamart_normalized['MonetaryValue'])
# Show the plot
plt.show()
#-----------------------------------------
# Customer Segmentation with K-means
#-----------------------------------------
# Import KMeans
from sklearn.cluster import KMeans
# Initialize KMeans
kmeans = KMeans(n_clusters=3, random_state=1)
# Fit k-means clustering on the normalized data set
kmeans.fit(datamart_normalized)
# Extract cluster labels
cluster_labels = kmeans.labels_
### Assign labels to raw data
# Create a DataFrame by adding a new cluster label column
datamart_rfm_k3 = datamart_rfm.assign(Cluster=cluster_labels)
# Group the data by cluster
grouped = datamart_rfm_k3.groupby(['Cluster'])
# Calculate average RFM values and segment sizes per cluster value
grouped.agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': ['mean', 'count']
}).round(1)
### Calculate sum of squared errors
# example:
# Fit KMeans and calculate SSE for each k
sse = {}  # dictionary to hold SSE per k (missing in the original example)
for k in range(1, 21):
    # Initialize KMeans with k clusters
    kmeans = KMeans(n_clusters=k, random_state=1)
    # Fit KMeans on the normalized dataset
    kmeans.fit(data_normalized)
    # Assign sum of squared distances to the k element of the dictionary
    sse[k] = kmeans.inertia_
### Plot sum of squared errors
# example:
# Add the plot title "The Elbow Method"
plt.title('The Elbow Method')
# Add X-axis label "k"
plt.xlabel('k')
# Add Y-axis label "SSE"
plt.ylabel('SSE')
# Plot SSE values for each key in the dictionary
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()
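### The melt below assumes datamart_normalized carries the cluster labels and a
### CustomerID index (see the read_csv note above); a sketch of that wiring step,
### which the course excerpt leaves implicit:
datamart_normalized['Cluster'] = datamart_rfm_k3['Cluster']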
### Prepare data for the snake plot
# example:
# Melt the normalized dataset and reset the index
datamart_melt = pd.melt(datamart_normalized.reset_index(),
                        # Assign CustomerID and Cluster as ID variables
                        id_vars=['CustomerID', 'Cluster'],
                        # Assign RFM values as value variables
                        value_vars=['Recency', 'Frequency', 'MonetaryValue'],
                        # Name the variable and value
                        var_name='Metric', value_name='Value')
### Visualize snake plot
# Add the plot title
plt.title('Snake plot of normalized variables')
# Add the x axis label
plt.xlabel('Metric')
# Add the y axis label
plt.ylabel('Value')
# Plot a line for each value of the cluster variable
sns.lineplot(data=datamart_melt, x='Metric', y='Value', hue='Cluster')
plt.show()
### Calculate relative importance
# example:
# Calculate average RFM values for each cluster
cluster_avg = datamart_rfm_k3.groupby(['Cluster']).mean()
# Calculate average RFM values for the total customer population
population_avg = datamart_rfm.mean()
# Calculate relative importance of each cluster's attribute value compared to the population
relative_imp = cluster_avg / population_avg - 1
# Print relative importance scores rounded to 2 decimals
print(relative_imp.round(2))
### Relative importance heatmap
# Initialize a plot with a figure size of 8 by 2 inches
plt.figure(figsize=(8, 2))
# Add the plot title
plt.title('Relative importance of attributes')
# Plot the heatmap
sns.heatmap(data=relative_imp, annot=True, fmt='.2f', cmap='RdYlGn')
plt.show()
# ---- End to end customer segmentation solution ----- #
datamart_rfmt = pd.read_csv('/Users/amandabogard/Google Drive/Python Scripts/DC Customer Segmentation/chapter_4/datamart_rfmt.csv')
### Pre-process data
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
# Apply log transformation
datamart_rfmt_log = np.log(datamart_rfmt)
# Initialize StandardScaler and fit it
scaler = StandardScaler(); scaler.fit(datamart_rfmt_log)
# Transform and store the scaled data as datamart_rfmt_normalized
datamart_rfmt_normalized = scaler.transform(datamart_rfmt_log)
### Calculate and plot sum of squared errors
# Fit KMeans and calculate SSE for each k between 1 and 10
sse = {}  # dictionary to hold SSE per k (missing in the original)
for k in range(1, 11):
    # Initialize KMeans with k clusters and fit it
    kmeans = KMeans(n_clusters=k, random_state=1).fit(datamart_rfmt_normalized)
    # Assign sum of squared distances to the k element of the sse dictionary
    sse[k] = kmeans.inertia_
# Add the plot title, x and y axis labels
plt.title('The Elbow Method'); plt.xlabel('k'); plt.ylabel('SSE')
# Plot SSE values for each k stored as keys in the dictionary
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()
### Build 4-cluster solution
# Import KMeans
from sklearn.cluster import KMeans
# Initialize KMeans
kmeans = KMeans(n_clusters=4, random_state=1)
# Fit k-means clustering on the normalized data set
kmeans.fit(datamart_rfmt_normalized)
# Extract cluster labels
cluster_labels = kmeans.labels_
### Analyze the segments
# Create a new DataFrame by adding a cluster label column to datamart_rfmt
datamart_rfmt_k4 = datamart_rfmt.assign(Cluster=cluster_labels)
# Group by cluster
grouped = datamart_rfmt_k4.groupby(['Cluster'])
# Calculate average RFMT values and segment sizes for each cluster
grouped.agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': 'mean',
    'Tenure': ['mean', 'count']
}).round(1)
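### A closing sketch mirroring the earlier relative-importance step, now for the
### four-cluster RFMT solution (same pattern as the k=3 analysis above)
cluster_avg_rfmt = datamart_rfmt_k4.groupby(['Cluster']).mean()
population_avg_rfmt = datamart_rfmt.mean()
relative_imp_rfmt = cluster_avg_rfmt / population_avg_rfmt - 1
plt.figure(figsize=(8, 2))
plt.title('Relative importance of RFMT attributes')
sns.heatmap(data=relative_imp_rfmt, annot=True, fmt='.2f', cmap='RdYlGn')
plt.show()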