# *-----------------------------------------------------------------
# | PROGRAM NAME: dc customer segmentation
# | DATE: 5/17/19
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:/Users/amandabogard/Google Drive/Python Scripts
# *----------------------------------------------------------------
# | PURPOSE: code from Data Camp course
# *----------------------------------------------------------------
import pandas as pd
import datetime as dt
# ----------------------------------------------
# Cohort Analysis
# ---------------------------------------------
# read data
online = pd.read_csv('/Users/amandabogard/Google Drive/Python Scripts/DC Customer Segmentation/chapter_1/online.csv')
online.head()
online.columns
online.info()
# Convert df['date'] from string to datetime
# Prepare a format string: time_format
time_format = '%Y-%m-%d %H:%M'
# Convert date_list into a datetime object: my_datetimes
online['InvoiceDate'] = pd.to_datetime(online['InvoiceDate'], format=time_format)
online.info()
### Assign daily acquisition cohort
# Define a function that will parse the date
def get_day(x): return dt.datetime(x.year, x.month, x.day)
# Create InvoiceDay column
online['InvoiceDay'] = online['InvoiceDate'].apply(get_day)
# Group by CustomerID and select the InvoiceDay value
grouping = online.groupby('CustomerID')['InvoiceDay']
# Assign a minimum InvoiceDay value to the dataset
online['CohortDay'] = grouping.transform('min')
### get cohort month
def get_month(x): return dt.datetime(x.year, x.month, 1) # define function to get month
online['InvoiceMonth'] = online['InvoiceDate'].apply(get_month) # create InvoiceMonth
grouping = online.groupby('CustomerID')['InvoiceMonth'] # create grouping by month
online['CohortMonth'] = grouping.transform('min') # get first month as CohortMonth
# View the top 5 rows
print(online.head())
### let's look at the data for customer 12747
subset = (online['CustomerID'] == 12747) # create subset logic
tmp = online[subset] # subset data
tmp = tmp[['CustomerID','InvoiceDay','CohortDay','InvoiceMonth','CohortMonth']] # look only at date columns
tmp = tmp.sort_values(['InvoiceDay']) # sort by invoice date
print(tmp) # verify that each person's cohort day is based on their first invoice
### Calculate time offset in days - part 1
def get_date_int(df, column):
    year = df[column].dt.year
    month = df[column].dt.month
    day = df[column].dt.day
    return year, month, day
# Get the integers for date parts from the `InvoiceDay` column
invoice_year,invoice_month,invoice_day = get_date_int(online,'InvoiceDay')
# Get the integers for date parts from the `CohortDay` column
cohort_year, cohort_month, cohort_day = get_date_int(online, 'CohortDay')
### Calculate time offset in days - part 2
# Calculate difference in years
years_diff = invoice_year - cohort_year
# Calculate difference in months
months_diff = invoice_month - cohort_month
# Calculate difference in days
days_diff = invoice_day - cohort_day
# Combine the differences into an approximate day offset (365 days per year, 30 per month), starting at 1
online['CohortIndex'] = years_diff * 365 + months_diff * 30 + days_diff + 1
print(online.head())
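# Cross-check (my addition, not part of the course exercise): an exact day offset can be
# computed directly from the datetime columns and compared against the 365/30 approximation above.
online['CohortIndexExact'] = (online['InvoiceDay'] - online['CohortDay']).dt.days + 1
print(online[['CohortIndex', 'CohortIndexExact']].describe())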
### check our test case
subset = (online['CustomerID'] == 12747) # create subset logic
tmp = online[subset] # subset data
tmp = tmp[['CustomerID','InvoiceDay','CohortDay','CohortIndex']] # look only at date columns
tmp = tmp.sort_values(['InvoiceDay']) # sort by invoice date
print(tmp) # verify that each person's cohort index is based on their first invoice
### Calculate retention rate from scratch
# create grouping object
grouping = online.groupby(['CohortMonth', 'CohortIndex'])
grouping.first() #check
# Count the number of unique values per customer ID
cohort_data = grouping['CustomerID'].apply(pd.Series.nunique).reset_index()
# Create a pivot
cohort_counts = cohort_data.pivot(index='CohortMonth', columns='CohortIndex', values='CustomerID')
# Select the first column and store it to cohort_sizes
cohort_sizes = cohort_counts.iloc[:,0]
# Divide the cohort count by cohort sizes along the rows
retention = cohort_counts.divide(cohort_sizes, axis=0)
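# Optional visualization (my addition, mirroring the heatmap used for average price below):
# the retention matrix reads most easily as an annotated heatmap with percentage labels.
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(10, 8))
plt.title('Retention by Monthly Cohorts')
sns.heatmap(data=retention, annot=True, fmt='.0%', cmap='BuGn')
plt.show()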
### check
tmp = online[['CustomerID','CohortMonth','CohortIndex']] # copy & subset data
tmp_grp = tmp.groupby(['CohortMonth', 'CohortIndex']) # create grouping object
tmp_grp.first() # check
### Calculate average price
# Create a groupby object and pass the monthly cohort and cohort index as a list
grouping = online.groupby(['CohortMonth', 'CohortIndex'])
# Calculate the average of the unit price
cohort_data = grouping['UnitPrice'].mean()
# Reset the index of cohort_data
cohort_data = cohort_data.reset_index()
# Create a pivot
average_quantity = cohort_data.pivot(index='CohortMonth', columns='CohortIndex', values='UnitPrice')
print(average_quantity.round(1))
### Visualize the average price metric (stored as average_quantity in the course exercise)
import matplotlib.pyplot as plt
# Import seaborn package as sns
import seaborn as sns
# Initialize an 8 by 6 inches plot figure
plt.figure(figsize=(8,6))
# Add a title
plt.title('Average Unit Price by Monthly Cohorts')
# Create the heatmap
sns.heatmap(data=average_quantity, annot=True, cmap='Blues')
plt.show()
# ----------------------------------------------
# Recency, Frequency, Monetary Value analysis
# ---------------------------------------------
# example: the `data` DataFrame below is the course's exercise dataset
# Create spend quartiles (4 groups) with labels 1 to 4
spend_quartile = pd.qcut(data['Spend'], q=4, labels=range(1,5))
# Assign the quartile values to the Spend_Quartile column in data
data['Spend_Quartile'] = spend_quartile
# Print data with sorted Spend values
print(data.sort_values('Spend'))
### Calculate Recency quartiles
# Store labels from 4 to 1 in decreasing order (lower recency gets a higher label)
r_labels = list(range(4, 0, -1))
# Create recency quartiles (4 groups) and pass the previously created labels
recency_quartiles = pd.qcut(data['Recency_Days'], q=4, labels=r_labels)
# Assign the quartile values to the Recency_Quartile column in `data`
data['Recency_Quartile'] = recency_quartiles
# Print `data` with sorted Recency_Days values
print(data.sort_values('Recency_Days'))
### Calculate RFM values
# Calculate Recency, Frequency and Monetary values for the online dataset we have used
# before - it has been loaded for you with recent 12 months of data. There's a TotalSum
# column in the online dataset which has been calculated by multiplying Quantity and
# UnitPrice: online['Quantity'] * online['UnitPrice']
# Also, we have created a snapshot_date variable that you can use to calculate recency.
# Feel free to print the online dataset and the snapshot_date into the Console.
# The pandas library is loaded as pd, and datetime as dt
# read data
online12m = pd.read_csv('/Users/amandabogard/Google Drive/Python Scripts/DC Customer Segmentation/chapter_2/online12M.csv')
online12m.info() # check data formats
# Convert df['date'] from string to datetime
# Prepare a format string: time_format
time_format = '%Y-%m-%d %H:%M'
# Convert date_list into a datetime object
online12m['InvoiceDate'] = pd.to_datetime(online12m['InvoiceDate'], format=time_format)
online12m.info() # check
# create total sum metric
online12m['TotalSum'] = online12m['Quantity'] * online12m['UnitPrice']
# add snapshot date - most recent date in this historical file + 1
online12m['snapshot_date'] = max(online12m.InvoiceDate) + dt.timedelta(days=1)
# create snapshot date value
snapshot_date = max(online12m.InvoiceDate) + dt.timedelta(days=1)
# Calculate Recency, Frequency and Monetary value for each customer
datamart = online12m.groupby(['CustomerID']).agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,
    'InvoiceNo': 'count',
    'TotalSum': 'sum'})
# Rename the columns
datamart.rename(columns={'InvoiceDate': 'Recency',
                         'InvoiceNo': 'Frequency',
                         'TotalSum': 'MonetaryValue'}, inplace=True)
# Print top 5 rows
print(datamart.head())
### Calculate 3 groups for Recency and Frequency
# Create labels for Recency and Frequency
r_labels = range(3,0,-1); f_labels = range(1,4)
# Assign these labels to three equal percentile groups
r_groups = pd.qcut(datamart['Recency'], q=3, labels=r_labels)
# Assign these labels to three equal percentile groups
f_groups = pd.qcut(datamart['Frequency'], q=3, labels=f_labels)
# Create new columns R and F
datamart = datamart.assign(R = r_groups.values, F = f_groups.values)
### Calculate RFM Score
# Create labels for MonetaryValue
m_labels = range(1,4)
# Assign these labels to three equal percentile groups
m_groups = pd.qcut(datamart['MonetaryValue'], q=3, labels=m_labels)
# Create new column M
datamart = datamart.assign(M = m_groups.values)
# Calculate RFM_Score
datamart['RFM_Score'] = datamart[['R','F','M']].sum(axis=1)
print(datamart['RFM_Score'].head())
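# Sanity check (my addition, not part of the course code): the tercile cut points chosen by
# pd.qcut can be inspected with retbins=True to see where each group boundary falls.
_, r_bins = pd.qcut(datamart['Recency'], q=3, retbins=True)
_, f_bins = pd.qcut(datamart['Frequency'], q=3, retbins=True)
_, m_bins = pd.qcut(datamart['MonetaryValue'], q=3, retbins=True)
print('Recency cut points:', r_bins.round(1))
print('Frequency cut points:', f_bins.round(1))
print('MonetaryValue cut points:', m_bins.round(1))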
### Concatenate RFM quartile values to RFM_Segment
# define function to concatenate RFM values
def join_rfm(x): return str(x['R']) + str(x['F']) + str(x['M'])
datamart['RFM_Segment'] = datamart.apply(join_rfm, axis=1)
### explore segments
# look at largest segments
datamart.groupby('RFM_Segment').size().sort_values(ascending=False)[:10]
# Select bottom RFM segment "111" and view top 5 rows
datamart[datamart['RFM_Segment']=='111'][:5]
# Summary metrics per RFM Score
datamart.groupby('RFM_Score').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': ['mean', 'count']}).round(1)
### Creating custom segments
# Define rfm_level function
def rfm_level(df):
    if df['RFM_Score'] >= 10:
        return 'Top'
    elif ((df['RFM_Score'] >= 6) and (df['RFM_Score'] < 10)):
        return 'Middle'
    else:
        return 'Low'
# Create a new variable RFM_Level
datamart['RFM_Level'] = datamart.apply(rfm_level, axis=1)
# Print the header with top 5 rows to the console
print(datamart.head())
### Analyzing custom segments
# Calculate average values for each RFM_Level, and return a size of each segment
rfm_level_agg = datamart.groupby('RFM_Level').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    # Return the size of each segment
    'MonetaryValue': ['mean', 'count']
}).round(1)
# Print the aggregated dataset
print(rfm_level_agg)
#--------------------------------------------
# Data pre-processing for clustering
#--------------------------------------------
### Calculate statistics of variables
### examples
# Print the average values of the variables in the dataset
print(datamart.mean())
# Print the standard deviation of the variables in the dataset
print(datamart.std())
# Use `describe` function to get key statistics of the dataset
print(datamart.describe())
### Detect skewed variables
# import seaborn as sns
# from matplotlib import pyplot as plt
sns.distplot(datamart['Recency'])
plt.show()
### examples
# Plot distribution of var1
plt.subplot(3, 1, 1); sns.distplot(data['var1'])
# Plot distribution of var2
plt.subplot(3, 1, 2); sns.distplot(data['var2'])
# Plot distribution of var3
plt.subplot(3, 1, 3); sns.distplot(data['var3'])
# Show the plot
plt.show()
### manage skewness
# example:
import numpy as np
# Apply log transformation to var2
data['var2_log'] = np.log(data['var2'])
# Apply log transformation to var3
data['var3_log'] = np.log(data['var3'])
# Create a subplot of the distribution of var2_log
plt.subplot(2, 1, 1); sns.distplot(data['var2_log'])
# Create a subplot of the distribution of var3_log
plt.subplot(2, 1, 2); sns.distplot(data['var3_log'])
# Show the plot
plt.show()
### Center and scale manually
# example:
# Center the data by subtracting average values from each entry
data_centered = data - data.mean()
# Scale the data by dividing each entry by standard deviation
data_scaled = data / data.std()
# Normalize the data by applying both centering and scaling
data_normalized = (data - data.mean()) / data.std()
# Print summary statistics to make sure average is zero and standard deviation is one
print(data_normalized.describe().round(2))
### Center and scale with StandardScaler()
# example:
from sklearn.preprocessing import StandardScaler
# Initialize a scaler
scaler = StandardScaler()
# Fit the scaler
scaler.fit(data)
# Scale and center the data
data_normalized = scaler.transform(data)
# Create a pandas DataFrame
data_normalized = pd.DataFrame(data_normalized, index=data.index, columns=data.columns)
# Print summary statistics
print(data_normalized.describe().round(2))
### Visualize RFM distributions
# read data
datamart_rfm = pd.read_csv('/Users/amandabogard/Google Drive/Python Scripts/DC Customer Segmentation/chapter_3/rfm_datamart.csv')
datamart_rfm.info() # check data formats
datamart_rfm.describe()
# Plot recency distribution
plt.subplot(3, 1, 1); sns.distplot(datamart_rfm['Recency'])
# Plot frequency distribution
plt.subplot(3, 1, 2); sns.distplot(datamart_rfm['Frequency'])
# Plot monetary value distribution
plt.subplot(3, 1, 3); sns.distplot(datamart_rfm['MonetaryValue'])
# Show the plot
plt.show()
### Pre-process RFM data
import numpy as np
from sklearn.preprocessing import StandardScaler
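# Sanity check (my addition, not part of the course code): np.log below requires strictly
# positive values, so confirm none of the RFM columns contain zeros or negatives first.
print((datamart_rfm <= 0).sum())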
# Unskew the data
datamart_log = np.log(datamart_rfm)
# Initialize a standard scaler and fit it
scaler = StandardScaler()
scaler.fit(datamart_log)
# Scale and center the data
datamart_normalized = scaler.transform(datamart_log)
# Create a pandas DataFrame
datamart_normalized = pd.DataFrame(data=datamart_normalized, index=datamart_rfm.index, columns=datamart_rfm.columns)
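# Quick check (my addition): after log-transforming and scaling, each column should have
# mean ~0 and standard deviation ~1.
print(datamart_normalized.describe().round(2))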
### Visualize the normalized variables
# Plot recency distribution
plt.subplot(3, 1, 1); sns.distplot(datamart_normalized['Recency'])
# Plot frequency distribution
plt.subplot(3, 1, 2); sns.distplot(datamart_normalized['Frequency'])
# Plot monetary value distribution
plt.subplot(3, 1, 3); sns.distplot(datamart_normalized['MonetaryValue'])
# Show the plot
plt.show()
#-----------------------------------------
# Customer Segmentation with K-means
#-----------------------------------------
# Import KMeans
from sklearn.cluster import KMeans
# Initialize KMeans
kmeans = KMeans(n_clusters=3, random_state=1)
# Fit k-means clustering on the normalized data set
kmeans.fit(datamart_normalized)
# Extract cluster labels
cluster_labels = kmeans.labels_
### Assign labels to raw data
# Create a DataFrame by adding a new cluster label column
datamart_rfm_k3 = datamart_rfm.assign(Cluster=cluster_labels)
# Group the data by cluster
grouped = datamart_rfm_k3.groupby(['Cluster'])
# Calculate average RFM values and segment sizes per cluster value
grouped.agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': ['mean', 'count']
}).round(1)
### Calculate sum of squared errors
# example:
# Fit KMeans and calculate SSE for each k
sse = {}
for k in range(1, 21):
    # Initialize KMeans with k clusters
    kmeans = KMeans(n_clusters=k, random_state=1)
    # Fit KMeans on the normalized dataset
    kmeans.fit(data_normalized)
    # Assign sum of squared distances to k element of dictionary
    sse[k] = kmeans.inertia_
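# Concrete run over the RFM data prepared above (my addition; mirrors the course example
# but uses datamart_normalized so the elbow plot below is drawn from real values).
from sklearn.cluster import KMeans
sse = {}
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=1)
    kmeans.fit(datamart_normalized)
    sse[k] = kmeans.inertia_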
### Plot sum of squared errors
# example:
# Add the plot title "The Elbow Method"
plt.title('The Elbow Method')
# Add X-axis label "k"
plt.xlabel('k')
# Add Y-axis label "SSE"
plt.ylabel('SSE')
# Plot SSE values for each key in the dictionary
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()
### Prepare data for the snake plot
# example:
# Melt the normalized dataset and reset the index
datamart_melt = pd.melt(datamart_normalized.reset_index(),
                        # Assign CustomerID and Cluster as ID variables
                        id_vars=['CustomerID', 'Cluster'],
                        # Assign RFM values as value variables
                        value_vars=['Recency', 'Frequency', 'MonetaryValue'],
                        # Name the variable and value
                        var_name='Metric', value_name='Value'
                        )
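# Working version (my addition): the example melt above assumes datamart_normalized already
# carries a 'Cluster' column and is indexed by CustomerID (as in the course data; read the
# CSV with index_col='CustomerID' if needed). Assuming the frame built earlier holds only the
# RFM columns, attach the 3-cluster labels first and then melt.
datamart_normalized = datamart_normalized.assign(Cluster=cluster_labels)
datamart_melt = pd.melt(datamart_normalized.reset_index(),
                        id_vars=['CustomerID', 'Cluster'],
                        value_vars=['Recency', 'Frequency', 'MonetaryValue'],
                        var_name='Metric', value_name='Value')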
### Visualize snake plot
# Add the plot title
plt.title('Snake plot of normalized variables')
# Add the x axis label
plt.xlabel('Metric')
# Add the y axis label
plt.ylabel('Value')
# Plot a line for each value of the cluster variable
sns.lineplot(data=datamart_melt, x='Metric', y='Value', hue='Cluster')
plt.show()
### Calculate relative importance
# example:
# Calculate average RFM values for each cluster
cluster_avg = datamart_rfm_k3.groupby(['Cluster']).mean()
# Calculate average RFM values for the total customer population
population_avg = datamart_rfm.mean()
# Calculate relative importance of cluster's attribute value compared to population
relative_imp = cluster_avg / population_avg - 1
# Print relative importance scores rounded to 2 decimals
print(relative_imp.round(2))
### Relative importance heatmap
# Initialize a plot with a figure size of 8 by 2 inches
plt.figure(figsize=(8,2))
# Add the plot title
plt.title('Relative importance of attributes')
# Plot the heatmap
sns.heatmap(data=relative_imp, annot=True, fmt='.2f', cmap='RdYlGn')
plt.show()
# ---- End to end customer segmentation solution ----- #
datamart_rfmt = pd.read_csv('/Users/amandabogard/Google Drive/Python Scripts/DC Customer Segmentation/chapter_4/datamart_rfmt.csv')
### Pre-process data
# Import StandardScaler
from sklearn.preprocessing import StandardScaler
# Apply log transformation
datamart_rfmt_log = np.log(datamart_rfmt)
# Initialize StandardScaler and fit it
scaler = StandardScaler(); scaler.fit(datamart_rfmt_log)
# Transform and store the scaled data as datamart_rfmt_normalized
datamart_rfmt_normalized = scaler.transform(datamart_rfmt_log)
### Calculate and plot sum of squared errors
# Fit KMeans and calculate SSE for each k between 1 and 10
sse = {}
for k in range(1, 11):
    # Initialize KMeans with k clusters and fit it
    kmeans = KMeans(n_clusters=k, random_state=1).fit(datamart_rfmt_normalized)
    # Assign sum of squared distances to k element of the sse dictionary
    sse[k] = kmeans.inertia_
# Add the plot title, x and y axis labels
plt.title('The Elbow Method'); plt.xlabel('k'); plt.ylabel('SSE')
# Plot SSE values for each k stored as keys in the dictionary
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.show()
### Build 4-cluster solution
# Import KMeans
from sklearn.cluster import KMeans
# Initialize KMeans
kmeans = KMeans(n_clusters=4,random_state=1)
# Fit k-means clustering on the normalized data set
kmeans.fit(datamart_rfmt_normalized)
# Extract cluster labels
cluster_labels = kmeans.labels_
### Analyze the segments
# Create a new DataFrame by adding a cluster label column to datamart_rfmt
datamart_rfmt_k4 = datamart_rfmt.assign(Cluster=cluster_labels)
# Group by cluster
grouped = datamart_rfmt_k4.groupby(['Cluster'])
# Calculate average RFMT values and segment sizes for each cluster
grouped.agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': 'mean',
    'Tenure': ['mean', 'count']
}).round(1)
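# Optional follow-up (my addition, mirroring the relative-importance analysis above):
# compare each RFMT cluster against the population average and plot it as a heatmap.
# Assumes datamart_rfmt holds only the RFMT columns (CustomerID as index, as in the course data).
import matplotlib.pyplot as plt
import seaborn as sns
cluster_avg_rfmt = datamart_rfmt_k4.groupby(['Cluster']).mean()
population_avg_rfmt = datamart_rfmt.mean()
relative_imp_rfmt = cluster_avg_rfmt / population_avg_rfmt - 1
plt.figure(figsize=(8, 2))
plt.title('Relative importance of attributes (RFMT clusters)')
sns.heatmap(data=relative_imp_rfmt, annot=True, fmt='.2f', cmap='RdYlGn')
plt.show()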