Skip to content

Instantly share code, notes, and snippets.

@audhiaprilliant
Last active March 7, 2021 08:11
Show Gist options
  • Select an option

  • Save audhiaprilliant/b0841241b6b0ac8c80bbfcac9a459540 to your computer and use it in GitHub Desktop.

Select an option

Save audhiaprilliant/b0841241b6b0ac8c80bbfcac9a459540 to your computer and use it in GitHub Desktop.
Cluster Ensemble - Data Manipulation
# Install pakcage for cluster ensemble
install.packages('diceR')
install.packages('treemapify')
library(diceR)
library(dplyr)
library(ggplot2)
library(treemapify)
# Load the order data
df_orders = read.csv(file = '../data/olist_orders_dataset.csv', header = TRUE, sep = ',')
str(df_orders)
dim(df_orders)
# Load the payment data
df_payments = read.csv(file = '../data/olist_order_payments_dataset.csv', header = TRUE, sep = ',')
str(df_payments)
dim(df_payments)
# Covert the datetime
cols_date = c('order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date',
'order_delivered_customer_date', 'order_estimated_delivery_date')
for (index in 1:length(cols_date)) {
df_orders[,cols_date[index]] = as.POSIXct(df_orders[,cols_date[index]], format = '%Y-%m-%d %H:%M:%S', tz = Sys.timezone())
}
lapply(df_orders[, cols_date], summary)
# Validate the missing value
sapply(df_orders, function(x) sum(is.na(x)))
sapply(df_payments, function(x) sum(is.na(x)))
# Check the data duplidation
dim(df_orders[duplicated(df_orders),])
dim(df_payments[duplicated(df_payments),])
# Merge the data
df_merged = merge(x = df_orders, y = df_payments , by = 'order_id', all.x = TRUE)
str(df_merged)
dim(df_merged)
# Create a RFM data
snapshot_date = max(df_merged[,'order_purchase_timestamp']) + (24 * 60 * 60)
rfm = df_merged %>%
group_by(customer_id) %>%
summarize(
Recency = (snapshot_date - max(order_purchase_timestamp)),
Frequency = length(order_id),
#Frequency = count(order_id, na.rm = TRUE),
Monetary = sum(payment_value, na.rm = TRUE)
)
str(rfm)
dim(rfm)
head(rfm)
# Manipulate the data type
rfm$Recency = as.numeric(rfm$Recency)
rfm$Frequency = as.numeric(rfm$Frequency)
rfm$Monetary = as.numeric(rfm$Monetary)
# Manipulate the RFM
rfm$R = cut(rfm$Recency, 4, labels = FALSE)
rfm$F = cut(rfm$Frequency, 4, labels = FALSE)
rfm$M = cut(rfm$Monetary, 4, labels = FALSE)
# Create a customer segment
rfm$`RFM Segment` = paste(rfm$R, rfm$F, rfm$M, sep = '')
# Count number of unique segments
length(unique(rfm$`RFM Segment`))
rfm %>%
group_by(`RFM Segment`) %>%
summarize(
Count = n()
)
# Calculate the RFM score
rfm$`RFM Score` = rfm$R + rfm$F + rfm$M
head(rfm[,c('customer_id', 'RFM Score')])
# Segmentation using RFM Analysis or identify Upselling and Cross selling opportunities
rfm$`RFM Segment` = as.integer(rfm$`RFM Segment`)
# Customer segment
for (index in 1:dim(rfm)[1]) {
if (rfm[index,'RFM Segment'] >= 434 | (rfm[index,'RFM Score'] >= 9)) {
rfm[index,'Customer Segment'] = 'VVIP - Cant Loose Them'
}
else
if ((rfm[index,'RFM Score'] >= 8) & (rfm[index,'M'] == 4)) {
rfm[index,'Customer Segment'] = 'Champions Big Spenders'
}
else
if ((rfm[index,'RFM Score'] >= 6) & (rfm[index,'F'] >= 2)) {
rfm[index,'Customer Segment'] = 'Loyal Customers'
}
else
if ((rfm[index,'RFM Segment'] >= 221) | (rfm[index,'RFM Score'] >= 6)) {
rfm[index,'Customer Segment'] = 'Potential Loyalists'
}
else
if (((rfm[index,'RFM Segment'] >= 121) & (rfm[index,'R'] == 1)) | rfm[index,'RFM Score'] == 5) {
rfm[index,'Customer Segment'] = 'Needs Attention'
}
else
if ((rfm[index,'RFM Score'] >= 4) & (rfm[index,'R'] == 1)) {
rfm[index,'Customer Segment'] = 'Hibernating - Almost Lost'
}
else {
rfm[index,'Customer Segment'] = 'Lost Customers'
}
}
# Marketing action
for (index in 1:dim(rfm)[1]) {
if (rfm[index,'RFM Segment'] >= 434 | (rfm[index,'RFM Score'] >= 9)) {
rfm[index,'Marketing Action'] = 'No Price Incentives; Offer Limited edition and Loyality programs'
}
else
if ((rfm[index,'RFM Score'] >= 8) & (rfm[index,'M'] == 4)) {
rfm[index,'Marketing Action'] = 'Upsell most expensive items'
}
else
if ((rfm[index,'RFM Score'] >= 6) & (rfm[index,'F'] >= 2)) {
rfm[index,'Marketing Action'] = 'Loyality programs;Cross Sell'
}
else
if ((rfm[index,'RFM Segment'] >= 221) | (rfm[index,'RFM Score'] >= 6)) {
rfm[index,'Marketing Action'] = 'Cross Sell Recommendations and Discount coupons'
}
else
if (((rfm[index,'RFM Segment'] >= 121) & (rfm[index,'R'] == 1)) | rfm[index,'RFM Score'] == 5) {
rfm[index,'Marketing Action'] = 'Price incentives and Limited time offer'
}
else
if ((rfm[index,'RFM Score'] >= 4) & (rfm[index,'R'] == 1)) {
rfm[index,'Marketing Action'] = 'Aggressive price incentives'
}
else {
rfm[index,'Marketing Action'] = 'Dont spend too much trying to re-acquire'
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment