Last active
March 7, 2021 08:11
-
-
Save audhiaprilliant/b0841241b6b0ac8c80bbfcac9a459540 to your computer and use it in GitHub Desktop.
Cluster Ensemble - Data Manipulation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Install pakcage for cluster ensemble | |
| install.packages('diceR') | |
| install.packages('treemapify') | |
| library(diceR) | |
| library(dplyr) | |
| library(ggplot2) | |
| library(treemapify) | |
| # Load the order data | |
| df_orders = read.csv(file = '../data/olist_orders_dataset.csv', header = TRUE, sep = ',') | |
| str(df_orders) | |
| dim(df_orders) | |
| # Load the payment data | |
| df_payments = read.csv(file = '../data/olist_order_payments_dataset.csv', header = TRUE, sep = ',') | |
| str(df_payments) | |
| dim(df_payments) | |
| # Covert the datetime | |
| cols_date = c('order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', | |
| 'order_delivered_customer_date', 'order_estimated_delivery_date') | |
| for (index in 1:length(cols_date)) { | |
| df_orders[,cols_date[index]] = as.POSIXct(df_orders[,cols_date[index]], format = '%Y-%m-%d %H:%M:%S', tz = Sys.timezone()) | |
| } | |
| lapply(df_orders[, cols_date], summary) | |
| # Validate the missing value | |
| sapply(df_orders, function(x) sum(is.na(x))) | |
| sapply(df_payments, function(x) sum(is.na(x))) | |
| # Check the data duplidation | |
| dim(df_orders[duplicated(df_orders),]) | |
| dim(df_payments[duplicated(df_payments),]) | |
| # Merge the data | |
| df_merged = merge(x = df_orders, y = df_payments , by = 'order_id', all.x = TRUE) | |
| str(df_merged) | |
| dim(df_merged) | |
| # Create a RFM data | |
| snapshot_date = max(df_merged[,'order_purchase_timestamp']) + (24 * 60 * 60) | |
| rfm = df_merged %>% | |
| group_by(customer_id) %>% | |
| summarize( | |
| Recency = (snapshot_date - max(order_purchase_timestamp)), | |
| Frequency = length(order_id), | |
| #Frequency = count(order_id, na.rm = TRUE), | |
| Monetary = sum(payment_value, na.rm = TRUE) | |
| ) | |
| str(rfm) | |
| dim(rfm) | |
| head(rfm) | |
| # Manipulate the data type | |
| rfm$Recency = as.numeric(rfm$Recency) | |
| rfm$Frequency = as.numeric(rfm$Frequency) | |
| rfm$Monetary = as.numeric(rfm$Monetary) | |
| # Manipulate the RFM | |
| rfm$R = cut(rfm$Recency, 4, labels = FALSE) | |
| rfm$F = cut(rfm$Frequency, 4, labels = FALSE) | |
| rfm$M = cut(rfm$Monetary, 4, labels = FALSE) | |
| # Create a customer segment | |
| rfm$`RFM Segment` = paste(rfm$R, rfm$F, rfm$M, sep = '') | |
| # Count number of unique segments | |
| length(unique(rfm$`RFM Segment`)) | |
| rfm %>% | |
| group_by(`RFM Segment`) %>% | |
| summarize( | |
| Count = n() | |
| ) | |
| # Calculate the RFM score | |
| rfm$`RFM Score` = rfm$R + rfm$F + rfm$M | |
| head(rfm[,c('customer_id', 'RFM Score')]) | |
| # Segmentation using RFM Analysis or identify Upselling and Cross selling opportunities | |
| rfm$`RFM Segment` = as.integer(rfm$`RFM Segment`) | |
| # Customer segment | |
| for (index in 1:dim(rfm)[1]) { | |
| if (rfm[index,'RFM Segment'] >= 434 | (rfm[index,'RFM Score'] >= 9)) { | |
| rfm[index,'Customer Segment'] = 'VVIP - Cant Loose Them' | |
| } | |
| else | |
| if ((rfm[index,'RFM Score'] >= 8) & (rfm[index,'M'] == 4)) { | |
| rfm[index,'Customer Segment'] = 'Champions Big Spenders' | |
| } | |
| else | |
| if ((rfm[index,'RFM Score'] >= 6) & (rfm[index,'F'] >= 2)) { | |
| rfm[index,'Customer Segment'] = 'Loyal Customers' | |
| } | |
| else | |
| if ((rfm[index,'RFM Segment'] >= 221) | (rfm[index,'RFM Score'] >= 6)) { | |
| rfm[index,'Customer Segment'] = 'Potential Loyalists' | |
| } | |
| else | |
| if (((rfm[index,'RFM Segment'] >= 121) & (rfm[index,'R'] == 1)) | rfm[index,'RFM Score'] == 5) { | |
| rfm[index,'Customer Segment'] = 'Needs Attention' | |
| } | |
| else | |
| if ((rfm[index,'RFM Score'] >= 4) & (rfm[index,'R'] == 1)) { | |
| rfm[index,'Customer Segment'] = 'Hibernating - Almost Lost' | |
| } | |
| else { | |
| rfm[index,'Customer Segment'] = 'Lost Customers' | |
| } | |
| } | |
| # Marketing action | |
| for (index in 1:dim(rfm)[1]) { | |
| if (rfm[index,'RFM Segment'] >= 434 | (rfm[index,'RFM Score'] >= 9)) { | |
| rfm[index,'Marketing Action'] = 'No Price Incentives; Offer Limited edition and Loyality programs' | |
| } | |
| else | |
| if ((rfm[index,'RFM Score'] >= 8) & (rfm[index,'M'] == 4)) { | |
| rfm[index,'Marketing Action'] = 'Upsell most expensive items' | |
| } | |
| else | |
| if ((rfm[index,'RFM Score'] >= 6) & (rfm[index,'F'] >= 2)) { | |
| rfm[index,'Marketing Action'] = 'Loyality programs;Cross Sell' | |
| } | |
| else | |
| if ((rfm[index,'RFM Segment'] >= 221) | (rfm[index,'RFM Score'] >= 6)) { | |
| rfm[index,'Marketing Action'] = 'Cross Sell Recommendations and Discount coupons' | |
| } | |
| else | |
| if (((rfm[index,'RFM Segment'] >= 121) & (rfm[index,'R'] == 1)) | rfm[index,'RFM Score'] == 5) { | |
| rfm[index,'Marketing Action'] = 'Price incentives and Limited time offer' | |
| } | |
| else | |
| if ((rfm[index,'RFM Score'] >= 4) & (rfm[index,'R'] == 1)) { | |
| rfm[index,'Marketing Action'] = 'Aggressive price incentives' | |
| } | |
| else { | |
| rfm[index,'Marketing Action'] = 'Dont spend too much trying to re-acquire' | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment