#
# PREDICTING LONG-TERM CUSTOMER VALUE WITH THE BTYD PACKAGE
# Pareto/NBD (negative binomial distribution) modeling of
# repeat-buying behavior in a noncontractual setting
#
# Matthew Baggott, [email protected]
#
# Accompanying slides at:
# http://www.slideshare.net/mattbagg/baggott-predict-customerinrpart1#
#
# Based on Schmittlein, Morrison, and Colombo (1987), "Counting Your Customers:
# Who Are They and What Will They Do Next?" Management Science, 33, 1-24.
#
# Required data for the model is a
# "customer-by-sufficient-statistic" (cbs) matrix,
# with the 'sufficient' stats being:
#      frequency of transactions
#      recency (time of last transaction) and
#      total time observed
# Main model params are:
#      r     unobserved shape parameter for the NBD transaction process
#      alpha unobserved scale parameter for the NBD transaction process
#      s     unobserved shape parameter for the dropout process
#      beta  unobserved scale parameter for the dropout process
# Data are divided into an earlier calibration period and a later holdout
# period, using a single date as the cut point. "cal" data are used
# to predict "holdout" data.
#
############################################
############################################
#
# INSTALL AND LOAD NEEDED PACKAGES
#
############################################
toInstallCandidates <- c("ggplot2", "BTYD", "reshape2", "plyr", "lubridate")
# check which pkgs are not already present
toInstall <- toInstallCandidates[!toInstallCandidates %in% rownames(installed.packages())]
if (length(toInstall) != 0) {
  install.packages(toInstall, repos = "http://cran.r-project.org")
}
# load pkgs
lapply(toInstallCandidates, library, character.only = TRUE)
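# Optional sanity check (an illustrative addition): confirm every package
# attached; .packages() lists what is currently on the search path
stopifnot(all(toInstallCandidates %in% .packages()))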
############################################
#
# LOAD DATA
#
############################################
# Data are 10% of the cohort of customers who made their first transactions
# with online retailer CDNOW (founded 1994) in the first quarter of 1997.
cdnowElog <- system.file("data/cdnowElog.csv", package = "BTYD")
elog <- read.csv(cdnowElog)               # read data
head(elog)                                # take a look
elog <- elog[, c(2, 3, 5)]                # we only need these columns
names(elog) <- c("cust", "date", "sales") # model functions expect these names
# format date
elog$date <- as.Date(as.character(elog$date), format = "%Y%m%d")
# Transaction-flow models, such as the Pareto/NBD, are concerned
# with interpurchase intervals.
# Since we only have dates and there may be multiple purchases on a day,
# we merge all transactions that occurred on the same day
# using dc.MergeTransactionsOnSameDate()
elog <- dc.MergeTransactionsOnSameDate(elog)
head(elog)
summary(elog) # no NAs
############################################
#
# EXAMINE DATA
#
############################################
# plot sales for individual customers, on a log scale
ggplot(elog, aes(x = date, y = sales, group = cust)) +
  geom_line(alpha = 0.1) +
  scale_x_date() +
  scale_y_log10() +
  ggtitle("Sales for individual customers") +
  ylab("Sales ($, US)") + xlab("") +
  theme_minimal()
# look at days between orders;
# the model describes purchase rates via a gamma distribution across customers
purchaseFreq <- ddply(elog, .(cust), summarize,
                      daysBetween = as.numeric(diff(date)))
dev.new() # windows() works only on Windows; dev.new() is portable
ggplot(purchaseFreq, aes(x = daysBetween)) +
  geom_histogram(fill = "orange") +
  xlab("Time between purchases (days)") +
  theme_minimal()
# optional gamma fit of interpurchase times (requires the fitdistrplus pkg):
# (fitg <- fitdist(purchaseFreq$daysBetween, "gamma", method = "mme"))
# dev.new(); plot(fitg)
############################################
#
# DIVIDE DATA
#
############################################
# into a calibration phase and a holdout phase,
# using the midpoint of the date range as the cut point
(end.of.cal.period <-
   min(elog$date) + as.numeric((max(elog$date) - min(elog$date)) / 2))
# split data into train (calibration) and test (holdout) and make matrices
data <- dc.ElogToCbsCbt(elog, per = "week",
                        T.cal = end.of.cal.period,
                        merge.same.date = TRUE, # not needed, we already did it
                        statistic = "freq")     # which CBT to return
# take a look
str(data)
# cbs is short for "customer-by-sufficient-statistic" matrix,
# with the sufficient stats being:
#      frequency
#      recency (time of last transaction) and
#      total time observed
# extract calibration matrix
cal2.cbs <- as.matrix(data[[1]][[1]])
str(cal2.cbs)
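# For orientation (an illustrative addition): each cbs row is one customer,
# with x = repeat transactions in calibration, t.x = recency in weeks,
# and T.cal = weeks observed
head(cal2.cbs)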
############################################
#
# ESTIMATE PARAMETERS FOR MODEL
#
############################################
# initial estimate
(params2 <- pnbd.EstimateParameters(cal2.cbs))
# look at log likelihood
(LL <- pnbd.cbs.LL(params2, cal2.cbs))
# make a series of estimates, see if they converge
p.matrix <- c(params2, LL)
for (i in 1:20) {
  params2 <- pnbd.EstimateParameters(cal2.cbs, params2)
  LL <- pnbd.cbs.LL(params2, cal2.cbs)
  p.matrix.row <- c(params2, LL)
  p.matrix <- rbind(p.matrix, p.matrix.row)
}
# examine
p.matrix
# use final set of values
(params2 <- p.matrix[nrow(p.matrix), 1:4])
# Main model params are:
#      r     gamma shape parameter for the NBD transaction process
#      alpha gamma scale parameter for the NBD transaction process
#      s     gamma shape parameter for the Pareto (exponential gamma) dropout process
#      beta  gamma scale parameter for the Pareto (exponential gamma) dropout process
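# A hedged alternative to the fixed 20-iteration loop above (left commented
# out, like the optional fit earlier): re-estimate until the log-likelihood
# stops improving by more than a tolerance. The tolerance and iteration cap
# are illustrative choices, not BTYD defaults.
# tol <- 1e-6
# for (i in 1:50) {
#   params.new <- pnbd.EstimateParameters(cal2.cbs, params2)
#   LL.new <- pnbd.cbs.LL(params.new, cal2.cbs)
#   if (abs(LL.new - LL) < tol) break
#   params2 <- params.new
#   LL <- LL.new
# }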
############################################
#
# PLOT LOG-LIKELIHOOD ISO-CONTOURS FOR MAIN PARAMS
#
############################################
# set up parameter names for a more descriptive result
param.names <- c("r", "alpha", "s", "beta")
LL <- pnbd.cbs.LL(params2, cal2.cbs)
dc.PlotLogLikelihoodContours(pnbd.cbs.LL, params2, cal.cbs = cal2.cbs, n.divs = 5,
                             num.contour.lines = 7, zoom.percent = 0.3,
                             allow.neg.params = FALSE, param.names = param.names)
############################################
#
# PLOT GROUP DISTRIBUTION OF PROPENSITY TO PURCHASE, DROPOUT
#
############################################
# set par to make two plots side by side
par(mfrow = c(1, 2))
# plot the estimated distribution of lambda
# (customers' propensities to purchase)
pnbd.PlotTransactionRateHeterogeneity(params2, lim = NULL)
# lim is the upper xlim
# plot the estimated distribution of mu
# (customers' propensities to drop out)
pnbd.PlotDropoutRateHeterogeneity(params2)
# set par back to normal
par(mfrow = c(1, 1))
############################################
#
# EXAMINE INDIVIDUAL PREDICTIONS
#
############################################
# estimate the number of transactions a new customer
# will make in 52 weeks
pnbd.Expectation(params2, t = 52)
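# Illustrative extension (not in the original): the same expectation over a
# few horizons, to see how expected transactions accumulate over time
sapply(c(13, 26, 52), function(t) pnbd.Expectation(params2, t = t))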
# expected characteristics for a specific individual,
# conditional on their purchasing behavior during calibration
# calibration data for customer 1516:
# frequency ("x"), recency ("t.x") and total time observed ("T.cal")
cal2.cbs["1516", ]
x <- cal2.cbs["1516", "x"]         # x is frequency
t.x <- cal2.cbs["1516", "t.x"]     # t.x is recency, i.e. time of last transaction
T.cal <- cal2.cbs["1516", "T.cal"] # T.cal is total time observed
# estimate transactions in a T.star-long duration for that cust
pnbd.ConditionalExpectedTransactions(params2, T.star = 52, # weeks
                                     x, t.x, T.cal)
############################################
#
# PROBABILITY A CUSTOMER IS ALIVE AT END OF CALIBRATION / TRAINING
#
############################################
x           # freq of purchase
t.x         # week of last purchase
T.cal <- 39 # week of end of cal, i.e. the present
pnbd.PAlive(params2, x, t.x, T.cal)
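# Illustrative sketch (not in the original): holding frequency fixed,
# P(alive) falls as the last purchase recedes into the past;
# the alternative recency weeks below are made-up values
sapply(c(t.x, 30, 20, 10), function(recency) pnbd.PAlive(params2, x, recency, T.cal))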
# To visualize the distribution of P(alive) across customers:
params3 <- pnbd.EstimateParameters(cal2.cbs)
p.alives <- pnbd.PAlive(params3, cal2.cbs[, "x"], cal2.cbs[, "t.x"], cal2.cbs[, "T.cal"])
ggplot(as.data.frame(p.alives), aes(x = p.alives)) +
  geom_histogram(colour = "grey", fill = "orange") +
  ylab("Number of Customers") +
  xlab("Probability Customer is 'Live'") +
  theme_minimal()
# plot actual & expected customers binned by number of repeat transactions
pnbd.PlotFrequencyInCalibration(params2, cal2.cbs,
                                censor = 10, title = "Model vs. Reality during Calibration")
############################################
#
# HOW WELL DOES THE MODEL DO IN THE HOLDOUT PERIOD?
#
############################################
# get holdout transactions from the 'data' list, add them in as x.star
x.star <- data[[2]][[2]][, 1]
cal2.cbs <- cbind(cal2.cbs, x.star)
str(cal2.cbs)
holdoutdates <- attributes(data[[2]][[1]])[[2]][[2]]
holdoutlength <- round(as.numeric(max(as.Date(holdoutdates)) -
                                  min(as.Date(holdoutdates))) / 7)
# plot predicted vs. observed conditional freqs and get matrix 'comp' w/ values
T.star <- holdoutlength
censor <- 10 # bin all order numbers here and above
comp <- pnbd.PlotFreqVsConditionalExpectedFrequency(params2, T.star,
                                                    cal2.cbs, x.star, censor)
rownames(comp) <- c("act", "exp", "bin")
comp
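# Quick per-bin gap (an illustrative addition): absolute difference between
# actual and expected conditional holdout frequency in each censored bin
round(abs(comp["act", ] - comp["exp", ]), 2)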
# plot predicted vs. actual by week
# get data without each customer's first transaction;
# this removes the customers who buy only once
removedFirst.elog <- dc.SplitUpElogForRepeatTrans(elog)$repeat.trans.elog
removedFirst.cbt <- dc.CreateFreqCBT(removedFirst.elog)
# get all data, so we have the customers who buy only once
allCust.cbt <- dc.CreateFreqCBT(elog)
# add the single-purchase customers back into the matrix
tot.cbt <- dc.MergeCustomers(data.correct = allCust.cbt,
                             data.to.correct = removedFirst.cbt)
lengthInDays <- as.numeric(max(as.Date(colnames(tot.cbt))) -
                           min(as.Date(colnames(tot.cbt))))
origin <- min(as.Date(colnames(tot.cbt)))
tot.cbt.df <- melt(tot.cbt, varnames = c("cust", "date"), value.name = "Freq")
tot.cbt.df$date <- as.Date(tot.cbt.df$date)
tot.cbt.df$week <- as.numeric(1 + floor((tot.cbt.df$date - origin + 1) / 7))
transactByDay <- ddply(tot.cbt.df, .(date), summarize, sum(Freq))
transactByWeek <- ddply(tot.cbt.df, .(week), summarize, sum(Freq))
names(transactByWeek) <- c("week", "Transactions")
names(transactByDay) <- c("date", "Transactions")
T.cal <- cal2.cbs[, "T.cal"]
T.tot <- 78 # end of holdout, in weeks
comparisonByWeek <- pnbd.PlotTrackingInc(params2, T.cal, T.tot,
                                         actual.inc.tracking.data = transactByWeek$Transactions)
############################################
#
# FORMAL MEASURES OF ACCURACY
#
############################################
# root mean squared error
rmse <- function(est, act) { return(sqrt(mean((est - act)^2))) }
# mean squared logarithmic error
msle <- function(est, act) { return(mean((log1p(est) - log1p(act))^2)) }
str(cal2.cbs)
cal2.cbs[, "x"]
# named 'predictions' rather than 'predict', to avoid masking stats::predict
predictions <- pnbd.ConditionalExpectedTransactions(params2, T.star = 38, # weeks
                                                    x = cal2.cbs[, "x"],
                                                    t.x = cal2.cbs[, "t.x"],
                                                    T.cal = cal2.cbs[, "T.cal"])
cal2.cbs[, "x.star"] # actual transactions for each person
rmse(act = cal2.cbs[, "x.star"], est = predictions)
msle(act = cal2.cbs[, "x.star"], est = predictions)
# not useful w/o a comparison: is this better than guessing?
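# One simple benchmark (an illustrative addition, not part of the original
# analysis): assume each customer keeps buying at their calibration-period rate
naive.est <- cal2.cbs[, "x"] / cal2.cbs[, "T.cal"] * 38 # same 38-week horizon
naive.est[!is.finite(naive.est)] <- 0 # guard against customers with T.cal = 0
rmse(act = cal2.cbs[, "x.star"], est = naive.est)
msle(act = cal2.cbs[, "x.star"], est = naive.est)
# if the model beats this naive projection on both measures, it adds real signal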