Uses the BTYD package and Pareto/NBD model to predict customer behavior in R. Slides are at: http://www.slideshare.net/mattbagg/baggott-predict-customerinrpart1#
#
# PREDICTING LONG TERM CUSTOMER VALUE WITH BTYD PACKAGE
# Pareto/NBD (negative binomial distribution) modeling of
# repeat-buying behavior in a noncontractual setting
#
# Matthew Baggott, [email protected]
#
# Accompanying slides at:
# http://www.slideshare.net/mattbagg/baggott-predict-customerinrpart1#
#
# Based on Schmittlein, Morrison, and Colombo (1987), “Counting Your Customers:
# Who Are They and What Will They Do Next?” Management Science, 33, 1–24.
#
# Required data for the model is:
#
# "customer-by-sufficient-statistic" (cbs) matrix
# with the 'sufficient' stats being:
#   frequency of transactions
#   recency (time of last transaction) and
#   total time observed
# Main model params are:
#   r     unobserved shape parameter for the NBD transaction process
#   alpha unobserved scale parameter for the NBD transaction process
#   s     unobserved shape parameter for the dropout process
#   beta  unobserved scale parameter for the dropout process
# Data are divided into earlier calibration and later holdout segments,
# using a single date as the cut point. "cal" data are used
# to predict "holdout" data
#
############################################
############################################
#
# INSTALL AND LOAD NEEDED PACKAGES
#
############################################
toInstallCandidates <- c("ggplot2", "BTYD", "reshape2", "plyr", "lubridate")
# check which packages are not already installed
toInstall <- toInstallCandidates[!toInstallCandidates %in% library()$results[, 1]]
if (length(toInstall) != 0) {
  install.packages(toInstall, repos = "http://cran.r-project.org")
}
# load pkgs
lapply(toInstallCandidates, library, character.only = TRUE)
############################################
#
# LOAD DATA
#
############################################
# Data are 10% of the cohort of customers who made their first transactions
# with online retailer CDNOW (founded 1994) in the first quarter of 1997.
cdnowElog <- system.file("data/cdnowElog.csv", package = "BTYD")
elog <- read.csv(cdnowElog)               # read data
head(elog)                                # take a look
elog <- elog[, c(2, 3, 5)]                # keep customer id, date, and sales columns
names(elog) <- c("cust","date","sales")   # model functions expect these names
# convert yyyymmdd date strings to Date objects
elog$date <- as.Date(as.character(elog$date), format="%Y%m%d")
# Transaction-flow models, such as the Pareto/NBD, are concerned
# with interpurchase intervals.
# Since we only have dates and there may be multiple purchases on a day
# we merge all transactions that occurred on the same day
# using dc.MergeTransactionsOnSameDate()
elog <- dc.MergeTransactionsOnSameDate(elog)
head(elog)
summary(elog) # no NAs
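# optional sketch: quick look at cohort size and the span of the transaction log
length(unique(elog$cust)) # number of customers in the 10% sample
range(elog$date)          # first and last transaction dates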
############################################
#
# EXAMINE DATA
#
############################################
# plot each customer's sales over time, on a log scale
ggplot(elog, aes(x=date, y=sales, group=cust)) +
  geom_line(alpha=0.1) +
  scale_x_date() +
  scale_y_log10() +
  ggtitle("Sales for individual customers") +
  ylab("Sales ($, US)") + xlab("") +
  theme_minimal()
# look at days between orders
# model describes rates via a gamma distribution across customers
purchaseFreq <- ddply(elog, .(cust), summarize,
                      daysBetween = as.numeric(diff(date)))
# windows() opens a new plot device (Windows only; use dev.new() on other platforms)
windows()
ggplot(purchaseFreq, aes(x=daysBetween)) +
  geom_histogram(fill="orange") +
  xlab("Time between purchases (days)") +
  theme_minimal()
# optional: fit a gamma distribution to the interpurchase times
# (requires the fitdistrplus package, which is not loaded above)
# (fitg <- fitdist(purchaseFreq$daysBetween, "gamma", method="mme"))
# windows(); plot(fitg)
############################################
#
# DIVIDE DATA
#
############################################
# into a calibration phase
# and a holdout phase
# determine middle point for splitting
(end.of.cal.period <-
   min(elog$date) + as.numeric((max(elog$date) - min(elog$date)) / 2))
# split data into train (calibration) and test (holdout) and make matrices
data <- dc.ElogToCbsCbt(elog, per="week",
                        T.cal=end.of.cal.period,
                        merge.same.date=TRUE, # not needed, we already did it
                        statistic="freq")     # which CBT to return
# take a look
str(data)
# cbs is short for "customer-by-sufficient-statistic" matrix
# with the sufficient stats being:
# frequency
# recency (time of last transaction) and
# total time observed
# extract calibration matrix
cal2.cbs <- as.matrix(data[[1]][[1]])
str(cal2.cbs)
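# optional: peek at a few rows of the calibration cbs matrix
# (columns are x = repeat frequency, t.x = recency, T.cal = time observed)
head(cal2.cbs)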
############################################
#
# ESTIMATE PARAMETERS FOR MODEL
#
############################################
# initial estimate
(params2 <- pnbd.EstimateParameters(cal2.cbs))
# look at log likelihood
(LL <- pnbd.cbs.LL(params2, cal2.cbs))
# make a series of estimates, see if they converge
p.matrix <- c(params2, LL)
for (i in 1:20) {
  params2 <- pnbd.EstimateParameters(cal2.cbs, params2)
  LL <- pnbd.cbs.LL(params2, cal2.cbs)
  p.matrix.row <- c(params2, LL)
  p.matrix <- rbind(p.matrix, p.matrix.row)
}
# examine
p.matrix
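# optional sketch: confirm the refits converged by checking how much the
# log-likelihood (5th column of p.matrix) changes between iterations
round(diff(p.matrix[,5]), 6)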
# use final set of values
(params2 <- p.matrix[dim(p.matrix)[1],1:4])
# Main model params are:
#   r     gamma shape parameter for the NBD transaction process
#   alpha gamma scale parameter for the NBD transaction process
#   s     gamma shape parameter for the Pareto (exponential-gamma) dropout process
#   beta  gamma scale parameter for the Pareto (exponential-gamma) dropout process
############################################
#
# PLOT LOG-LIKELIHOOD ISO-CONTOURS FOR MAIN PARAMS
#
############################################
# set up parameter names for a more descriptive result
param.names <- c("r", "alpha", "s", "beta")
LL <- pnbd.cbs.LL(params2, cal2.cbs)
dc.PlotLogLikelihoodContours(pnbd.cbs.LL, params2, cal.cbs=cal2.cbs, n.divs=5,
                             num.contour.lines=7, zoom.percent=0.3,
                             allow.neg.params=FALSE, param.names=param.names)
############################################
#
# PLOT GROUP DISTRIBUTION OF PROPENSITY TO PURCHASE, DROPOUT
#
############################################
# par to make two plots side by side
par(mfrow=c(1,2))
# Plot the estimated distribution of lambda
# (customers' propensities to purchase)
pnbd.PlotTransactionRateHeterogeneity(params2, lim = NULL)
# lim is upper xlim
# Plot estimated distribution of gamma
# (customers' propensities to drop out).
pnbd.PlotDropoutRateHeterogeneity(params2)
# set par to normal
par(mfrow=c(1,1))
############################################
#
# EXAMINE INDIVIDUAL PREDICTIONS
#
############################################
# estimate the number of transactions a new customer
# will make in 52 weeks
pnbd.Expectation(params2, t = 52)
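# optional sketch: the same expectation at a few horizons (13, 26, and 52 weeks)
sapply(c(13, 26, 52), function(t) pnbd.Expectation(params2, t = t))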
# expected characteristics for a specific individual,
# conditional on their purchasing behavior during calibration
# calibration data for customer 1516
# frequency("x"), recency("t.x") and total time observed("T.cal")
cal2.cbs["1516",]
x <- cal2.cbs["1516", "x"] # x is frequency
t.x <- cal2.cbs["1516", "t.x"] # t.x is recency, i.e., time of last transaction
T.cal <- cal2.cbs["1516", "T.cal"] # T.cal is total time observed
# estimate transactions in a T.star-long duration for that cust
pnbd.ConditionalExpectedTransactions(params2, T.star = 52, # weeks
                                     x, t.x, T.cal)
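# optional sketch with hypothetical inputs: the same prediction for a customer
# who bought 10 times, most recently in week 38, and was observed for 39 weeks
pnbd.ConditionalExpectedTransactions(params2, T.star = 52,
                                     x = 10, t.x = 38, T.cal = 39)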
############################################
#
# PROBABILITY A CUSTOMER IS ALIVE AT END OF CALIBRATION / TRAINING
#
############################################
x # freq of purchase
t.x # week of last purchase
T.cal <- 39 # week of end of cal, i.e. present
pnbd.PAlive(params2, x, t.x, T.cal)
# To visualize the distribution of P(Alive) across customers:
params3 <- pnbd.EstimateParameters(cal2.cbs)
p.alives <- pnbd.PAlive(params3, cal2.cbs[,"x"], cal2.cbs[,"t.x"], cal2.cbs[,"T.cal"])
ggplot(as.data.frame(p.alives), aes(x=p.alives)) +
  geom_histogram(colour="grey", fill="orange") +
  ylab("Number of Customers") +
  xlab("Probability Customer is 'Live'") +
  theme_minimal()
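# optional one-number summary of the same distribution: the share of customers
# the model estimates are more likely alive than not
mean(p.alives > 0.5)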
# plot actual & expected customers binned by num of repeat transactions
pnbd.PlotFrequencyInCalibration(params2, cal2.cbs, censor=10,
                                title="Model vs. Reality during Calibration")
############################################
#
# HOW WELL DOES MODEL DO IN HOLDOUT PERIOD?
#
############################################
# get holdout transactions from dataframe data, add in as x.star
x.star <- data[[2]][[2]][,1]
cal2.cbs <- cbind(cal2.cbs, x.star)
str(cal2.cbs)
holdoutdates <- attributes(data[[2]][[1]])[[2]][[2]]
holdoutlength <- round(as.numeric(max(as.Date(holdoutdates)) -
                                  min(as.Date(holdoutdates))) / 7)
# plot predicted vs observed conditional frequencies and get matrix 'comp' with the values
T.star <- holdoutlength
censor <- 10 # bin all order counts at this value and above
comp <- pnbd.PlotFreqVsConditionalExpectedFrequency(params2, T.star,
                                                    cal2.cbs, x.star, censor)
rownames(comp) <- c("act", "exp", "bin")
comp
# plot predicted vs actual by week
# get data without each customer's first transaction; this drops those who bought only once
removedFirst.elog <- dc.SplitUpElogForRepeatTrans(elog)$repeat.trans.elog
removedFirst.cbt <- dc.CreateFreqCBT(removedFirst.elog)
# get all data, so we also have the customers who bought only once
allCust.cbt <- dc.CreateFreqCBT(elog)
# add the one-time buyers back into the matrix
tot.cbt <- dc.MergeCustomers(data.correct=allCust.cbt,
                             data.to.correct=removedFirst.cbt)
lengthInDays <- as.numeric(max(as.Date(colnames(tot.cbt))) -
                           min(as.Date(colnames(tot.cbt))))
origin <- min(as.Date(colnames(tot.cbt)))
tot.cbt.df <- melt(tot.cbt, varnames=c("cust","date"), value.name="Freq")
tot.cbt.df$date <- as.Date(tot.cbt.df$date)
tot.cbt.df$week <- as.numeric(1 + floor((tot.cbt.df$date - origin + 1) / 7))
transactByDay  <- ddply(tot.cbt.df, .(date), summarize, sum(Freq))
transactByWeek <- ddply(tot.cbt.df, .(week), summarize, sum(Freq))
names(transactByWeek) <- c("week", "Transactions")
names(transactByDay)  <- c("date", "Transactions")
T.cal <- cal2.cbs[,"T.cal"]
T.tot <- 78 # end of holdout
comparisonByWeek <- pnbd.PlotTrackingInc(params2, T.cal, T.tot,
                    actual.inc.tracking.data=transactByWeek$Transactions)
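# optional sketch: the cumulative version of the same tracking comparison
comparisonByWeekCum <- pnbd.PlotTrackingCum(params2, T.cal, T.tot,
                       actual.cu.tracking.data=cumsum(transactByWeek$Transactions))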
############################################
#
# FORMAL MEASURE OF ACCURACY
#
############################################
# root mean squared error
rmse <- function(est, act) { return(sqrt(mean((est-act)^2))) }
# mean squared logarithmic error
msle <- function(est, act) { return(mean((log1p(est)-log1p(act))^2)) }
str(cal2.cbs)
cal2.cbs[,"x"]
predict <- pnbd.ConditionalExpectedTransactions(params2, T.star = 38, # weeks
                                                x = cal2.cbs[,"x"],
                                                t.x = cal2.cbs[,"t.x"],
                                                T.cal = cal2.cbs[,"T.cal"])
cal2.cbs[,"x.star"] # actual transactions for each person
rmse(act=cal2.cbs[,"x.star"],est=predict)
msle(act=cal2.cbs[,"x.star"],est=predict)
# not useful without a comparison: is this better than guessing?
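# a rough baseline (a sketch): predict the same number of holdout transactions
# for everyone, namely the mean calibration repeat frequency
naiveGuess <- rep(mean(cal2.cbs[,"x"]), nrow(cal2.cbs))
rmse(act=cal2.cbs[,"x.star"], est=naiveGuess)
msle(act=cal2.cbs[,"x.star"], est=naiveGuess)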