Last active
January 21, 2016 14:59
-
-
Save marcovivero/0968807923b70c79862e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1. Load packages. Install any packages if necessary. | |
library(ggplot2) | |
library(MCMCpack) | |
library(rootSolve) | |
# 2. Extract data. | |
cv = read.csv("<FILE_PATH>/customerValue.csv", header = F)[, 1] | |
pcv = cv[which(cv > 0)] | |
pcvDF = data.frame(pcv) | |
names(pcvDF) = c("pcv") | |
### Data histogram. | |
ggplot(data = pcvDF) + | |
labs( | |
title = "Positive CV Score Histogram", | |
x ="CV Score (Dollars)", | |
y = NULL) + | |
theme(plot.title = element_text(face = "bold", size = 25)) + | |
theme_bw() + | |
geom_histogram( | |
aes(x = pcv), | |
fill = "orange", | |
alpha = 1, | |
colour = "grey", | |
lwd = 0.5, | |
binwidth = 25) + | |
coord_cartesian(xlim = c(0, 1000)) | |
# 3. From blog post, we set p to be 0.9 | |
p = 0.9 | |
# 4. Computing CV Cutoff using empirical quantiles. | |
### Function for computing pth quantile of | |
computeQuantile = function(x) { | |
quantile(x, c(p))[1] | |
} | |
### Compute the pth quantile of positive CV score data: 514.938 | |
pcvQuantile = computeQuantile(pcv) | |
### Function for getting sample of size n with replacement. | |
getSample = function(n) { | |
sample(pcv, n, replace = T) | |
} | |
### Function for computing boostrap sample for a numerical statistic. | |
getBootstrapEstimates = function(data, B, n, func) { | |
samples = numeric(B) | |
for (i in 1:B) { | |
bootstrapSample = getSample(n) | |
samples[i] = func(bootstrapSample) | |
} | |
return(samples) | |
} | |
### Get bootstrap sample for pth quantile estimates using B = 2500, n = 5000. | |
B = 5000 | |
n = 7500 | |
### Generate bootstrap quantile estimates. | |
quantileEstimates = getBootstrapEstimates(pcv, B, n, computeQuantile) | |
### Sorted estimates with actual positive CV score quantile. | |
sortedEstimatesWithPCVQuantile = sort(c(quantileEstimates, pcvQuantile)) | |
### Get estimate mean quantile q with respect to rest of bootstrap estimates. | |
q = which(sortedEstimatesWithPCVQuantile == pcvQuantile)[1]/ (B + 1) | |
upperQuantile = min(q + (0.95 / 2), 1) | |
lowerQuantile = max(q - (0.95 / 2), 0) | |
confidenceInterval = quantile(quantileEstimates, c(lowerQuantile, upperQuantile)) | |
### Confidence interval plot. | |
quantileEstimatesDF = data.frame(quantileEstimates) | |
names(quantileEstimatesDF)[1] = "x" | |
confidenceDF = data.frame( | |
c( | |
rep(confidenceInterval[1], 2), | |
rep(confidenceInterval[2], 2), | |
confidenceInterval[1]), | |
c(0, 1200, 1200, 0, 0)) | |
names(confidenceDF) = c("x", "y") | |
### Create bootstrap estimate plot with shaded confidence region. | |
ggplot() + | |
labs( | |
title = "Empirical Quantile Bootsrap Estimates With Shaded Confidence Set", | |
x ="90% Quantile Estimate", | |
y = NULL) + | |
theme(plot.title = element_text(face = "bold", size = 25)) + | |
theme_bw() + | |
geom_histogram( | |
aes(x = x), | |
data = quantileEstimatesDF, | |
fill = "orange", | |
alpha = 1, | |
colour = "grey", | |
lwd = 0.2, | |
binwidth = 3) + | |
geom_polygon( | |
aes(x = x, y = y), | |
data = confidenceDF, | |
fill = "red", | |
lwd = 0, | |
alpha = 0.5) + | |
geom_vline(xintercept = pcvQuantile, colour = "blue") + | |
coord_cartesian(xlim = c(400, 550)) | |
### Compute quantile using kernel density estimates. | |
densityQuantile = ewcdf(pcv, c(p)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment