marcovivero · January 21, 2016 14:59
diff --git a/cvCutoff.R b/cvCutoff.R
 # 1. Load packages. Install any packages if necessary.
 library(ggplot2)
 library(MCMCpack)
 library(rootSolve)

 # 2. Extract data.
 cv = read.csv("<FILE_PATH>/customerValue.csv", header = F)[, 1]
 pcv = cv[which(cv > 0)]
 pcvDF = data.frame(pcv)
 names(pcvDF) = c("pcv")

 ### Data histogram.
 ggplot(data = pcvDF) + 
  labs(
    title = "Positive CV Score Histogram", 
    x ="CV Score (Dollars)", 
    y = NULL) +
  theme(plot.title = element_text(face = "bold", size = 25)) +
  theme_bw() + 
  geom_histogram(
    aes(x = pcv),
    fill = "orange", 
    alpha = 1, 
    colour = "grey",
    lwd = 0.5,
    binwidth = 25) +
  coord_cartesian(xlim = c(0, 1000))

 # 3. From blog post, we set p to be 0.9
 p = 0.9

 # 4. Computing CV Cutoff using empirical quantiles.

 ### Function for computing pth quantile of 
 computeQuantile = function(x) {
  quantile(x, c(p))[1]
 }

 ### Compute the pth quantile of positive CV score data: 514.938
 pcvQuantile = computeQuantile(pcv)

 ### Function for getting sample of size n with replacement.
 getSample = function(n) {
  sample(pcv, n, replace = T)
 }

 ### Function for computing boostrap sample for a numerical statistic. 
 getBootstrapEstimates = function(data, B, n, func) {
  samples = numeric(B)
  for (i in 1:B) {
    bootstrapSample = getSample(n)
    samples[i] = func(bootstrapSample)
  }
  
  return(samples)
 }

 ### Get bootstrap sample for pth quantile estimates using B = 2500, n = 5000.
 B = 5000
 n = 7500
 ### Generate bootstrap quantile estimates.
 quantileEstimates = getBootstrapEstimates(pcv, B, n, computeQuantile)
 ### Sorted estimates with actual positive CV score quantile.
 sortedEstimatesWithPCVQuantile = sort(c(quantileEstimates, pcvQuantile))
 ### Get estimate mean quantile q with respect to rest of bootstrap estimates.
 q = which(sortedEstimatesWithPCVQuantile == pcvQuantile)[1]/ (B + 1)
 upperQuantile = min(q + (0.95 / 2), 1)
 lowerQuantile = max(q - (0.95 / 2), 0)
 confidenceInterval = quantile(quantileEstimates, c(lowerQuantile, upperQuantile))

 ### Confidence interval plot.
 quantileEstimatesDF = data.frame(quantileEstimates)
 names(quantileEstimatesDF)[1] = "x"
 confidenceDF = data.frame(
  c(
    rep(confidenceInterval[1], 2), 
    rep(confidenceInterval[2], 2), 
    confidenceInterval[1]),
  c(0, 1200, 1200, 0, 0))
 names(confidenceDF) = c("x", "y")

 ### Create bootstrap estimate plot with shaded confidence region.
 ggplot() + 
  labs(
    title = "Empirical Quantile Bootsrap Estimates With Shaded Confidence Set", 
    x ="90% Quantile Estimate", 
    y = NULL) +
  theme(plot.title = element_text(face = "bold", size = 25)) +
  theme_bw() + 
  geom_histogram(
    aes(x = x),
    data = quantileEstimatesDF,
    fill = "orange", 
    alpha = 1, 
    colour = "grey",
    lwd = 0.2,
    binwidth = 3) +
  geom_polygon(
    aes(x = x, y = y),
    data = confidenceDF,
    fill = "red",
    lwd = 0,
    alpha = 0.5) + 
  geom_vline(xintercept = pcvQuantile, colour = "blue") + 
  coord_cartesian(xlim = c(400, 550))

 ### Compute quantile using kernel density estimates.
 densityQuantile = ewcdf(pcv, c(p))
	# 1. Load packages. Install any packages if necessary.
	library(ggplot2)
	library(MCMCpack)
	library(rootSolve)

	# 2. Extract data.
	cv = read.csv("<FILE_PATH>/customerValue.csv", header = F)[, 1]
	pcv = cv[which(cv > 0)]
	pcvDF = data.frame(pcv)
	names(pcvDF) = c("pcv")

	### Data histogram.
	ggplot(data = pcvDF) +
	labs(
	title = "Positive CV Score Histogram",
	x ="CV Score (Dollars)",
	y = NULL) +
	theme(plot.title = element_text(face = "bold", size = 25)) +
	theme_bw() +
	geom_histogram(
	aes(x = pcv),
	fill = "orange",
	alpha = 1,
	colour = "grey",
	lwd = 0.5,
	binwidth = 25) +
	coord_cartesian(xlim = c(0, 1000))

	# 3. From blog post, we set p to be 0.9
	p = 0.9

	# 4. Computing CV Cutoff using empirical quantiles.

	### Function for computing pth quantile of
	computeQuantile = function(x) {
	quantile(x, c(p))[1]
	}

	### Compute the pth quantile of positive CV score data: 514.938
	pcvQuantile = computeQuantile(pcv)

	### Function for getting sample of size n with replacement.
	getSample = function(n) {
	sample(pcv, n, replace = T)
	}

	### Function for computing boostrap sample for a numerical statistic.
	getBootstrapEstimates = function(data, B, n, func) {
	samples = numeric(B)
	for (i in 1:B) {
	bootstrapSample = getSample(n)
	samples[i] = func(bootstrapSample)
	}

	return(samples)
	}

	### Get bootstrap sample for pth quantile estimates using B = 2500, n = 5000.
	B = 5000
	n = 7500
	### Generate bootstrap quantile estimates.
	quantileEstimates = getBootstrapEstimates(pcv, B, n, computeQuantile)
	### Sorted estimates with actual positive CV score quantile.
	sortedEstimatesWithPCVQuantile = sort(c(quantileEstimates, pcvQuantile))
	### Get estimate mean quantile q with respect to rest of bootstrap estimates.
	q = which(sortedEstimatesWithPCVQuantile == pcvQuantile)[1]/ (B + 1)
	upperQuantile = min(q + (0.95 / 2), 1)
	lowerQuantile = max(q - (0.95 / 2), 0)
	confidenceInterval = quantile(quantileEstimates, c(lowerQuantile, upperQuantile))

	### Confidence interval plot.
	quantileEstimatesDF = data.frame(quantileEstimates)
	names(quantileEstimatesDF)[1] = "x"
	confidenceDF = data.frame(
	c(
	rep(confidenceInterval[1], 2),
	rep(confidenceInterval[2], 2),
	confidenceInterval[1]),
	c(0, 1200, 1200, 0, 0))
	names(confidenceDF) = c("x", "y")

	### Create bootstrap estimate plot with shaded confidence region.
	ggplot() +
	labs(
	title = "Empirical Quantile Bootsrap Estimates With Shaded Confidence Set",
	x ="90% Quantile Estimate",
	y = NULL) +
	theme(plot.title = element_text(face = "bold", size = 25)) +
	theme_bw() +
	geom_histogram(
	aes(x = x),
	data = quantileEstimatesDF,
	fill = "orange",
	alpha = 1,
	colour = "grey",
	lwd = 0.2,
	binwidth = 3) +
	geom_polygon(
	aes(x = x, y = y),
	data = confidenceDF,
	fill = "red",
	lwd = 0,
	alpha = 0.5) +
	geom_vline(xintercept = pcvQuantile, colour = "blue") +
	coord_cartesian(xlim = c(400, 550))

	### Compute quantile using kernel density estimates.
	densityQuantile = ewcdf(pcv, c(p))