# Example usage of R data.table in an incident analysis
# the data was closed incidents in a tab-separated txt file that looked like this
# week category P1 P2 P3
# read into R it formed a data frame like so
# 1 28/05/2012 some category1 NA NA 1
# 2 28/05/2012 some category2 NA 2 1
# 3 28/05/2012 some category3 1 NA 1
# 4 28/05/2012 some category4 NA NA 2
# as it happens, this data was manually collected from Excel spreadsheets
# unfortunately the spreadsheets covered differing week ranges, and overlapping weeks
# occasionally had differing values for P1, P2, P3
#
# the best approach here would be to take the last (i.e. latest) values for a given week and category
# because these would have been the most accurate (containing all those recently closed)
# the data frame was ordered by recency of the data so this could potentially work...
# otherwise a rough alternative is just to average the data :)
#
# how about we try both and check they roughly compare...
# start by loading up a data.frame
incidents <- read.delim("C:/Temp/incidents1.txt", header=F)
View(incidents)
colnames(incidents) <- c("week", "category", "P1", "P2", "P3")
nrow(incidents)
nrow(unique(incidents))
# 2587 rows in either case above
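# as an aside, base R can express the "keep the latest row per week/category" idea directly -
# a minimal sketch, assuming later rows really are the more recent ones (this is equivalent
# to the max(index) data.table approach worked through further down)
latestIncidents <- incidents[!duplicated(incidents[, c("week", "category")], fromLast=TRUE), ]
nrow(latestIncidents) # should come out at one row per week/category combination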
# plot out the data - to do this we need to reshape so that the P1, P2, P3 columns become two columns:
# one giving the type of the incident, the other the count of that incident type
library(reshape)
incidentsMelt <- melt(incidents[,c("week", "category", "P1", "P2", "P3")],id=c("week", "category"))
library(ggplot2)
qplot(as.Date(week,"%d/%m/%Y"), value, data=incidentsMelt, colour = variable)
# time to sort this data out
library(data.table)
incidentsDT <- data.table(incidents)
setkey(incidentsDT, week, category)
# data.table gives speedy queries when data > 100k rows (not the case here)
# fast binary scan across keyed data - notice the J to denote a select
incidentsDT[J("28/05/2012", "some categoryA")]
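# (J(), list() and .() are interchangeable in the i position here, so the same keyed lookup
#  can also be written as)
incidentsDT[.("28/05/2012", "some categoryA")]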
# a vector scan this time, but it allows wildcard/regex matching via %like%
incidentsDT[(week=="28/05/2012" & category %like% "Application Delvry")]
#
# let's try the mean approach to dealing with repeated, but differing, entries
#
# note: <- does not copy a data.table, so the := below also adds these columns to incidentsDT;
# copy() would avoid that if it mattered here
incidentsDT2 <- incidentsDT
incidentsDT2[, MeanP1:=mean(P1), by=list(week, category)]
incidentsDT2[, MeanP2:=mean(P2), by=list(week, category)]
incidentsDT2[, MeanP3:=mean(P3), by=list(week, category)]
head(incidentsDT2)
nrow(incidentsDT2) # 2587 rows
tables()
sapply(incidentsDT2, class)
# selecting columns of a data frame
head(incidents[,c('week', 'category')])
# selecting columns of a data table - use list instead of c
tail(incidentsDT2[, list(week, category)])
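# .() is data.table shorthand for list(), so the line above can equally be written as
tail(incidentsDT2[, .(week, category)])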
nrow(unique(incidentsDT2[, list(week, category, MeanP1, MeanP2, MeanP3)])) # 2381 rows
#
# let's try a data.table aggregation approach
# look for the most recent value of P1/P2/P3, based on the max index within a group by week & category
#
head(incidents)
# add an index value for the row
incidents$index <- as.numeric(rownames(incidents))
head(incidents)
incidentsDT3 <- data.table(incidents)
head(incidentsDT3)
# what does this aggregation look like...
head(incidentsDT3[,list(index1=max(index), P1, P2, P3), by=list(week,category)])
# giving 2587 rows...
nrow(incidentsDT3[,list(max(index), P1, P2, P3), by=list(week,category)])
# and 2416 unique rows - why is that different from the 2381 obtained above?
# because duplicate week/category rows can still differ in their P1/P2/P3 values -
# we haven't yet filtered each group down to its latest row, and there's no SQL-style HAVING clause to do it in one go
# a better view is this
head(incidentsDT3[,list(index, index1=max(index), P1, P2, P3), by=list(week,category)])
# notice that index1 is repeated with the max value for each row:
# week category index index1 P1 P2 P3
# 1: 28/05/2012 some categoryA 1 1156 NA NA 1
# 2: 28/05/2012 some categoryA 1156 1156 NA NA 3
#
# but you can't just include an expression such as index==index1 in the i position
# because i is evaluated before j
# bugger, does this force me to take a two-step approach?
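# (a one-step alternative, as a sketch: .SD holds each group's rows, so taking the row with
#  the largest index per week/category avoids building the intermediate table at all)
head(incidentsDT3[, .SD[which.max(index)], by=list(week, category)])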
incidentsDT4 <- incidentsDT3[,list(index, index1=max(index), P1, P2, P3), by=list(week,category)]
head(incidentsDT4[index == index1, list(index, index1, P1, P2, P3), by = list(week, category)])
# and the row count is 2381, matching the result from the mean approach
nrow(incidentsDT4[index == index1, list(index, index1, P1, P2, P3), by = list(week, category)])
#
# let's use up some more memory and create a cleaner data table
#
incidentsDT5 <- incidentsDT4[index == index1, list(P1, P2, P3), by = list(week, category)]
# should I make NA 0? Maybe not
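# (if we did want zeros instead of NAs, a minimal sketch using set(), which assigns by reference -
#  left commented out so the NA handling below stays as discussed)
# for (col in c("P1", "P2", "P3")) set(incidentsDT5, which(is.na(incidentsDT5[[col]])), col, 0L)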
#
# let's make some weekly histograms
#
hist(incidentsDT5[,list(SumP1=sum(P1)),by=week][,SumP1]) # oops - NA + a valid number = NA
hist(incidentsDT5[,list(SumP2=sum(P2)),by=week][,SumP2]) # ditto
hist(incidentsDT5[,list(SumP3=sum(P3)),by=week][,SumP3]) # this works because there are no NAs
hist(incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,SumP1]) # lots of zeros in here - might be sensible to model - does zero-inflated regression apply here? (see sketch below)
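# (a sketch of that zero-inflated idea: model the weekly P1 counts with a zero-inflated Poisson
#  against a simple time trend - this assumes the pscl package is available; it isn't used
#  anywhere else in this analysis)
library(pscl)
weeklyP1 <- incidentsDT5[, list(SumP1=sum(P1, na.rm=T)), by=week]
weeklyP1$days <- as.numeric(as.Date(weeklyP1$week, "%d/%m/%Y"))
summary(zeroinfl(SumP1 ~ days, data=weeklyP1, dist="poisson"))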
# how about we change NA to allow summation
# remove any 0 values
# and if we're doing this, then why not create some more clean data tables
WeeklySumP1=incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][SumP1>0,list(days=as.integer(as.Date(week,"%d/%m/%Y")), week=as.Date(week,"%d/%m/%Y"),SumP1)]
WeeklySumP2=incidentsDT5[,list(SumP2=sum(P2, na.rm=T)),by=week][SumP2>0,list(days=as.numeric(as.Date(week,"%d/%m/%Y")), week=as.Date(week,"%d/%m/%Y"),SumP2)]
WeeklySumP3=incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][SumP3>0,list(days=as.numeric(as.Date(week,"%d/%m/%Y")), week=as.Date(week,"%d/%m/%Y"),SumP3)]
hist(WeeklySumP1[,SumP1])
hist(WeeklySumP2[,SumP2])
hist(WeeklySumP3[,SumP3])
head(hist(incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,SumP1]))
# plot out the weekly P1, P2, P3 so we can see how they trend
qplot(week,SumP1,data=incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,list(week=as.Date(week,"%d/%m/%Y"),SumP1)])
# interesting - the above includes the 0 value points, which we don't want to see
qplot(week,SumP1,data=incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][SumP1>0,list(week=as.Date(week,"%d/%m/%Y"),SumP1)])
qplot(week,SumP2,data=incidentsDT5[,list(SumP2=sum(P2, na.rm=T)),by=week][SumP2>0,list(week=as.Date(week,"%d/%m/%Y"),SumP2)])
qplot(week,SumP3,data=incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][SumP3>0,list(week=as.Date(week,"%d/%m/%Y"),SumP3)])
#
# the P3 plot shows the distribution of points climbing in a linear fashion
# what I think should be done is evaluate the distribution at 3 points in time,
# fit to a Poisson or whatever best fits all 3,
# then apply a linear contribution to make this grow appropriately over time
# (a rough sketch of that idea follows)
#
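# (sketch of "evaluate the distribution at 3 points in time": split the weekly P3 sums into
#  three consecutive windows and fit a Poisson to each, to see whether the shape stays stable
#  while the mean drifts upward - the window split is my assumption, not something done below)
library(MASS)
p3ByWindow <- split(WeeklySumP3$SumP3, cut(seq_along(WeeklySumP3$SumP3), 3, labels=c("early", "middle", "late")))
lapply(p3ByWindow, function(x) fitdistr(x, "Poisson"))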
# let's try some dead salmon views :)
# (make these data.tables a little easier... and note that vwReg wants them coerced to data.frames)
# so I will now rudely interrupt my code with a function
##############################################################################################
# Copyright 2012 Felix Schönbrodt
# All rights reserved.
#
# FreeBSD License
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation
# are those of the authors and should not be interpreted as representing
# official policies, either expressed or implied, of the copyright
# holder.
# Version history:
# 0.1: original code
# 0.1.1: changed license to FreeBSD; re-established compatibility with ggplot2 (new version 0.9.2)
## Visually weighted regression / Watercolor plots
## Idea: Solomon Hsiang, with additional ideas from many blog commenters
# B = number of bootstrapped smoothers
# shade: plot the shaded confidence region?
# shade.alpha: should the CI shading fade out at the edges? (by reducing alpha; 0 = no alpha decrease, 0.1 = medium alpha decrease, 0.5 = strong alpha decrease)
# spag: plot spaghetti lines?
# spag.color: color of spaghetti lines
# mweight: should the median smoother be visually weighted?
# show.lm: should the linear regression line be plotted?
# show.CI: should the 95% CI limits be plotted?
# show.median: should the median smoother be plotted?
# median.col: color of the median smoother
# shape: shape of points
# method: the fitting function for the spaghettis; default: loess
# bw = TRUE: define a default b&w palette
# slices: number of slices in x and y direction for the shaded region. Higher numbers make a smoother plot, but take longer to draw. I wouldn't go beyond 500
# palette: provide a custom color palette for the watercolors
# ylim: restrict range of the watercoloring
# quantize: either "continuous", or "SD". In the latter case, we get three color regions for 1, 2, and 3 SD (an idea of John Mashey)
# add: if add == FALSE, a new ggplot is returned. If add == TRUE, only the elements are returned, which can be added to an existing ggplot (with the '+' operator)
# ...: further parameters passed to the fitting function; in the case of loess, for example, "span = .9", or "family = 'symmetric'"
vwReg <- function(formula, data, title="", B=1000, shade=TRUE, shade.alpha=.1, spag=FALSE, spag.color="darkblue", mweight=TRUE, show.lm=FALSE, show.median = TRUE, median.col = "white", shape = 21, show.CI=FALSE, method=loess, bw=FALSE, slices=200, palette=colorRampPalette(c("#FFEDA0", "#DD0000"), bias=2)(20), ylim=NULL, quantize = "continuous", add=FALSE, ...) {
  IV <- all.vars(formula)[2]
  DV <- all.vars(formula)[1]
  data <- na.omit(data[order(data[, IV]), c(IV, DV)])
  if (bw == TRUE) {
    palette <- colorRampPalette(c("#EEEEEE", "#999999", "#333333"), bias=2)(20)
  }
  print("Computing bootstrapped smoothers ...")
  newx <- data.frame(seq(min(data[, IV]), max(data[, IV]), length=slices))
  colnames(newx) <- IV
  l0.boot <- matrix(NA, nrow=nrow(newx), ncol=B)
  l0 <- method(formula, data)
  for (i in 1:B) {
    data2 <- data[sample(nrow(data), replace=TRUE), ]
    data2 <- data2[order(data2[, IV]), ]
    if (class(l0)=="loess") {
      m1 <- method(formula, data2, control = loess.control(surface = "i", statistics="a", trace.hat="a"), ...)
    } else {
      m1 <- method(formula, data2, ...)
    }
    l0.boot[, i] <- predict(m1, newdata=newx)
  }
  # compute median and CI limits of bootstrap
  library(plyr)
  library(reshape2)
  CI.boot <- adply(l0.boot, 1, function(x) quantile(x, prob=c(.025, .5, .975, pnorm(c(-3, -2, -1, 0, 1, 2, 3))), na.rm=TRUE))[, -1]
  colnames(CI.boot)[1:10] <- c("LL", "M", "UL", paste0("SD", 1:7))
  CI.boot$x <- newx[, 1]
  CI.boot$width <- CI.boot$UL - CI.boot$LL
  # scale the CI width to the range 0 to 1 and flip it (bigger numbers = narrower CI)
  CI.boot$w2 <- (CI.boot$width - min(CI.boot$width))
  CI.boot$w3 <- 1-(CI.boot$w2/max(CI.boot$w2))
  # convert bootstrapped spaghettis to long format
  b2 <- melt(l0.boot)
  b2$x <- newx[,1]
  colnames(b2) <- c("index", "B", "value", "x")
  library(ggplot2)
  library(RColorBrewer)
  # Construct ggplot
  # All plot elements are constructed as a list, so they can be added to an existing ggplot
  # if add == FALSE: provide the basic ggplot object
  p0 <- ggplot(data, aes_string(x=IV, y=DV)) + theme_bw()
  # initialize elements with NULL (if they are defined, they are overwritten with something meaningful)
  gg.tiles <- gg.poly <- gg.spag <- gg.median <- gg.CI1 <- gg.CI2 <- gg.lm <- gg.points <- gg.title <- NULL
  if (shade == TRUE) {
    quantize <- match.arg(quantize, c("continuous", "SD"))
    if (quantize == "continuous") {
      print("Computing density estimates for each vertical cut ...")
      flush.console()
      if (is.null(ylim)) {
        min_value <- min(min(l0.boot, na.rm=TRUE), min(data[, DV], na.rm=TRUE))
        max_value <- max(max(l0.boot, na.rm=TRUE), max(data[, DV], na.rm=TRUE))
        ylim <- c(min_value, max_value)
      }
      # vertical cross-sectional density estimate
      d2 <- ddply(b2[, c("x", "value")], .(x), function(df) {
        res <- data.frame(density(df$value, na.rm=TRUE, n=slices, from=ylim[1], to=ylim[2])[c("x", "y")])
        #res <- data.frame(density(df$value, na.rm=TRUE, n=slices)[c("x", "y")])
        colnames(res) <- c("y", "dens")
        return(res)
      }, .progress="text")
      maxdens <- max(d2$dens)
      mindens <- min(d2$dens)
      d2$dens.scaled <- (d2$dens - mindens)/maxdens
      ## Tile approach
      d2$alpha.factor <- d2$dens.scaled^shade.alpha
      gg.tiles <- list(geom_tile(data=d2, aes(x=x, y=y, fill=dens.scaled, alpha=alpha.factor)), scale_fill_gradientn("dens.scaled", colours=palette), scale_alpha_continuous(range=c(0.001, 1)))
    }
    if (quantize == "SD") {
      ## Polygon approach
      SDs <- melt(CI.boot[, c("x", paste0("SD", 1:7))], id.vars="x")
      count <- 0
      d3 <- data.frame()
      col <- c(1,2,3,3,2,1)
      for (i in 1:6) {
        seg1 <- SDs[SDs$variable == paste0("SD", i), ]
        seg2 <- SDs[SDs$variable == paste0("SD", i+1), ]
        seg <- rbind(seg1, seg2[nrow(seg2):1, ])
        seg$group <- count
        seg$col <- col[i]
        count <- count + 1
        d3 <- rbind(d3, seg)
      }
      gg.poly <- list(geom_polygon(data=d3, aes(x=x, y=value, color=NULL, fill=col, group=group)), scale_fill_gradientn("dens.scaled", colours=palette, values=seq(-1, 3, 1)))
    }
  }
  print("Build ggplot figure ...")
  flush.console()
  if (spag==TRUE) {
    gg.spag <- geom_path(data=b2, aes(x=x, y=value, group=B), size=0.7, alpha=10/B, color=spag.color)
  }
  if (show.median == TRUE) {
    if (mweight == TRUE) {
      gg.median <- geom_path(data=CI.boot, aes(x=x, y=M, alpha=w3^3), size=.6, linejoin="mitre", color=median.col)
    } else {
      gg.median <- geom_path(data=CI.boot, aes(x=x, y=M), size = 0.6, linejoin="mitre", color=median.col)
    }
  }
  # Confidence limits
  if (show.CI == TRUE) {
    gg.CI1 <- geom_path(data=CI.boot, aes(x=x, y=UL), size=1, color="red")
    gg.CI2 <- geom_path(data=CI.boot, aes(x=x, y=LL), size=1, color="red")
  }
  # plain linear regression line
  if (show.lm==TRUE) {gg.lm <- geom_smooth(method="lm", color="darkgreen", se=FALSE)}
  gg.points <- geom_point(data=data, aes_string(x=IV, y=DV), size=2, shape=shape, fill="tan", color="black")
  if (title != "") {
    gg.title <- ggtitle(title)
  }
  gg.elements <- list(gg.tiles, gg.poly, gg.spag, gg.median, gg.CI1, gg.CI2, gg.lm, gg.points, gg.title, theme(legend.position="none"))
  if (add == FALSE) {
    return(p0 + gg.elements)
  } else {
    return(gg.elements)
  }
}
#######################################################
vwReg(SumP1~days, as.data.frame(WeeklySumP1), span=1, spag=T, show.lm=T, palette=brewer.pal(9,"YlGnBu")) # I like these better
vwReg(SumP2~days, as.data.frame(WeeklySumP2), span=1, spag=T, show.lm=T, palette=brewer.pal(9,"YlGnBu"))
vwReg(SumP3~days, as.data.frame(WeeklySumP3), span=1, spag=T, show.lm=T, palette=brewer.pal(9,"YlGnBu"))
#
# those P3s are increasing over time but the distribution looks the same, so let's fit a linear model and display the equation on the plot
# I've copied this handy function from http://stackoverflow.com/questions/7549694/ggplot2-adding-regression-line-equation-and-r2-on-graph
#
lm_eqn = function(m) {
  l <- list(a = format(coef(m)[1], digits = 2),
            b = format(abs(coef(m)[2]), digits = 2),
            r2 = format(summary(m)$r.squared, digits = 3));
  if (coef(m)[2] >= 0) {
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
  } else {
    eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
  }
  as.character(as.expression(eq));
}
P3=incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][SumP3>0,list(week=as.Date(week,"%d/%m/%Y"),SumP3)]
ggplot(P3, aes(x=week, y=SumP3)) +
  geom_point() +
  geom_smooth(method=lm) +
  geom_text(x=as.integer(as.Date("2013-05-20")), y=70, label = lm_eqn(lm(SumP3 ~ week, P3)), parse=T)
#
# so I think the approach is this
#
# create an empty plot
# iterate over some number of trials
# each trial is a sample with replacement going out some number of weeks e.g. 3*52
# but should we account for the decreasing rate of P1s and P2s, and the increasing rate of P3s?
# suggest it's safest to keep the P1/P2 rates the same and allow for an increasing P3 rate with a linear model
# calculate the cost each week (cost of P1s + cost of P2s + cost of P3s)
# add a line to a plot
#
# but first, let's plot the historic data
#
mergedWeeklyIncidents <- merge(merge(WeeklySumP1, WeeklySumP2), WeeklySumP3)
mergedWeeklyIncidents$cost <- 100000*mergedWeeklyIncidents$SumP1 +
  10000*mergedWeeklyIncidents$SumP2 +
  1000*mergedWeeklyIncidents$SumP3
# language parsing for cbind() and c() seems easier in either case if we
# don't put expressions inside, so let's build some intermediate steps
projecting=3*52 # 3 years in week intervals
projectedWeeks = seq(max(mergedWeeklyIncidents$week), by="week", length.out=projecting)
trials=10
CostOfP1 = 100000
CostOfP2 = 10000
CostOfP3 = 1000
# x positions for the 1000 sampled weeks below: assume they simply follow on
# from the end of the historic series
xs = nrow(mergedWeeklyIncidents) + 1:1000
x2 = max(xs)
plot(mergedWeeklyIncidents$cost, type="l", col=rgb(0,1,0), xlim=c(0,x2), xlab="week", ylab="Cost/week")
for(i in 1:trials) {
  P1Cost=CostOfP1*sample(incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,SumP1], 1000, replace=T)
  P2Cost=CostOfP2*sample(incidentsDT5[,list(SumP2=sum(P2, na.rm=T)),by=week][,SumP2], 1000, replace=T)
  P3Cost=CostOfP3*sample(incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][,SumP3], 1000, replace=T)
  projectedWeeklyCost <- data.frame(cbind(x=xs, y=P1Cost+P2Cost+P3Cost))
  lines(projectedWeeklyCost, col=rgb(0,0,1, alpha=0.1))
}
# add an average line
avgWeeklyCost = CostOfP1*mean(incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,SumP1]) +
  CostOfP2*mean(incidentsDT5[,list(SumP2=sum(P2, na.rm=T)),by=week][,SumP2]) +
  CostOfP3*mean(incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][,SumP3])
lines(data.frame(cbind(x=xs, y=rep(avgWeeklyCost,1000))), col=rgb(1,0,0))
#
# that worked but it's clear that the distributions of P1s, P2s and P3s are changing with
# time - the P1s and P2s are decreasing a bit but the P3s are increasing significantly
# how about we fit the weekly sum of the P3s to a linear model and extrapolate out?
#
# actually, how about we subtract the linear trend off and redo the distribution, then take our
# samples and add the linear contribution back in
# this way, we should get a cleaner distribution and better bootstrapping for future values
#
# just to double check the validity of this, first I want to plot estimated values against
# the original
t <- WeeklySumP3
t$est <- 0.01042*t$days - 121.30926
ggplot(t, aes(days, SumP3)) + geom_point() + geom_point(data=t, aes(days, est))
# yep, the lm was correct
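# (rather than hard-coding 0.01042 / -121.30926, the same numbers can be pulled straight from the
#  fit - a small sketch, assuming the coefficients above came from lm(SumP3 ~ days) on WeeklySumP3)
p3lm <- lm(SumP3 ~ days, data=WeeklySumP3)
coef(p3lm) # intercept and slope, usable in place of the literals below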
# so now I'm going to create a new data set with the linear contribution subtracted
# and baselined at the beginning
t$base <- t$SumP3 - (t$days-min(t$days))*0.01042
# check we're right
ggplot(t, aes(days, SumP3)) + geom_point() +
  geom_point(data=t, aes(days, est, colour='red')) +
  geom_point(data=t, aes(days, base, colour='blue'))
# weird, the colours got mixed up - the aesthetics must be getting changed (see note below)
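# (the mixed-up colours happen because colour='red' / colour='blue' sit inside aes():
#  inside aes() those strings are treated as data values mapped to the default palette,
#  not as literal colours - moving colour outside aes() gives the intended result)
ggplot(t, aes(days, SumP3)) + geom_point() +
  geom_point(data=t, aes(days, est), colour='red') +
  geom_point(data=t, aes(days, base), colour='blue')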
# let's try fitting that adjusted distribution to something from which we can calculate values
library(MASS)
# we'll try a few... :) (the fill= inside aes() below has the same colour-mapping quirk noted above)
fitdistr(t$SumP3, 'Poisson')
ggplot(data=data.frame(cbind(original=t$SumP3, predict=rpois(1000,37)))) +
  geom_histogram(aes(original, fill="red", alpha=0.2, binwidth=10)) +
  geom_histogram(aes(predict, fill="blue", alpha=0.2, binwidth=10))
fitdistr(t$SumP3, 'weibull')
ggplot(data=data.frame(cbind(original=t$SumP3, predict=rweibull(1000, 3.16, 41.2)))) +
  geom_histogram(aes(x=original, fill="red", alpha=0.2, binwidth=10)) +
  geom_histogram(aes(x=predict, fill="blue", alpha=0.2, binwidth=10))
fitdistr(t$SumP3, 'log-normal')
ggplot(data=data.frame(cbind(original=t$SumP3, predict=rlnorm(1000, meanlog=3.5, sdlog=0.42)))) +
  geom_histogram(colour="red", aes(x=original, alpha=0.2, binwidth=10)) +
  geom_histogram(colour="blue", aes(x=predict, alpha=0.2, binwidth=10))
# nice, but perhaps better to just bootstrap from the real data and apply the linear adjustment
# first, replot the historical data - note that I originally had this graph generated
# from a vector representation of the cost - this had the data sorted in reverse order
# lesson - always make sure you have x and y!
plot(mergedWeeklyIncidents$week, mergedWeeklyIncidents$cost, type="l", col=rgb(0,1,0), xlim=c(min(mergedWeeklyIncidents$week), max(projectedWeeks)), xlab="week", ylab="Cost/week")
# then project out a number of weeks using the variable 'projecting'
for (trial in 1:trials) {
  projectedWeeklyCost = data.frame(week=projectedWeeks,
                                   P1Cost = CostOfP1*sample(WeeklySumP1$SumP1, length(projectedWeeks), replace=T),
                                   P2Cost = CostOfP2*sample(WeeklySumP2$SumP2, length(projectedWeeks), replace=T))
  # this next one is a little tricky - the sample gets a linear contribution added
  projectedWeeklyCost$P3SampledBase <- sample(t$base, nrow(projectedWeeklyCost), replace=T)
  projectedWeeklyCost$P3LinearAdjusted <- projectedWeeklyCost$P3SampledBase + (projectedWeeklyCost$week-min(mergedWeeklyIncidents$week))*0.01042
  projectedWeeklyCost$P3Cost <- CostOfP3*projectedWeeklyCost$P3LinearAdjusted
  projectedWeeklyCost$TotalWeeklyCost <- projectedWeeklyCost$P1Cost + projectedWeeklyCost$P2Cost + projectedWeeklyCost$P3Cost
  lines(projectedWeeklyCost$week, projectedWeeklyCost$TotalWeeklyCost, col=rgb(0,0,1, alpha=0.2))
}
#
# that worked but I want to make this work with ggplot
#
library(scales)
p <- ggplot() + scale_y_continuous(labels=dollar, "Weekly Cost") + xlab("Year") + theme(text=element_text(size=20))
p <- p + geom_line(data=mergedWeeklyIncidents, colour="green", aes(x = week, y = cost)) +
  xlim(min(mergedWeeklyIncidents$week),max(projectedWeeklyCost$week))
# then project out a number of weeks using the variable 'projecting'
for (trial in 1:trials) {
  projectedWeeklyCost = data.frame(week=projectedWeeks,
                                   P1Cost = CostOfP1*sample(WeeklySumP1$SumP1, length(projectedWeeks), replace=T),
                                   P2Cost = CostOfP2*sample(WeeklySumP2$SumP2, length(projectedWeeks), replace=T))
  # this next one is a little tricky - the sample gets a linear contribution added
  projectedWeeklyCost$P3SampledBase <- sample(t$base, nrow(projectedWeeklyCost), replace=T)
  projectedWeeklyCost$P3LinearAdjusted <- projectedWeeklyCost$P3SampledBase + (projectedWeeklyCost$week-min(mergedWeeklyIncidents$week))*0.01042
  projectedWeeklyCost$P3Cost <- CostOfP3*projectedWeeklyCost$P3LinearAdjusted
  projectedWeeklyCost$TotalWeeklyCost <- projectedWeeklyCost$P1Cost + projectedWeeklyCost$P2Cost + projectedWeeklyCost$P3Cost
  p <- p + geom_line(data=projectedWeeklyCost, colour="blue", alpha=0.1, linewidth=0.1, aes(x = week, y = as.integer(TotalWeeklyCost)))
}
print(p)
# and a histogram of the projected costs
qplot(as.numeric(TotalWeeklyCost), data=projectedWeeklyCost, geom="histogram", binwidth=20000) + scale_x_continuous(labels=dollar, "Weekly Cost")