# Example usage of R data.table in an incident analysis
# the data was closed incidents in a tab-separated txt file that looked like this
# week category P1 P2 P3
# read into R it formed a data frame like so
# 1 28/05/2012 some category1 NA NA 1
# 2 28/05/2012 some category2 NA 2 1
# 3 28/05/2012 some category3 1 NA 1
# 4 28/05/2012 some category4 NA NA 2
# as it happens, this data was manually collected from Excel spreadsheets
# unfortunately the spreadsheets covered differing week ranges, and overlapping weeks
# occasionally had differing values for P1, P2, P3
#
# the best approach here would be to take the last (i.e. latest) values for a given week and category
# because these would have been the most accurate (containing all those recently closed)
# the data frame was ordered by recency of the data so this could potentially work...
# otherwise a rough alternative is just to average the data :)
#
# how about we try both and check they roughly compare...
# start by loading up a data.frame
incidents <- read.delim("C:/Temp/incidents1.txt", header=F)
View(incidents)
colnames(incidents) <- c("week", "category", "P1", "P2", "P3")
nrow(incidents)
nrow(unique(incidents))
# 2587 rows in either case above
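# as an aside, base R can express the "keep the latest row per week/category" idea directly -
# a minimal sketch, assuming later rows really are the more recent ones (this is equivalent
# to the max(index) data.table approach worked through further down)
latestIncidents <- incidents[!duplicated(incidents[, c("week", "category")], fromLast=TRUE), ]
nrow(latestIncidents) # should come out at one row per week/category combination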
# plot out the data - to do this we need to reshape so that the P1, P2, P3 columns become two columns:
# one giving the type of the incident, the other the count of that incident type
library(reshape)
incidentsMelt <- melt(incidents[,c("week", "category", "P1", "P2", "P3")],id=c("week", "category"))
library(ggplot2)
qplot(as.Date(week,"%d/%m/%Y"), value, data=incidentsMelt, colour = variable)
# time to sort this data out
library(data.table)
incidentsDT <- data.table(incidents)
setkey(incidentsDT, week, category)
# data.table gives speedy queries when data > 100k rows (not the case here)
# fast binary scan across keyed data - notice the J to denote a select
incidentsDT[J("28/05/2012", "some categoryA")]
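# (J(), list() and .() are interchangeable in the i position here, so the same keyed lookup
#  can also be written as)
incidentsDT[.("28/05/2012", "some categoryA")]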
# a vector scan this time, but it allows wildcard/regex matching via %like%
incidentsDT[(week=="28/05/2012" & category %like% "Application Delvry")]
#
# let's try the mean approach to dealing with repeated, but differing, entries
#
# note: <- does not copy a data.table, so the := below also adds these columns to incidentsDT;
# copy() would avoid that if it mattered here
incidentsDT2 <- incidentsDT
incidentsDT2[, MeanP1:=mean(P1), by=list(week, category)]
incidentsDT2[, MeanP2:=mean(P2), by=list(week, category)]
incidentsDT2[, MeanP3:=mean(P3), by=list(week, category)]
head(incidentsDT2)
nrow(incidentsDT2) # 2587 rows
tables()
sapply(incidentsDT2, class)
# selecting columns of a data frame
head(incidents[,c('week', 'category')])
# selecting columns of a data table - use list instead of c
tail(incidentsDT2[, list(week, category)])
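# .() is data.table shorthand for list(), so the line above can equally be written as
tail(incidentsDT2[, .(week, category)])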
nrow(unique(incidentsDT2[, list(week, category, MeanP1, MeanP2, MeanP3)])) # 2381 rows
#
# let's try a data.table aggregation approach
# look for the most recent value of P1/P2/P3, based on the max index within a group by week & category
#
head(incidents)
# add an index value for the row
incidents$index <- as.numeric(rownames(incidents))
head(incidents)
incidentsDT3 <- data.table(incidents)
head(incidentsDT3)
# what does this aggregation look like...
head(incidentsDT3[,list(index1=max(index), P1, P2, P3), by=list(week,category)])
# giving 2587 rows...
nrow(incidentsDT3[,list(max(index), P1, P2, P3), by=list(week,category)])
# and 2416 unique rows - why is that different from the 2381 obtained above?
# because duplicate week/category rows can still differ in their P1/P2/P3 values -
# we haven't yet filtered each group down to its latest row, and there's no SQL-style HAVING clause to do it in one go
# a better view is this
head(incidentsDT3[,list(index, index1=max(index), P1, P2, P3), by=list(week,category)])
# notice that index1 is repeated with the max value for each row:
# week category index index1 P1 P2 P3
# 1: 28/05/2012 some categoryA 1 1156 NA NA 1
# 2: 28/05/2012 some categoryA 1156 1156 NA NA 3
#
# but you can't just include an expression such as index==index1 in the i position
# because i is evaluated before j
# bugger, does this force me to take a two-step approach?
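# (a one-step alternative, as a sketch: .SD holds each group's rows, so taking the row with
#  the largest index per week/category avoids building the intermediate table at all)
head(incidentsDT3[, .SD[which.max(index)], by=list(week, category)])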
incidentsDT4 <- incidentsDT3[,list(index, index1=max(index), P1, P2, P3), by=list(week,category)]
head(incidentsDT4[index == index1, list(index, index1, P1, P2, P3), by = list(week, category)])
# and the row count is 2381, matching the result from the mean approach
nrow(incidentsDT4[index == index1, list(index, index1, P1, P2, P3), by = list(week, category)])
#
# let's use up some more memory and create a cleaner data table
#
incidentsDT5 <- incidentsDT4[index == index1, list(P1, P2, P3), by = list(week, category)]
# should I make NA 0? Maybe not
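# (if we did want zeros instead of NAs, a minimal sketch using set(), which assigns by reference -
#  left commented out so the NA handling below stays as discussed)
# for (col in c("P1", "P2", "P3")) set(incidentsDT5, which(is.na(incidentsDT5[[col]])), col, 0L)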
#
# let's make some weekly histograms
#
hist(incidentsDT5[,list(SumP1=sum(P1)),by=week][,SumP1]) # oops - NA + a valid number = NA
hist(incidentsDT5[,list(SumP2=sum(P2)),by=week][,SumP2]) # ditto
hist(incidentsDT5[,list(SumP3=sum(P3)),by=week][,SumP3]) # this works because there are no NAs
hist(incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,SumP1]) # lots of zeros in here - might be sensible to model - does zero-inflated regression apply here? (see sketch below)
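# (a sketch of that zero-inflated idea: model the weekly P1 counts with a zero-inflated Poisson
#  against a simple time trend - this assumes the pscl package is available; it isn't used
#  anywhere else in this analysis)
library(pscl)
weeklyP1 <- incidentsDT5[, list(SumP1=sum(P1, na.rm=T)), by=week]
weeklyP1$days <- as.numeric(as.Date(weeklyP1$week, "%d/%m/%Y"))
summary(zeroinfl(SumP1 ~ days, data=weeklyP1, dist="poisson"))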
# how about we change NA to allow summation
# remove any 0 values
# and if we're doing this, then why not create some more clean data tables
WeeklySumP1=incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][SumP1>0,list(days=as.integer(as.Date(week,"%d/%m/%Y")), week=as.Date(week,"%d/%m/%Y"),SumP1)]
WeeklySumP2=incidentsDT5[,list(SumP2=sum(P2, na.rm=T)),by=week][SumP2>0,list(days=as.numeric(as.Date(week,"%d/%m/%Y")), week=as.Date(week,"%d/%m/%Y"),SumP2)]
WeeklySumP3=incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][SumP3>0,list(days=as.numeric(as.Date(week,"%d/%m/%Y")), week=as.Date(week,"%d/%m/%Y"),SumP3)]
hist(WeeklySumP1[,SumP1])
hist(WeeklySumP2[,SumP2])
hist(WeeklySumP3[,SumP3])
head(hist(incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,SumP1]))
# plot out the weekly P1, P2, P3 so we can see how they trend
qplot(week,SumP1,data=incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,list(week=as.Date(week,"%d/%m/%Y"),SumP1)])
# interesting - the above includes the 0 value points, which we don't want to see
qplot(week,SumP1,data=incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][SumP1>0,list(week=as.Date(week,"%d/%m/%Y"),SumP1)])
qplot(week,SumP2,data=incidentsDT5[,list(SumP2=sum(P2, na.rm=T)),by=week][SumP2>0,list(week=as.Date(week,"%d/%m/%Y"),SumP2)])
qplot(week,SumP3,data=incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][SumP3>0,list(week=as.Date(week,"%d/%m/%Y"),SumP3)])
#
# the P3 plot shows the distribution of points climbing in a linear fashion
# what I think should be done is evaluate the distribution at 3 points in time,
# fit to a Poisson or whatever best fits all 3,
# then apply a linear contribution to make this grow appropriately over time
# (a rough sketch of that idea follows)
#
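# (sketch of "evaluate the distribution at 3 points in time": split the weekly P3 sums into
#  three consecutive windows and fit a Poisson to each, to see whether the shape stays stable
#  while the mean drifts upward - the window split is my assumption, not something done below)
library(MASS)
p3ByWindow <- split(WeeklySumP3$SumP3, cut(seq_along(WeeklySumP3$SumP3), 3, labels=c("early", "middle", "late")))
lapply(p3ByWindow, function(x) fitdistr(x, "Poisson"))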
# let's try some dead salmon views :)
# (make these data.tables a little easier... and note that vwReg wants them coerced to data.frames)
# so I will now rudely interrupt my code with a function
##############################################################################################
# Copyright 2012 Felix Schönbrodt
# All rights reserved.
#
# FreeBSD License
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation
# are those of the authors and should not be interpreted as representing
# official policies, either expressed or implied, of the copyright
# holder.
# Version history:
# 0.1: original code
# 0.1.1: changed license to FreeBSD; re-established compatibility with ggplot2 (new version 0.9.2)
## Visually weighted regression / Watercolor plots
## Idea: Solomon Hsiang, with additional ideas from many blog commenters
# B = number of bootstrapped smoothers
# shade: plot the shaded confidence region?
# shade.alpha: should the CI shading fade out at the edges? (by reducing alpha; 0 = no alpha decrease, 0.1 = medium alpha decrease, 0.5 = strong alpha decrease)
# spag: plot spaghetti lines?
# spag.color: color of spaghetti lines
# mweight: should the median smoother be visually weighted?
# show.lm: should the linear regression line be plotted?
# show.CI: should the 95% CI limits be plotted?
# show.median: should the median smoother be plotted?
# median.col: color of the median smoother
# shape: shape of points
# method: the fitting function for the spaghettis; default: loess
# bw = TRUE: define a default b&w palette
# slices: number of slices in x and y direction for the shaded region. Higher numbers make a smoother plot, but take longer to draw. I wouldn't go beyond 500
# palette: provide a custom color palette for the watercolors
# ylim: restrict range of the watercoloring
# quantize: either "continuous", or "SD". In the latter case, we get three color regions for 1, 2, and 3 SD (an idea of John Mashey)
# add: if add == FALSE, a new ggplot is returned. If add == TRUE, only the elements are returned, which can be added to an existing ggplot (with the '+' operator)
# ...: further parameters passed to the fitting function; in the case of loess, for example, "span = .9", or "family = 'symmetric'"
vwReg <- function(formula, data, title="", B=1000, shade=TRUE, shade.alpha=.1, spag=FALSE, spag.color="darkblue", mweight=TRUE, show.lm=FALSE, show.median = TRUE, median.col = "white", shape = 21, show.CI=FALSE, method=loess, bw=FALSE, slices=200, palette=colorRampPalette(c("#FFEDA0", "#DD0000"), bias=2)(20), ylim=NULL, quantize = "continuous", add=FALSE, ...) {
  IV <- all.vars(formula)[2]
  DV <- all.vars(formula)[1]
  data <- na.omit(data[order(data[, IV]), c(IV, DV)])
  if (bw == TRUE) {
    palette <- colorRampPalette(c("#EEEEEE", "#999999", "#333333"), bias=2)(20)
  }
  print("Computing bootstrapped smoothers ...")
  newx <- data.frame(seq(min(data[, IV]), max(data[, IV]), length=slices))
  colnames(newx) <- IV
  l0.boot <- matrix(NA, nrow=nrow(newx), ncol=B)
  l0 <- method(formula, data)
  for (i in 1:B) {
    data2 <- data[sample(nrow(data), replace=TRUE), ]
    data2 <- data2[order(data2[, IV]), ]
    if (class(l0)=="loess") {
      m1 <- method(formula, data2, control = loess.control(surface = "i", statistics="a", trace.hat="a"), ...)
    } else {
      m1 <- method(formula, data2, ...)
    }
    l0.boot[, i] <- predict(m1, newdata=newx)
  }
  # compute median and CI limits of bootstrap
  library(plyr)
  library(reshape2)
  CI.boot <- adply(l0.boot, 1, function(x) quantile(x, prob=c(.025, .5, .975, pnorm(c(-3, -2, -1, 0, 1, 2, 3))), na.rm=TRUE))[, -1]
  colnames(CI.boot)[1:10] <- c("LL", "M", "UL", paste0("SD", 1:7))
  CI.boot$x <- newx[, 1]
  CI.boot$width <- CI.boot$UL - CI.boot$LL
  # scale the CI width to the range 0 to 1 and flip it (bigger numbers = narrower CI)
  CI.boot$w2 <- (CI.boot$width - min(CI.boot$width))
  CI.boot$w3 <- 1-(CI.boot$w2/max(CI.boot$w2))
  # convert bootstrapped spaghettis to long format
  b2 <- melt(l0.boot)
  b2$x <- newx[,1]
  colnames(b2) <- c("index", "B", "value", "x")
  library(ggplot2)
  library(RColorBrewer)
  # Construct ggplot
  # All plot elements are constructed as a list, so they can be added to an existing ggplot
  # if add == FALSE: provide the basic ggplot object
  p0 <- ggplot(data, aes_string(x=IV, y=DV)) + theme_bw()
  # initialize elements with NULL (if they are defined, they are overwritten with something meaningful)
  gg.tiles <- gg.poly <- gg.spag <- gg.median <- gg.CI1 <- gg.CI2 <- gg.lm <- gg.points <- gg.title <- NULL
  if (shade == TRUE) {
    quantize <- match.arg(quantize, c("continuous", "SD"))
    if (quantize == "continuous") {
      print("Computing density estimates for each vertical cut ...")
      flush.console()
      if (is.null(ylim)) {
        min_value <- min(min(l0.boot, na.rm=TRUE), min(data[, DV], na.rm=TRUE))
        max_value <- max(max(l0.boot, na.rm=TRUE), max(data[, DV], na.rm=TRUE))
        ylim <- c(min_value, max_value)
      }
      # vertical cross-sectional density estimate
      d2 <- ddply(b2[, c("x", "value")], .(x), function(df) {
        res <- data.frame(density(df$value, na.rm=TRUE, n=slices, from=ylim[1], to=ylim[2])[c("x", "y")])
        #res <- data.frame(density(df$value, na.rm=TRUE, n=slices)[c("x", "y")])
        colnames(res) <- c("y", "dens")
        return(res)
      }, .progress="text")
      maxdens <- max(d2$dens)
      mindens <- min(d2$dens)
      d2$dens.scaled <- (d2$dens - mindens)/maxdens
      ## Tile approach
      d2$alpha.factor <- d2$dens.scaled^shade.alpha
      gg.tiles <- list(geom_tile(data=d2, aes(x=x, y=y, fill=dens.scaled, alpha=alpha.factor)), scale_fill_gradientn("dens.scaled", colours=palette), scale_alpha_continuous(range=c(0.001, 1)))
    }
    if (quantize == "SD") {
      ## Polygon approach
      SDs <- melt(CI.boot[, c("x", paste0("SD", 1:7))], id.vars="x")
      count <- 0
      d3 <- data.frame()
      col <- c(1,2,3,3,2,1)
      for (i in 1:6) {
        seg1 <- SDs[SDs$variable == paste0("SD", i), ]
        seg2 <- SDs[SDs$variable == paste0("SD", i+1), ]
        seg <- rbind(seg1, seg2[nrow(seg2):1, ])
        seg$group <- count
        seg$col <- col[i]
        count <- count + 1
        d3 <- rbind(d3, seg)
      }
      gg.poly <- list(geom_polygon(data=d3, aes(x=x, y=value, color=NULL, fill=col, group=group)), scale_fill_gradientn("dens.scaled", colours=palette, values=seq(-1, 3, 1)))
    }
  }
  print("Build ggplot figure ...")
  flush.console()
  if (spag==TRUE) {
    gg.spag <- geom_path(data=b2, aes(x=x, y=value, group=B), size=0.7, alpha=10/B, color=spag.color)
  }
  if (show.median == TRUE) {
    if (mweight == TRUE) {
      gg.median <- geom_path(data=CI.boot, aes(x=x, y=M, alpha=w3^3), size=.6, linejoin="mitre", color=median.col)
    } else {
      gg.median <- geom_path(data=CI.boot, aes(x=x, y=M), size = 0.6, linejoin="mitre", color=median.col)
    }
  }
  # Confidence limits
  if (show.CI == TRUE) {
    gg.CI1 <- geom_path(data=CI.boot, aes(x=x, y=UL), size=1, color="red")
    gg.CI2 <- geom_path(data=CI.boot, aes(x=x, y=LL), size=1, color="red")
  }
  # plain linear regression line
  if (show.lm==TRUE) {gg.lm <- geom_smooth(method="lm", color="darkgreen", se=FALSE)}
  gg.points <- geom_point(data=data, aes_string(x=IV, y=DV), size=2, shape=shape, fill="tan", color="black")
  if (title != "") {
    gg.title <- ggtitle(title)
  }
  gg.elements <- list(gg.tiles, gg.poly, gg.spag, gg.median, gg.CI1, gg.CI2, gg.lm, gg.points, gg.title, theme(legend.position="none"))
  if (add == FALSE) {
    return(p0 + gg.elements)
  } else {
    return(gg.elements)
  }
}
#######################################################
vwReg(SumP1~days, as.data.frame(WeeklySumP1), span=1, spag=T, show.lm=T, palette=brewer.pal(9,"YlGnBu")) # I like these better
vwReg(SumP2~days, as.data.frame(WeeklySumP2), span=1, spag=T, show.lm=T, palette=brewer.pal(9,"YlGnBu"))
vwReg(SumP3~days, as.data.frame(WeeklySumP3), span=1, spag=T, show.lm=T, palette=brewer.pal(9,"YlGnBu"))
#
# those P3s are increasing over time but the distribution looks the same, so let's fit a linear model and display the equation on the plot
# I've copied this handy function from http://stackoverflow.com/questions/7549694/ggplot2-adding-regression-line-equation-and-r2-on-graph
#
lm_eqn = function(m) {
  l <- list(a = format(coef(m)[1], digits = 2),
            b = format(abs(coef(m)[2]), digits = 2),
            r2 = format(summary(m)$r.squared, digits = 3));
  if (coef(m)[2] >= 0) {
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
  } else {
    eq <- substitute(italic(y) == a - b %.% italic(x)*","~~italic(r)^2~"="~r2,l)
  }
  as.character(as.expression(eq));
}
P3=incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][SumP3>0,list(week=as.Date(week,"%d/%m/%Y"),SumP3)]
ggplot(P3, aes(x=week, y=SumP3)) +
  geom_point() +
  geom_smooth(method=lm) +
  geom_text(x=as.integer(as.Date("2013-05-20")), y=70, label = lm_eqn(lm(SumP3 ~ week, P3)), parse=T)
#
# so I think the approach is this
#
# create an empty plot
# iterate over some number of trials
# each trial is a sample with replacement going out some number of weeks e.g. 3*52
# but should we account for the decreasing rate of P1s and P2s, and the increasing rate of P3s?
# suggest it's safest to keep the P1/P2 rates the same and allow for an increasing P3 rate with a linear model
# calculate the cost each week (cost of P1s + cost of P2s + cost of P3s)
# add a line to a plot
#
# but first, let's plot the historic data
#
mergedWeeklyIncidents <- merge(merge(WeeklySumP1, WeeklySumP2), WeeklySumP3)
mergedWeeklyIncidents$cost <- 100000*mergedWeeklyIncidents$SumP1 +
  10000*mergedWeeklyIncidents$SumP2 +
  1000*mergedWeeklyIncidents$SumP3
# language parsing for cbind() and c() seems easier in either case if we
# don't put expressions inside, so let's build some intermediate steps
projecting=3*52 # 3 years in week intervals
projectedWeeks = seq(max(mergedWeeklyIncidents$week), by="week", length.out=projecting)
trials=10
CostOfP1 = 100000
CostOfP2 = 10000
CostOfP3 = 1000
# x positions for the 1000 sampled weeks below: assume they simply follow on
# from the end of the historic series
xs = nrow(mergedWeeklyIncidents) + 1:1000
x2 = max(xs)
plot(mergedWeeklyIncidents$cost, type="l", col=rgb(0,1,0), xlim=c(0,x2), xlab="week", ylab="Cost/week")
for(i in 1:trials) {
  P1Cost=CostOfP1*sample(incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,SumP1], 1000, replace=T)
  P2Cost=CostOfP2*sample(incidentsDT5[,list(SumP2=sum(P2, na.rm=T)),by=week][,SumP2], 1000, replace=T)
  P3Cost=CostOfP3*sample(incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][,SumP3], 1000, replace=T)
  projectedWeeklyCost <- data.frame(cbind(x=xs, y=P1Cost+P2Cost+P3Cost))
  lines(projectedWeeklyCost, col=rgb(0,0,1, alpha=0.1))
}
# add an average line
avgWeeklyCost = CostOfP1*mean(incidentsDT5[,list(SumP1=sum(P1, na.rm=T)),by=week][,SumP1]) +
  CostOfP2*mean(incidentsDT5[,list(SumP2=sum(P2, na.rm=T)),by=week][,SumP2]) +
  CostOfP3*mean(incidentsDT5[,list(SumP3=sum(P3, na.rm=T)),by=week][,SumP3])
lines(data.frame(cbind(x=xs, y=rep(avgWeeklyCost,1000))), col=rgb(1,0,0))
#
# that worked but it's clear that the distributions of P1s, P2s and P3s are changing with
# time - the P1s and P2s are decreasing a bit but the P3s are increasing significantly
# how about we fit the weekly sum of the P3s to a linear model and extrapolate out?
#
# actually, how about we subtract the linear trend off and redo the distribution, then take our
# samples and add the linear contribution back in
# this way, we should get a cleaner distribution and better bootstrapping for future values
#
# just to double check the validity of this, first I want to plot estimated values against
# the original
t <- WeeklySumP3
t$est <- 0.01042*t$days - 121.30926
ggplot(t, aes(days, SumP3)) + geom_point() + geom_point(data=t, aes(days, est))
# yep, the lm was correct
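# (rather than hard-coding 0.01042 / -121.30926, the same numbers can be pulled straight from the
#  fit - a small sketch, assuming the coefficients above came from lm(SumP3 ~ days) on WeeklySumP3)
p3lm <- lm(SumP3 ~ days, data=WeeklySumP3)
coef(p3lm) # intercept and slope, usable in place of the literals below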
# so now I'm going to create a new data set with the linear contribution subtracted
# and baselined at the beginning
t$base <- t$SumP3 - (t$days-min(t$days))*0.01042
# check we're right
ggplot(t, aes(days, SumP3)) + geom_point() +
  geom_point(data=t, aes(days, est, colour='red')) +
  geom_point(data=t, aes(days, base, colour='blue'))
# weird, the colours got mixed up - the aesthetics must be getting changed (see note below)
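# (the mixed-up colours happen because colour='red' / colour='blue' sit inside aes():
#  inside aes() those strings are treated as data values mapped to the default palette,
#  not as literal colours - moving colour outside aes() gives the intended result)
ggplot(t, aes(days, SumP3)) + geom_point() +
  geom_point(data=t, aes(days, est), colour='red') +
  geom_point(data=t, aes(days, base), colour='blue')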
# let's try fitting that adjusted distribution to something from which we can calculate values
library(MASS)
# we'll try a few... :) (the fill= inside aes() below has the same colour-mapping quirk noted above)
fitdistr(t$SumP3, 'Poisson')
ggplot(data=data.frame(cbind(original=t$SumP3, predict=rpois(1000,37)))) +
  geom_histogram(aes(original, fill="red", alpha=0.2, binwidth=10)) +
  geom_histogram(aes(predict, fill="blue", alpha=0.2, binwidth=10))
fitdistr(t$SumP3, 'weibull')
ggplot(data=data.frame(cbind(original=t$SumP3, predict=rweibull(1000, 3.16, 41.2)))) +
  geom_histogram(aes(x=original, fill="red", alpha=0.2, binwidth=10)) +
  geom_histogram(aes(x=predict, fill="blue", alpha=0.2, binwidth=10))
fitdistr(t$SumP3, 'log-normal')
ggplot(data=data.frame(cbind(original=t$SumP3, predict=rlnorm(1000, meanlog=3.5, sdlog=0.42)))) +
  geom_histogram(colour="red", aes(x=original, alpha=0.2, binwidth=10)) +
  geom_histogram(colour="blue", aes(x=predict, alpha=0.2, binwidth=10))
# nice, but perhaps better to just bootstrap from the real data and apply the linear adjustment
# first, replot the historical data - note that I originally had this graph generated
# from a vector representation of the cost - this had the data sorted in reverse order
# lesson - always make sure you have x and y!
plot(mergedWeeklyIncidents$week, mergedWeeklyIncidents$cost, type="l", col=rgb(0,1,0), xlim=c(min(mergedWeeklyIncidents$week), max(projectedWeeks)), xlab="week", ylab="Cost/week")
# then project out a number of weeks using the variable 'projecting'
for (trial in 1:trials) {
  projectedWeeklyCost = data.frame(week=projectedWeeks,
                                   P1Cost = CostOfP1*sample(WeeklySumP1$SumP1, length(projectedWeeks), replace=T),
                                   P2Cost = CostOfP2*sample(WeeklySumP2$SumP2, length(projectedWeeks), replace=T))
  # this next one is a little tricky - the sample gets a linear contribution added
  projectedWeeklyCost$P3SampledBase <- sample(t$base, nrow(projectedWeeklyCost), replace=T)
  projectedWeeklyCost$P3LinearAdjusted <- projectedWeeklyCost$P3SampledBase + (projectedWeeklyCost$week-min(mergedWeeklyIncidents$week))*0.01042
  projectedWeeklyCost$P3Cost <- CostOfP3*projectedWeeklyCost$P3LinearAdjusted
  projectedWeeklyCost$TotalWeeklyCost <- projectedWeeklyCost$P1Cost + projectedWeeklyCost$P2Cost + projectedWeeklyCost$P3Cost
  lines(projectedWeeklyCost$week, projectedWeeklyCost$TotalWeeklyCost, col=rgb(0,0,1, alpha=0.2))
}
#
# that worked but I want to make this work with ggplot
#
library(scales)
p <- ggplot() + scale_y_continuous(labels=dollar, "Weekly Cost") + xlab("Year") + theme(text=element_text(size=20))
p <- p + geom_line(data=mergedWeeklyIncidents, colour="green", aes(x = week, y = cost)) +
  xlim(min(mergedWeeklyIncidents$week),max(projectedWeeklyCost$week))
# then project out a number of weeks using the variable 'projecting'
for (trial in 1:trials) {
  projectedWeeklyCost = data.frame(week=projectedWeeks,
                                   P1Cost = CostOfP1*sample(WeeklySumP1$SumP1, length(projectedWeeks), replace=T),
                                   P2Cost = CostOfP2*sample(WeeklySumP2$SumP2, length(projectedWeeks), replace=T))
  # this next one is a little tricky - the sample gets a linear contribution added
  projectedWeeklyCost$P3SampledBase <- sample(t$base, nrow(projectedWeeklyCost), replace=T)
  projectedWeeklyCost$P3LinearAdjusted <- projectedWeeklyCost$P3SampledBase + (projectedWeeklyCost$week-min(mergedWeeklyIncidents$week))*0.01042
  projectedWeeklyCost$P3Cost <- CostOfP3*projectedWeeklyCost$P3LinearAdjusted
  projectedWeeklyCost$TotalWeeklyCost <- projectedWeeklyCost$P1Cost + projectedWeeklyCost$P2Cost + projectedWeeklyCost$P3Cost
  p <- p + geom_line(data=projectedWeeklyCost, colour="blue", alpha=0.1, linewidth=0.1, aes(x = week, y = as.integer(TotalWeeklyCost)))
}
print(p)
# and a histogram of the projected costs
qplot(as.numeric(TotalWeeklyCost), data=projectedWeeklyCost, geom="histogram", binwidth=20000) + scale_x_continuous(labels=dollar, "Weekly Cost")