danhammer · December 20, 2013 19:30
diff --git a/ts-analysis.R b/ts-analysis.R
 library(zoo)
 library(stats)
 library(dtw)
 library(timeDate)
 library(lubridate)
 library(reshape)
 library(randomForest)

 clean.pixels <- function(filename, dir) {
  ## accepts output from Pavel's script and returns a data frame with
  ## month, year, and number of registered pixels; properly sorted and
  ## named for merge with car sales data
  filepath <- paste(dir, filename, sep = "/")
  table <- read.table(filepath, sep=",", header = TRUE)
  names(table) <- c("unit", "lon", "lat", "date", "count")
  d <- as.Date(timeDate(table$date))

  pixels <- aggregate(table$count, by = list(month(d), year(d)), FUN = mean)

  # clean final data frame: drop old year and month variables, rename
  # new variables appropriately
  names(pixels) <- c("month", "year", "count")
  return(pixels)
 }

 analyze.series <- function(ndmi) {
    ## Accepts a data frame with year, month, and ndmi index (column
    ## name "num") and returns a results list with the decomposed
    ## series for car sales and supplied NDMI.  The supplied argument
    ## is the output of `clean.series()`

    ## Example: res <- analyze.series(clean.pixels("newark.txt"))
    
    ## read in sales data
    sales <- read.table("../../data/car-sales.txt", sep="\t")
    names(sales) <- c("month", "year", "val")
    sales$month <- match(sales$month, month.name) # replace month name with number
    data <- merge(sales, ndmi, by=c("year", "month"))

    .toSeries <- function(val.name) {
        ## local function to generate a decomposed time series
        series <- ts(data[[val.name]], frequency = 12, start = c(2000,01))
        return(decompose(series))
    }

    res <- list()
    res$sales <- .toSeries("val")
    res$ndmi <- .toSeries("num")
    return(res)
 }

 plot.corr <- function(res.list, filename="../../data/test.png") {
    ## Plot the correlation between the NDMI index and car sales; save
    ## to supplied file, if given.
    ndmi  <- na.omit(as.matrix(res.list$ndmi$trend))
    sales <- na.omit(as.matrix(res.list$sales$trend))

    ## png(filename)
    plot(ndmi, sales, pch=20, col = "blue",
         xlab = "Metallic index for facility",
         ylab = "Monthly car sales (in thousands)")
    abline(lm(sales ~ ndmi), col="red")
    ## dev.off()
 }

 create.names <- function(ncols) {
    ## returns a list of column names prefaced by `v`
    return(sapply(1:ncols, function(x) { paste("v", x, sep = "") }))
 }

 merge.indices <- function(dir) {
    ## Accepts a collection of file names and returns a single data
    ## frame of merged indices.
    coll <- list.files(dir)
    data.list <- lapply(coll, function(x) { clean.pixels(x, dir) })

    merged <- data.list[1]
    for (d in data.list[2:length(data.list)]) {
        suppressWarnings(merged <- merge(merged, d, by=c("year", "month"), all=TRUE))
        names(merged) <- c("year", "month", create.names(ncol(merged) - 2))
    }

    sorted <- merged[order(merged$year, merged$month), ]
    return(sorted)
 }

 super.index <- function(dir) {
    ## returns a data frame with the average of all NDMI indices for
    ## the supplied files
    data <- merge.indices(dir)
    m <- apply(data[,3:ncol(data)], 1, function(x) { mean(x, na.rm = TRUE) })
    df <- data.frame(year = data$year, month = data$month, num = m)
    return(df)
 }

 pca.analysis <- function(dir) {
    data <- merge.indices(dir)
    ## predict the second principal component
    p2 <- predict(princomp(data[,-(1:2)]))[,1]
    ndmi <- cbind(data[ ,1:2], p2)
    names(ndmi) <- c("year", "month", "num")

    ## reread table TODO: refactor
    sales <- read.table("../../data/car-sales.csv", sep=",")
    names(sales) <- c("month", "year", "val")

    data <- merge(sales, ndmi, by=c("year", "month"))
    ## png("../../data/pca-test.png")
    plot(data$num, data$val, pch=20, col = "blue",
         xlab = "PCA decomp for metallic index",
         ylab = "Monthly car sales (in thousands)")
    abline(lm(data$val ~ data$num), col="red")
    ## dev.off()

    ## This makes sense, after removing time component
    idx <- 1:length(data$val)
    y <- as.numeric(as.character(data$val))
    x <- data$num
    print(summary(lm(y ~ x + idx)))
    
    return(data)
 }

 predict.econ <- function(ndmi.dir, reference.file) {
    ## Compare and graph predictions for NDMI and reference data sets;
    ## an example would be the car facilities and the car sales:
    
    ## ndmi.dir <- "../../data/intermediate"
    ## reference.file <- "../../data/car-sales.csv"

    ## create data frame
    ndmi <- merge.indices(ndmi.dir)
    econ <- read.table(reference.file, sep=",", header = TRUE)
    names(econ) <- c("month", "year", "econ")
    data <- merge(econ, ndmi, by=c("year", "month"))

    .clean <- function(coll) {
        ## ensure that all data is numeric, hard force
        return(as.numeric(as.character(coll)))
    }

    ## clean and sort data, cutting off unn
    cleaned <- data.frame(apply(data, 2, .clean))
    sorted <- cleaned[order(cleaned$year, cleaned$month), ]

    ## interpolate sorted data to ensure no missing data.  TODO:
    ## extrapolate data to ensure equal lengths across facilities. If
    ## there are NAs at the beginning or end of the series `na.approx`
    ## drops the values (hence the warming message with `cbind`).
    interp <- apply(sorted, 2, na.approx)
    suppressWarnings(df <- do.call(cbind, interp))

    ## create a random forest model to identify the important
    ## facilities, where `cols` indicate the column names of the
    ## facilities that are more important than the median

    rf <- randomForest(econ ~ ., data = df, mtry = 2,
                       importance = TRUE, do.trace = 100)
    decimp <- sort(rf$importance[,1], dec = TRUE)
    cols <- names(decimp[decimp > median(decimp)])

    ## create an data frame for analysis, with the economic series and
    ## the important facilities. generate predictions.
    analysis.df <- df[,c("econ", cols)]
    m <- lm(econ ~ ., data = data.frame(analysis.df))
    preds <- predict(m)

    return(list(year = data$year, month = data$month,
                econ = interp$econ, prediction = preds, rf = rf))

 }

 grab.name <- function(s) {
    ## accepts a file path and returns the name of the file, sans file
    ## path or extension
    fname <- tail(strsplit(s, "/")[[1]], n = 1)
    name <- strsplit(fname, "\\.")[[1]][1]
    return(name)
 }

 graph.predictions <- function(ndmi.dir, reference.file) {
    ## accepts the results list from `predict.econ` and graphs the
    ## predictions
    res.list <- predict.econ(ndmi.dir, reference.file)
    name <- grab.name(reference.file)

    ## graph time series predictions
    png.name <- paste0("../../img/", name, "-preds.png")
    png(png.name)
    start <- c(min(res.list$year), min(res.list$month))
    econ <- ts(res.list$econ,  frequency = 12, start = start)
    pred <- ts(res.list$prediction, frequency = 12, start = start)
    limits <- c(min(c(econ, pred)), max(c(econ, pred)))
    ts.plot(econ, pred,
            gpars = list(
                col = c("blue", "red"),
                xlab = "",
                ylab = "monthly car sales (in thousands)"
                )
            )
    dev.off()

    ## graph feature importance
    png.name <- paste0("../../img/", name, "-importance.png")
    png(png.name)
    decimp <- sort(rf$importance[,1], dec = TRUE)
    barplot(decimp, border = NA, names.arg = NA, ylab = "importance (MSE)")
    dev.off()

 }
	library(zoo)
	library(stats)
	library(dtw)
	library(timeDate)
	library(lubridate)
	library(reshape)
	library(randomForest)

	clean.pixels <- function(filename, dir) {
	## accepts output from Pavel's script and returns a data frame with
	## month, year, and number of registered pixels; properly sorted and
	## named for merge with car sales data
	filepath <- paste(dir, filename, sep = "/")
	table <- read.table(filepath, sep=",", header = TRUE)
	names(table) <- c("unit", "lon", "lat", "date", "count")
	d <- as.Date(timeDate(table$date))

	pixels <- aggregate(table$count, by = list(month(d), year(d)), FUN = mean)

	# clean final data frame: drop old year and month variables, rename
	# new variables appropriately
	names(pixels) <- c("month", "year", "count")
	return(pixels)
	}

	analyze.series <- function(ndmi) {
	## Accepts a data frame with year, month, and ndmi index (column
	## name "num") and returns a results list with the decomposed
	## series for car sales and supplied NDMI. The supplied argument
	## is the output of `clean.series()`

	## Example: res <- analyze.series(clean.pixels("newark.txt"))

	## read in sales data
	sales <- read.table("../../data/car-sales.txt", sep="\t")
	names(sales) <- c("month", "year", "val")
	sales$month <- match(sales$month, month.name) # replace month name with number
	data <- merge(sales, ndmi, by=c("year", "month"))

	.toSeries <- function(val.name) {
	## local function to generate a decomposed time series
	series <- ts(data[[val.name]], frequency = 12, start = c(2000,01))
	return(decompose(series))
	}

	res <- list()
	res$sales <- .toSeries("val")
	res$ndmi <- .toSeries("num")
	return(res)
	}

	plot.corr <- function(res.list, filename="../../data/test.png") {
	## Plot the correlation between the NDMI index and car sales; save
	## to supplied file, if given.
	ndmi <- na.omit(as.matrix(res.list$ndmi$trend))
	sales <- na.omit(as.matrix(res.list$sales$trend))

	## png(filename)
	plot(ndmi, sales, pch=20, col = "blue",
	xlab = "Metallic index for facility",
	ylab = "Monthly car sales (in thousands)")
	abline(lm(sales ~ ndmi), col="red")
	## dev.off()
	}

	create.names <- function(ncols) {
	## returns a list of column names prefaced by `v`
	return(sapply(1:ncols, function(x) { paste("v", x, sep = "") }))
	}

	merge.indices <- function(dir) {
	## Accepts a collection of file names and returns a single data
	## frame of merged indices.
	coll <- list.files(dir)
	data.list <- lapply(coll, function(x) { clean.pixels(x, dir) })

	merged <- data.list[1]
	for (d in data.list[2:length(data.list)]) {
	suppressWarnings(merged <- merge(merged, d, by=c("year", "month"), all=TRUE))
	names(merged) <- c("year", "month", create.names(ncol(merged) - 2))
	}

	sorted <- merged[order(merged$year, merged$month), ]
	return(sorted)
	}

	super.index <- function(dir) {
	## returns a data frame with the average of all NDMI indices for
	## the supplied files
	data <- merge.indices(dir)
	m <- apply(data[,3:ncol(data)], 1, function(x) { mean(x, na.rm = TRUE) })
	df <- data.frame(year = data$year, month = data$month, num = m)
	return(df)
	}

	pca.analysis <- function(dir) {
	data <- merge.indices(dir)
	## predict the second principal component
	p2 <- predict(princomp(data[,-(1:2)]))[,1]
	ndmi <- cbind(data[ ,1:2], p2)
	names(ndmi) <- c("year", "month", "num")

	## reread table TODO: refactor
	sales <- read.table("../../data/car-sales.csv", sep=",")
	names(sales) <- c("month", "year", "val")

	data <- merge(sales, ndmi, by=c("year", "month"))
	## png("../../data/pca-test.png")
	plot(data$num, data$val, pch=20, col = "blue",
	xlab = "PCA decomp for metallic index",
	ylab = "Monthly car sales (in thousands)")
	abline(lm(data$val ~ data$num), col="red")
	## dev.off()

	## This makes sense, after removing time component
	idx <- 1:length(data$val)
	y <- as.numeric(as.character(data$val))
	x <- data$num
	print(summary(lm(y ~ x + idx)))

	return(data)
	}

	predict.econ <- function(ndmi.dir, reference.file) {
	## Compare and graph predictions for NDMI and reference data sets;
	## an example would be the car facilities and the car sales:

	## ndmi.dir <- "../../data/intermediate"
	## reference.file <- "../../data/car-sales.csv"

	## create data frame
	ndmi <- merge.indices(ndmi.dir)
	econ <- read.table(reference.file, sep=",", header = TRUE)
	names(econ) <- c("month", "year", "econ")
	data <- merge(econ, ndmi, by=c("year", "month"))

	.clean <- function(coll) {
	## ensure that all data is numeric, hard force
	return(as.numeric(as.character(coll)))
	}

	## clean and sort data, cutting off unn
	cleaned <- data.frame(apply(data, 2, .clean))
	sorted <- cleaned[order(cleaned$year, cleaned$month), ]

	## interpolate sorted data to ensure no missing data. TODO:
	## extrapolate data to ensure equal lengths across facilities. If
	## there are NAs at the beginning or end of the series `na.approx`
	## drops the values (hence the warming message with `cbind`).
	interp <- apply(sorted, 2, na.approx)
	suppressWarnings(df <- do.call(cbind, interp))

	## create a random forest model to identify the important
	## facilities, where `cols` indicate the column names of the
	## facilities that are more important than the median

	rf <- randomForest(econ ~ ., data = df, mtry = 2,
	importance = TRUE, do.trace = 100)
	decimp <- sort(rf$importance[,1], dec = TRUE)
	cols <- names(decimp[decimp > median(decimp)])

	## create an data frame for analysis, with the economic series and
	## the important facilities. generate predictions.
	analysis.df <- df[,c("econ", cols)]
	m <- lm(econ ~ ., data = data.frame(analysis.df))
	preds <- predict(m)

	return(list(year = data$year, month = data$month,
	econ = interp$econ, prediction = preds, rf = rf))

	}

	grab.name <- function(s) {
	## accepts a file path and returns the name of the file, sans file
	## path or extension
	fname <- tail(strsplit(s, "/")[[1]], n = 1)
	name <- strsplit(fname, "\\.")[[1]][1]
	return(name)
	}

	graph.predictions <- function(ndmi.dir, reference.file) {
	## accepts the results list from `predict.econ` and graphs the
	## predictions
	res.list <- predict.econ(ndmi.dir, reference.file)
	name <- grab.name(reference.file)

	## graph time series predictions
	png.name <- paste0("../../img/", name, "-preds.png")
	png(png.name)
	start <- c(min(res.list$year), min(res.list$month))
	econ <- ts(res.list$econ, frequency = 12, start = start)
	pred <- ts(res.list$prediction, frequency = 12, start = start)
	limits <- c(min(c(econ, pred)), max(c(econ, pred)))
	ts.plot(econ, pred,
	gpars = list(
	col = c("blue", "red"),
	xlab = "",
	ylab = "monthly car sales (in thousands)"
	)
	)
	dev.off()

	## graph feature importance
	png.name <- paste0("../../img/", name, "-importance.png")
	png(png.name)
	decimp <- sort(rf$importance[,1], dec = TRUE)
	barplot(decimp, border = NA, names.arg = NA, ylab = "importance (MSE)")
	dev.off()

	}