Skip to content

Instantly share code, notes, and snippets.

@danhammer
Created December 20, 2013 19:30
Show Gist options
  • Save danhammer/8060073 to your computer and use it in GitHub Desktop.
Save danhammer/8060073 to your computer and use it in GitHub Desktop.
library(zoo)
library(stats)
library(dtw)
library(timeDate)
library(lubridate)
library(reshape)
library(randomForest)
clean.pixels <- function(filename, dir) {
## accepts output from Pavel's script and returns a data frame with
## month, year, and number of registered pixels; properly sorted and
## named for merge with car sales data
filepath <- paste(dir, filename, sep = "/")
table <- read.table(filepath, sep=",", header = TRUE)
names(table) <- c("unit", "lon", "lat", "date", "count")
d <- as.Date(timeDate(table$date))
pixels <- aggregate(table$count, by = list(month(d), year(d)), FUN = mean)
# clean final data frame: drop old year and month variables, rename
# new variables appropriately
names(pixels) <- c("month", "year", "count")
return(pixels)
}
analyze.series <- function(ndmi) {
## Accepts a data frame with year, month, and ndmi index (column
## name "num") and returns a results list with the decomposed
## series for car sales and supplied NDMI. The supplied argument
## is the output of `clean.series()`
## Example: res <- analyze.series(clean.pixels("newark.txt"))
## read in sales data
sales <- read.table("../../data/car-sales.txt", sep="\t")
names(sales) <- c("month", "year", "val")
sales$month <- match(sales$month, month.name) # replace month name with number
data <- merge(sales, ndmi, by=c("year", "month"))
.toSeries <- function(val.name) {
## local function to generate a decomposed time series
series <- ts(data[[val.name]], frequency = 12, start = c(2000,01))
return(decompose(series))
}
res <- list()
res$sales <- .toSeries("val")
res$ndmi <- .toSeries("num")
return(res)
}
plot.corr <- function(res.list, filename="../../data/test.png") {
## Plot the correlation between the NDMI index and car sales; save
## to supplied file, if given.
ndmi <- na.omit(as.matrix(res.list$ndmi$trend))
sales <- na.omit(as.matrix(res.list$sales$trend))
## png(filename)
plot(ndmi, sales, pch=20, col = "blue",
xlab = "Metallic index for facility",
ylab = "Monthly car sales (in thousands)")
abline(lm(sales ~ ndmi), col="red")
## dev.off()
}
create.names <- function(ncols) {
## returns a list of column names prefaced by `v`
return(sapply(1:ncols, function(x) { paste("v", x, sep = "") }))
}
merge.indices <- function(dir) {
## Accepts a collection of file names and returns a single data
## frame of merged indices.
coll <- list.files(dir)
data.list <- lapply(coll, function(x) { clean.pixels(x, dir) })
merged <- data.list[1]
for (d in data.list[2:length(data.list)]) {
suppressWarnings(merged <- merge(merged, d, by=c("year", "month"), all=TRUE))
names(merged) <- c("year", "month", create.names(ncol(merged) - 2))
}
sorted <- merged[order(merged$year, merged$month), ]
return(sorted)
}
super.index <- function(dir) {
## returns a data frame with the average of all NDMI indices for
## the supplied files
data <- merge.indices(dir)
m <- apply(data[,3:ncol(data)], 1, function(x) { mean(x, na.rm = TRUE) })
df <- data.frame(year = data$year, month = data$month, num = m)
return(df)
}
pca.analysis <- function(dir) {
data <- merge.indices(dir)
## predict the second principal component
p2 <- predict(princomp(data[,-(1:2)]))[,1]
ndmi <- cbind(data[ ,1:2], p2)
names(ndmi) <- c("year", "month", "num")
## reread table TODO: refactor
sales <- read.table("../../data/car-sales.csv", sep=",")
names(sales) <- c("month", "year", "val")
data <- merge(sales, ndmi, by=c("year", "month"))
## png("../../data/pca-test.png")
plot(data$num, data$val, pch=20, col = "blue",
xlab = "PCA decomp for metallic index",
ylab = "Monthly car sales (in thousands)")
abline(lm(data$val ~ data$num), col="red")
## dev.off()
## This makes sense, after removing time component
idx <- 1:length(data$val)
y <- as.numeric(as.character(data$val))
x <- data$num
print(summary(lm(y ~ x + idx)))
return(data)
}
predict.econ <- function(ndmi.dir, reference.file) {
## Compare and graph predictions for NDMI and reference data sets;
## an example would be the car facilities and the car sales:
## ndmi.dir <- "../../data/intermediate"
## reference.file <- "../../data/car-sales.csv"
## create data frame
ndmi <- merge.indices(ndmi.dir)
econ <- read.table(reference.file, sep=",", header = TRUE)
names(econ) <- c("month", "year", "econ")
data <- merge(econ, ndmi, by=c("year", "month"))
.clean <- function(coll) {
## ensure that all data is numeric, hard force
return(as.numeric(as.character(coll)))
}
## clean and sort data, cutting off unn
cleaned <- data.frame(apply(data, 2, .clean))
sorted <- cleaned[order(cleaned$year, cleaned$month), ]
## interpolate sorted data to ensure no missing data. TODO:
## extrapolate data to ensure equal lengths across facilities. If
## there are NAs at the beginning or end of the series `na.approx`
## drops the values (hence the warming message with `cbind`).
interp <- apply(sorted, 2, na.approx)
suppressWarnings(df <- do.call(cbind, interp))
## create a random forest model to identify the important
## facilities, where `cols` indicate the column names of the
## facilities that are more important than the median
rf <- randomForest(econ ~ ., data = df, mtry = 2,
importance = TRUE, do.trace = 100)
decimp <- sort(rf$importance[,1], dec = TRUE)
cols <- names(decimp[decimp > median(decimp)])
## create an data frame for analysis, with the economic series and
## the important facilities. generate predictions.
analysis.df <- df[,c("econ", cols)]
m <- lm(econ ~ ., data = data.frame(analysis.df))
preds <- predict(m)
return(list(year = data$year, month = data$month,
econ = interp$econ, prediction = preds, rf = rf))
}
grab.name <- function(s) {
## accepts a file path and returns the name of the file, sans file
## path or extension
fname <- tail(strsplit(s, "/")[[1]], n = 1)
name <- strsplit(fname, "\\.")[[1]][1]
return(name)
}
graph.predictions <- function(ndmi.dir, reference.file) {
## accepts the results list from `predict.econ` and graphs the
## predictions
res.list <- predict.econ(ndmi.dir, reference.file)
name <- grab.name(reference.file)
## graph time series predictions
png.name <- paste0("../../img/", name, "-preds.png")
png(png.name)
start <- c(min(res.list$year), min(res.list$month))
econ <- ts(res.list$econ, frequency = 12, start = start)
pred <- ts(res.list$prediction, frequency = 12, start = start)
limits <- c(min(c(econ, pred)), max(c(econ, pred)))
ts.plot(econ, pred,
gpars = list(
col = c("blue", "red"),
xlab = "",
ylab = "monthly car sales (in thousands)"
)
)
dev.off()
## graph feature importance
png.name <- paste0("../../img/", name, "-importance.png")
png(png.name)
decimp <- sort(rf$importance[,1], dec = TRUE)
barplot(decimp, border = NA, names.arg = NA, ylab = "importance (MSE)")
dev.off()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment