Created
December 20, 2013 19:30
-
-
Save danhammer/8060073 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(zoo) | |
library(stats) | |
library(dtw) | |
library(timeDate) | |
library(lubridate) | |
library(reshape) | |
library(randomForest) | |
clean.pixels <- function(filename, dir) { | |
## accepts output from Pavel's script and returns a data frame with | |
## month, year, and number of registered pixels; properly sorted and | |
## named for merge with car sales data | |
filepath <- paste(dir, filename, sep = "/") | |
table <- read.table(filepath, sep=",", header = TRUE) | |
names(table) <- c("unit", "lon", "lat", "date", "count") | |
d <- as.Date(timeDate(table$date)) | |
pixels <- aggregate(table$count, by = list(month(d), year(d)), FUN = mean) | |
# clean final data frame: drop old year and month variables, rename | |
# new variables appropriately | |
names(pixels) <- c("month", "year", "count") | |
return(pixels) | |
} | |
analyze.series <- function(ndmi) { | |
## Accepts a data frame with year, month, and ndmi index (column | |
## name "num") and returns a results list with the decomposed | |
## series for car sales and supplied NDMI. The supplied argument | |
## is the output of `clean.series()` | |
## Example: res <- analyze.series(clean.pixels("newark.txt")) | |
## read in sales data | |
sales <- read.table("../../data/car-sales.txt", sep="\t") | |
names(sales) <- c("month", "year", "val") | |
sales$month <- match(sales$month, month.name) # replace month name with number | |
data <- merge(sales, ndmi, by=c("year", "month")) | |
.toSeries <- function(val.name) { | |
## local function to generate a decomposed time series | |
series <- ts(data[[val.name]], frequency = 12, start = c(2000,01)) | |
return(decompose(series)) | |
} | |
res <- list() | |
res$sales <- .toSeries("val") | |
res$ndmi <- .toSeries("num") | |
return(res) | |
} | |
plot.corr <- function(res.list, filename="../../data/test.png") { | |
## Plot the correlation between the NDMI index and car sales; save | |
## to supplied file, if given. | |
ndmi <- na.omit(as.matrix(res.list$ndmi$trend)) | |
sales <- na.omit(as.matrix(res.list$sales$trend)) | |
## png(filename) | |
plot(ndmi, sales, pch=20, col = "blue", | |
xlab = "Metallic index for facility", | |
ylab = "Monthly car sales (in thousands)") | |
abline(lm(sales ~ ndmi), col="red") | |
## dev.off() | |
} | |
create.names <- function(ncols) { | |
## returns a list of column names prefaced by `v` | |
return(sapply(1:ncols, function(x) { paste("v", x, sep = "") })) | |
} | |
merge.indices <- function(dir) { | |
## Accepts a collection of file names and returns a single data | |
## frame of merged indices. | |
coll <- list.files(dir) | |
data.list <- lapply(coll, function(x) { clean.pixels(x, dir) }) | |
merged <- data.list[1] | |
for (d in data.list[2:length(data.list)]) { | |
suppressWarnings(merged <- merge(merged, d, by=c("year", "month"), all=TRUE)) | |
names(merged) <- c("year", "month", create.names(ncol(merged) - 2)) | |
} | |
sorted <- merged[order(merged$year, merged$month), ] | |
return(sorted) | |
} | |
super.index <- function(dir) { | |
## returns a data frame with the average of all NDMI indices for | |
## the supplied files | |
data <- merge.indices(dir) | |
m <- apply(data[,3:ncol(data)], 1, function(x) { mean(x, na.rm = TRUE) }) | |
df <- data.frame(year = data$year, month = data$month, num = m) | |
return(df) | |
} | |
pca.analysis <- function(dir) { | |
data <- merge.indices(dir) | |
## predict the second principal component | |
p2 <- predict(princomp(data[,-(1:2)]))[,1] | |
ndmi <- cbind(data[ ,1:2], p2) | |
names(ndmi) <- c("year", "month", "num") | |
## reread table TODO: refactor | |
sales <- read.table("../../data/car-sales.csv", sep=",") | |
names(sales) <- c("month", "year", "val") | |
data <- merge(sales, ndmi, by=c("year", "month")) | |
## png("../../data/pca-test.png") | |
plot(data$num, data$val, pch=20, col = "blue", | |
xlab = "PCA decomp for metallic index", | |
ylab = "Monthly car sales (in thousands)") | |
abline(lm(data$val ~ data$num), col="red") | |
## dev.off() | |
## This makes sense, after removing time component | |
idx <- 1:length(data$val) | |
y <- as.numeric(as.character(data$val)) | |
x <- data$num | |
print(summary(lm(y ~ x + idx))) | |
return(data) | |
} | |
predict.econ <- function(ndmi.dir, reference.file) { | |
## Compare and graph predictions for NDMI and reference data sets; | |
## an example would be the car facilities and the car sales: | |
## ndmi.dir <- "../../data/intermediate" | |
## reference.file <- "../../data/car-sales.csv" | |
## create data frame | |
ndmi <- merge.indices(ndmi.dir) | |
econ <- read.table(reference.file, sep=",", header = TRUE) | |
names(econ) <- c("month", "year", "econ") | |
data <- merge(econ, ndmi, by=c("year", "month")) | |
.clean <- function(coll) { | |
## ensure that all data is numeric, hard force | |
return(as.numeric(as.character(coll))) | |
} | |
## clean and sort data, cutting off unn | |
cleaned <- data.frame(apply(data, 2, .clean)) | |
sorted <- cleaned[order(cleaned$year, cleaned$month), ] | |
## interpolate sorted data to ensure no missing data. TODO: | |
## extrapolate data to ensure equal lengths across facilities. If | |
## there are NAs at the beginning or end of the series `na.approx` | |
## drops the values (hence the warming message with `cbind`). | |
interp <- apply(sorted, 2, na.approx) | |
suppressWarnings(df <- do.call(cbind, interp)) | |
## create a random forest model to identify the important | |
## facilities, where `cols` indicate the column names of the | |
## facilities that are more important than the median | |
rf <- randomForest(econ ~ ., data = df, mtry = 2, | |
importance = TRUE, do.trace = 100) | |
decimp <- sort(rf$importance[,1], dec = TRUE) | |
cols <- names(decimp[decimp > median(decimp)]) | |
## create an data frame for analysis, with the economic series and | |
## the important facilities. generate predictions. | |
analysis.df <- df[,c("econ", cols)] | |
m <- lm(econ ~ ., data = data.frame(analysis.df)) | |
preds <- predict(m) | |
return(list(year = data$year, month = data$month, | |
econ = interp$econ, prediction = preds, rf = rf)) | |
} | |
grab.name <- function(s) { | |
## accepts a file path and returns the name of the file, sans file | |
## path or extension | |
fname <- tail(strsplit(s, "/")[[1]], n = 1) | |
name <- strsplit(fname, "\\.")[[1]][1] | |
return(name) | |
} | |
graph.predictions <- function(ndmi.dir, reference.file) { | |
## accepts the results list from `predict.econ` and graphs the | |
## predictions | |
res.list <- predict.econ(ndmi.dir, reference.file) | |
name <- grab.name(reference.file) | |
## graph time series predictions | |
png.name <- paste0("../../img/", name, "-preds.png") | |
png(png.name) | |
start <- c(min(res.list$year), min(res.list$month)) | |
econ <- ts(res.list$econ, frequency = 12, start = start) | |
pred <- ts(res.list$prediction, frequency = 12, start = start) | |
limits <- c(min(c(econ, pred)), max(c(econ, pred))) | |
ts.plot(econ, pred, | |
gpars = list( | |
col = c("blue", "red"), | |
xlab = "", | |
ylab = "monthly car sales (in thousands)" | |
) | |
) | |
dev.off() | |
## graph feature importance | |
png.name <- paste0("../../img/", name, "-importance.png") | |
png(png.name) | |
decimp <- sort(rf$importance[,1], dec = TRUE) | |
barplot(decimp, border = NA, names.arg = NA, ylab = "importance (MSE)") | |
dev.off() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment