Skip to content

Instantly share code, notes, and snippets.

@primaryobjects
Created April 25, 2016 19:28
Show Gist options
  • Select an option

  • Save primaryobjects/69c65802315f677477a1aa0070dbf265 to your computer and use it in GitHub Desktop.

Select an option

Save primaryobjects/69c65802315f677477a1aa0070dbf265 to your computer and use it in GitHub Desktop.
Flu trend analysis with linear regression and time-series features.
# Load the training and test sets (paths are relative to the working directory).
train <- read.csv("flutrain.csv")
test <- read.csv("flutest.csv")

# Exploration: show the training rows ordered by ILI, then ordered by Queries.
train[order(train$ILI), ]
train[order(train$Queries), ]

# Histogram of ILI, then a scatter plot of log(ILI) (x) against Queries (y).
hist(train$ILI)
plot(log(train$ILI), train$Queries)

# Model 1: linear regression of log(ILI) on Queries.
train$logILI <- log(train$ILI)
fluTrend1 <- lm(logILI ~ Queries, data = train)

# Correlation between log(ILI) and Queries, followed by three candidate
# expressions in that correlation, each compared against 0.709 after rounding
# to 3 decimals.
# NOTE(review): 0.709 is presumably the R-squared of fluTrend1 -- confirm with
# summary(fluTrend1).
val <- cor(train$logILI, train$Queries)
round(val^2, 3) == 0.709
round(log(1 / val), 3) == 0.709
round(exp(-0.5 * val), 3) == 0.709

# The model predicts log(ILI), so exponentiate to get predictions on the
# original ILI scale for the test set.
pred <- exp(predict(fluTrend1, newdata = test))
results <- cbind(test, pred)

# Relative error (observed - predicted) / observed for row 11 of the test set.
# NOTE(review): which calendar week row 11 corresponds to is not visible here.
week <- results[11, ]
relativeErr <- (week$ILI - week$pred) / week$ILI

# Test-set sum of squared errors and root mean squared error.
SSE <- sum((pred - test$ILI)^2)
RMSE <- sqrt(SSE / nrow(test))
# Load zoo for its lag() method. Install it only when it is missing, so that
# re-running the script does not reinstall the package every time.
if (!requireNamespace("zoo", quietly = TRUE)) {
  install.packages("zoo")
}
library(zoo)

# Add a time-series feature: for each record, the ILI value from 2 rows
# earlier (2 weeks prior, per the original author's note; this assumes the rows
# are weekly and in chronological order). The first 2 rows have no prior value
# and are NA (na.pad = TRUE keeps the result the same length as the input).
# stats::lag is called explicitly so that a lag() from another attached package
# (e.g. dplyr) cannot be picked up by accident, and coredata() stores a plain
# numeric vector in the data frame instead of a zoo object.
train$ILILag2 <- coredata(stats::lag(zoo(train$ILI), -2, na.pad = TRUE))

# Log-transform the lagged ILI, mirroring logILI.
train$logILILag2 <- log(train$ILILag2)
plot(log(train$ILILag2), log(train$ILI))

# Model 2: log(ILI) on Queries plus the lagged log(ILI). Rows with an NA lag
# are dropped by lm()'s default NA handling.
# NOTE(review): the original author reports a better R-squared than the
# Queries-only model -- confirm with summary(fluTrend2).
fluTrend2 <- lm(logILI ~ Queries + logILILag2, data = train)

# Build the same lag feature on the test set.
test$ILILag2 <- coredata(stats::lag(zoo(test$ILI), -2, na.pad = TRUE))

# Fill the 2 leading NAs with the last 2 ILI values of the training set:
# test row 1 gets train row n-1, test row 2 gets train row n.
# This assumes the test set immediately follows the training set in time.
test$ILILag2[1:2] <- tail(train$ILI, 2)
test$logILILag2 <- log(test$ILILag2)

# Predict on the original ILI scale and compute test-set SSE and RMSE.
pred2 <- exp(predict(fluTrend2, newdata = test))
SSE2 <- sum((pred2 - test$ILI)^2)
RMSE2 <- sqrt(SSE2 / nrow(test))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment