Created
April 25, 2016 19:28
-
-
Save primaryobjects/69c65802315f677477a1aa0070dbf265 to your computer and use it in GitHub Desktop.
Flu trend analysis with linear regression and time-series features.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Load the training and test sets; both are expected to provide ILI and Queries columns. | |
| train <- read.csv('flutrain.csv') | |
| test <- read.csv('flutest.csv') | |
| # Exploratory look: print the training rows ordered by ILI, then by Queries. | |
| train[order(train$ILI), ] | |
| train[order(train$Queries), ] | |
| # Distribution of ILI, and log(ILI) plotted against Queries. | |
| hist(train$ILI) | |
| plot(log(train$ILI), train$Queries) | |
| # First model: log(ILI) as a linear function of Queries. | |
| train$logILI <- log(train$ILI) | |
| fluTrend1 <- lm(logILI ~ Queries, data = train) | |
| # Check which expression of the correlation reproduces 0.709 (presumably the model's r-squared; confirm with summary(fluTrend1)). | |
| # all.equal() replaces == so the check tolerates floating-point representation error. | |
| val <- cor(train$logILI, train$Queries) | |
| isTRUE(all.equal(round(val^2, 3), 0.709)) | |
| isTRUE(all.equal(round(log(1 / val), 3), 0.709)) | |
| isTRUE(all.equal(round(exp(-0.5 * val), 3), 0.709)) | |
| # Predict on the test set; exp() undoes the log transform so pred is on the ILI scale. | |
| pred <- exp(predict(fluTrend1, newdata = test)) | |
| results <- cbind(test, pred) | |
| # Relative error of the prediction for the 11th test row. | |
| week <- results[11, ] | |
| relativeErr <- (week$ILI - week$pred) / week$ILI | |
| # Sum of squared errors and root-mean-squared error on the test set. | |
| SSE <- sum((pred - test$ILI)^2) | |
| RMSE <- sqrt(SSE / nrow(test)) | |
| # zoo supplies the lag() method used for the time-series feature; install it only when it is missing. | |
| if (!requireNamespace('zoo', quietly = TRUE)) { | |
|   install.packages('zoo') | |
| } | |
| library(zoo) | |
| # Add a time-series feature by setting the value to be the ILI from 2 weeks prior for each record. Note, the first 2 will be NA (since there is no prior 2-week value). | |
| # coredata() strips the zoo wrapper so the data frame column is a plain numeric vector. | |
| train$ILILag2 <- coredata(lag(zoo(train$ILI), -2, na.pad = TRUE)) | |
| # Create a log value for this new ILI lag value, just like we did for ILI itself. | |
| train$logILILag2 <- log(train$ILILag2) | |
| plot(log(train$ILILag2), log(train$ILI)) | |
| # Second model: Queries plus the lagged log(ILI). The original author notes an even better r-squared value than the first model. | |
| fluTrend2 <- lm(logILI ~ Queries + logILILag2, data = train) | |
| test$ILILag2 <- coredata(lag(zoo(test$ILI), -2, na.pad = TRUE)) | |
| # We can fill in the 2 missing lag values with the last 2 ILI values of the training set (the two sets are sequential). | |
| test$ILILag2[1:2] <- tail(train$ILI, 2) | |
| test$logILILag2 <- log(test$ILILag2) | |
| # Predict on the test set (back on the ILI scale) and compute the error metrics for the second model. | |
| pred2 <- exp(predict(fluTrend2, newdata = test)) | |
| SSE2 <- sum((pred2 - test$ILI)^2) | |
| RMSE2 <- sqrt(SSE2 / nrow(test)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment