Created
November 11, 2013 10:32
-
-
Save IronistM/7411127 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
suppressMessages(library(forecast)) | |
data<-read.csv( file('stdin') ) | |
anomaly_detection<-function(data){ | |
seasonality<-48 | |
data_series<-ts(data$count,frequency=seasonality) | |
train_start<-1 ## train on 1 month of data | |
train_end<-ifelse(length(data_series)%%2==0,length(data_series)/2,(length(data_series)-1)/2) | |
test_start<-train_end | |
test_end<-length(data_series)-1 ## do not include last period of data since it's potentially not complete | |
# find best model | |
f_prev<-forecast_model(data_series,test_start,test_end-1,train_start,train_end) | |
## fit model to every point except last one to check if previous value is an anomaly | |
## if so, do not use it when forecasting next point | |
previous<-data_series[test_end] | |
## ignore last point if it was an anomaly when fitting model | |
is_anomaly_previous<-ifelse(previous>f_prev$upper[2]|previous<f_prev$lower[2],TRUE,FALSE) | |
if(is_anomaly_previous){ | |
## if last point was anomalous, set it equal to the point at the previous season | |
data_series[test_end]<-data_series[test_end-seasonality] | |
} | |
## similar call as above, except now data_series is not using the anomalous value | |
## similar call as above, except now data_series is using the latest value in data_series to forecast | |
f_cur<-forecast_model(data_series,test_start,test_end,train_start,train_end) | |
current<-data_series[test_end+1] | |
is_anomaly_current<-ifelse(current>f_cur$upper[2]|current<f_cur$lower[2],TRUE,FALSE) | |
return(c(data[test_end+1,1],current,is_anomaly_current)) | |
} | |
# function to select best fitting forecast model | |
forecast_model<-function(data_series,test_start,test_end,train_start,train_end){ | |
# train model on training set | |
arima_model<-auto.arima(data_series[train_start:train_end]) | |
# Set up a HoltWinters model. If the time series isn't long enough, revert to the arima model | |
HW_model<-tryCatch(HoltWinters(ts(data_series[train_start:train_end],frequency=48)), | |
error = function (e) arima_model, | |
warning = function(w) arima_model) | |
# forecast model on next 6 points | |
# this is the "testing set" | |
arima_model_forecast<-forecast(arima_model,h=6) | |
HW_model_forecast<-forecast(HW_model,h=6) | |
# compare forecasted values to actual values to determine accuracy | |
arima_acc<-accuracy(arima_model_forecast,data_series[test_start:(test_start+6)]) | |
HW_acc<-accuracy(HW_model_forecast,data_series[test_start:(test_start+6)]) | |
# model error is average of Root Mean Square Error and Mean Absolute Percentage Error | |
# select model with lowest level of error | |
arima_err<-(arima_acc[2,2]+arima_acc[2,5])/2 | |
HW_err<-(HW_acc[2,2]+HW_acc[2,5])/2 | |
# fit model to entire testing set | |
if(arima_err<=HW_err){ | |
f.model<-auto.arima(data_series[test_start:test_end]) | |
} else { | |
f.model<-HoltWinters(ts(data_series[test_start:test_end],frequency=48)) | |
} | |
# forecast next point using best model | |
f<-forecast(f.model,h=1,level=c(0.8,0.95,0.99)) | |
return(f) | |
} | |
cat( anomaly_detection(data), sep="," ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment