Last active
April 12, 2016 13:29
-
-
Save Gedevan-Aleksizde/93ba87d1ede8ff49fdfbde17ce9867d6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(tidyr) # ver. 0.3.1. | |
| library(dplyr) # ver. 0.4.3 | |
| library(rstan) # ver. 2.9.0 | |
| library(forecast) # ver. 6.2 | |
| library(zoo) # ver. 1.7-11 | |
| library(gdata) # ver. 2.13.3 | |
| library(loo) # ver. 0.1.6 | |
| # read the xls file | |
| # http://www.stat.go.jp/data/chouki/zuhyou/02-05.xls | |
| temp <- read.xls("~/Documents/blog/20160327_VARIMA2/02-05.xls", skip=5, header=T, dec=",") | |
| header <- as.character(read.xls("~/Documents/blog/20160327_VARIMA2/02-05.xls", skip=3, nrows=1, header=F, stringsAsFactors =F)) | |
| colnames(temp) <- ifelse(header != "NA", header, 1:length(header)*10) | |
| colnames(temp)[2] <- "year" | |
| temp <- temp %>% dplyr::select(-starts_with("Japan"), -ends_with("0")) | |
| colnames(temp)[ncol(temp)] <- "Okinawa" | |
| temp <- temp %>% mutate_each(funs(gsub(pattern=",|[a-z]|)|-", replacement="", .)) ) %>% mutate_each(funs(as.integer)) %>% filter(!is.na(year), year >=1950) | |
| ts.pop <- ts(as.matrix(temp[,-1]), start = temp$year[1]) | |
| # interpolate missing values | |
| ts.pop <- ts(na.approx(zoo(ts.pop)), start=temp$year[1]) | |
| df.pop <- as.data.frame(ts.pop) | |
| row.names(df.pop) <- temp$year | |
| # plot correlograms (too many) | |
| acf(ts.pop) | |
| temp <- df.pop | |
| temp$t <- row.names(temp) | |
| for( i in seq(from=1, to=47, by=4)){ | |
| print( | |
| ggplot(temp %>% gather(key=prefecture, value=pop, -t) %>% filter(prefecture %in% unique(prefecture)[i:(i+5) ]) ) + | |
| geom_line(aes(x=t, y=pop, group=prefecture)) + facet_wrap(~prefecture, ncol = 1) | |
| ) | |
| } | |
| plot.stan.pred <- function(data, stanout, p, d, q, span=NULL, time.offset=NULL, ser=NULL){ | |
| Time <- nrow(data) | |
| T_end <- dim(rstan::extract(stanout, "y_pred")$y_pred)[2] | |
| T_forecast <- T_end - Time + d; | |
| N <- ncol(data) | |
| if(is.null(time.offset) ) time.offset <- as.integer(row.names(data)[1]) | |
| if( is.na(time.offset) | is.nan(time.offset) ) time.offset <- 0 | |
| if (is.null(span) ) span <- 1:(Time + nrow(y_pred)) | |
| y_pred <- data.frame() | |
| for ( i in 1:N){ | |
| m.pred <- rstan::extract(stanout, "y_pred")$y_pred[,(Time-d+1):T_end,i] | |
| # arima(p,1,d) case | |
| if( d==1){ | |
| m.pred[,1] <- m.pred[,1] + data[nrow(data),i] | |
| for( t in 2:ncol(m.pred)) | |
| m.pred[,t] <- m.pred[,t] + m.pred[,t-1] | |
| } | |
| # pred. interval | |
| temp <- data.frame(t = Time-d +1:T_forecast, | |
| series = rep(i, T_forecast), | |
| y90_l = apply(m.pred, 2, quantile, probs=.05, na.rm=T), | |
| y90_u = apply(m.pred, 2, quantile, probs=.95, na.rm=T), | |
| y80_l = apply(m.pred, 2, quantile, probs=.1, na.rm=T), | |
| y80_u = apply(m.pred, 2, quantile, probs=.9, na.rm=T), | |
| y_med = apply(m.pred, 2, median), | |
| y_mean = apply(m.pred, 2, mean, rm.na=T) | |
| ) | |
| y_pred <- rbind(y_pred, temp) | |
| } | |
| temp <- data.frame(t=1:Time, data) | |
| colnames(temp)[1+1:N] <- seq(1:N) | |
| # transpose y_* in single column and join | |
| y_pred <- temp %>% gather(key=series, value=y_mean, -t) %>% mutate(series=as.integer(series)) %>% bind_rows(y_pred) | |
| y_pred <- y_pred %>% filter(t %in% span) %>% mutate(t=t+time.offset) | |
| # merge with header | |
| df.header <- data.frame( series=1:N, name=colnames(data)) | |
| y_pred <- y_pred %>% left_join(y = df.header, by = "series") %>% mutate(series=as.character(name)) | |
| if (!is.null(ser)){ | |
| y_pred <- y_pred %>% filter(series %in% ser) | |
| } | |
| ggplot(y_pred) + geom_line(aes(x=t, y=y_mean)) + geom_line(aes(x=t, y_med), linetype=2) + | |
| geom_ribbon(aes(x=t, ymin=y90_l, ymax=y90_u), alpha=.2, fill="blue") + geom_ribbon(aes(x=t, ymin=y80_l, ymax=y80_u), alpha=.5, fill="grey") + | |
| labs(title=paste0("ARMA(", p, ",", d, ",", q, ") Forecasts by MCMC"), y="y") + | |
| scale_y_continuous(labels = function(x){formatC(x, format="d", big.mark = "," )} ) + facet_wrap(~series) | |
| } | |
| ### estimate by stan ### | |
| rstan_options(auto_write = TRUE) | |
| options(mc.cores = parallel::detectCores()) | |
| stan.varmapq <- stan_model(file="~/Documents/blog/20160327_VARIMA2/varma.stan") # compile | |
| stan.output <- function(data, T_forecast, p, d, q, chain=1, series=c("Hokkaido", "Tokyo", "Osaka") ){ | |
| seq.span <- as.integer(row.names(data)) | |
| if( d > 0) data.d <- diff(ts(data), differences = d) | |
| else data.d <- data | |
| res.stan.pop <- sampling(stan.varmapq, data=list(T=nrow(data.d), N=ncol(data.d), y=data.d, T_forecast=T_forecast, p=p, q=q, d=d), chain=chain ) | |
| w <- waic(extract_log_lik(res.stan.pop)) | |
| g <- plot.stan.pred(data = data, stanout = res.stan.pop, p=p, d=d, q=q, | |
| ser= series, time.offset = 1950 ) | |
| print(g) | |
| print(w) | |
| return( list( | |
| stan.out=res.stan.pop, | |
| graph= g, | |
| waic = w | |
| )) | |
| } | |
| # VARIMA(1,0,1) | |
| result.1 <- stan.output(data = df.pop, T_forecast = 30, p = 1, d = 0, q = 1) | |
| print(result.1$stan.out, pars="Psi") | |
| ggplot(temp %>% gather(key=prefecture, value=pop, -t) %>% filter(prefecture %in% c("Tokyo", "Osaka", "Hokkaido") ) ) + | |
| geom_line(aes(x=t, y=pop, group=prefecture)) + facet_wrap(~prefecture, ncol = 1) | |
| # VARMA(1,1,1) | |
| plot(diff(ts.pop[,1:10])) | |
| plot(diff(ts.pop[,11:20])) | |
| plot(diff(ts.pop[,21:30])) | |
| plot(diff(ts.pop[,31:40])) | |
| plot(diff(ts.pop[,41:47])) | |
| result.2 <- stan.output(data = df.pop, T_forecast = 30, p = 1, d = 1, q = 1) | |
| # semi-auto (p,q) selection version | |
| stan.output.semiauto <- function(data, T_forecast, d=1, chain=1, series=c("Hokkaido", "Tokyo", "Osaka") ){ | |
| seq.span <- as.integer(row.names(data)) | |
| if( d > 0) data.d <- diff(ts(data), differences = d) | |
| else data.d <- data | |
| w <- Inf | |
| for ( pq in list(c(1,1), c(2,2), c(1,0), c(0,1), c(0,0) ) ){ | |
| print(paste("p=", pq[[1]], ",", "d=", d, "q=", pq[[2]] )) | |
| res.temp <- sampling(stan.varmapq, data=list(T=nrow(data.d), N=ncol(data.d), y=data.d, T_forecast=T_forecast, p=pq[[1]], q=pq[[2]], d=d), chain=1 ) | |
| w.temp <- waic(extract_log_lik(res.temp)) | |
| if( w.temp$waic < w) { | |
| w <- w.temp$waic | |
| w.info <- w.temp | |
| p <- pq[[1]] | |
| q <- pq[[2]] | |
| res.stan <- res.temp | |
| } | |
| print(w.temp) | |
| } | |
| w <- w.info | |
| g <- plot.stan.pred(data = data, stanout = res.stan.pop, p=p, d=d, q=q, | |
| ser= series, time.offset = 1950 ) | |
| print(g) | |
| print(paste("selected:", "p=", p, "d=", d, "q=", q)) | |
| print(w) | |
| return( list( | |
| stan.out=res.stan.pop, | |
| graph= g, | |
| waic = w | |
| )) | |
| } | |
| # Only Kantou district | |
| df.pop.k <- df.pop %>% dplyr::select(Tokyo, Kanagawa, Saitama, Chiba, Ibaraki, Tochigi, Gumma) | |
| result.k <- stan.output.semiauto(data = df.pop.k, T_forecast = 30, d = 1, series = NULL) | |
| result.all.1 <- stan.output.semiauto(data = df.pop, T_forecast = 30, d = 0, series = NULL) | |
| result.all.0 <- stan.output.semiauto(data = df.pop, T_forecast = 30, d = 1, series = NULL) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment