Last active
March 9, 2022 14:21
-
-
Save hongyuanjia/ffafd318bf0403a8e7836c1711d3e95c to your computer and use it in GitHub Desktop.
Initialize input data for Stan
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Initialize input data for Stan
#'
#' Selects the input, output and calibration-parameter columns from the field
#' and computed data, rescales them, and bundles the results into a nested
#' list.
#'
#' @param field A data.frame that contains field-measured data.
#'
#' @param computed A data.frame that contains computed (simulated) data.
#'
#' @param designed A data.frame that contains designed data for prediction.
#'   Default is set to the same input as `computed`.
#'
#' @param inputs,outputs,params One or more unquoted expressions separated by
#'   commas. Normally unquoted column names for input, output and
#'   calibration parameter, respectively. Passed directly to
#'   [dplyr::select()]. `inputs` is selected from `field`, `computed` and
#'   `designed`; `outputs` from `field` and `computed`; `params` from
#'   `computed` only.
#'
#' @return A named list of 3 elements:
#'
#' - `input`: A list of 3 elements. They are all min-max normalized based on
#'   the minimum and maximum of the combined raw observed and computed input.
#'   Designed input is rescaled with those same bounds, so values outside the
#'   observed/computed range fall outside `[0, 1]`.
#'     - `observed`: A tibble with processed observed input data
#'     - `computed`: A tibble with processed computed input data
#'     - `designed`: A tibble with processed designed input data
#' - `output`: A list of 2 elements. Every column is z-score standardized
#'   (mean subtracted, divided by the standard deviation), using the mean and
#'   standard deviation of that column within its own data set.
#'     - `observed`: A tibble with processed observed output data
#'     - `computed`: A tibble with processed computed output data
#' - `param`: A tibble with processed data of calibration parameters. Each
#'   column has been min-max normalized using its own minimum and maximum.
#'
#' @examples
#'
#' init_stan_data(
#'     field = data_field, computed = data_comp,
#'     inputs = c(tdb, rh, solar_rad),
#'     outputs = c(total_elec, heating_gas, cooling_elec),
#'     params = c(tc1, tc2, tc3)
#' )
#'
init_stan_data <- function(field, computed, designed = computed, inputs, outputs, params) {
    # Fail early with a clear message instead of a confusing error from
    # inside dplyr.
    stopifnot(
        is.data.frame(field),
        is.data.frame(computed),
        is.data.frame(designed)
    )

    # Instead of using number or column index of input, output or parameters,
    # column names are used:
    #
    # 1. The names give you the basic information about the data.
    #
    # 2. Using names avoids mistakes when the numbers or positions of input
    #    data columns change but the index is not updated.
    #
    # Likewise, meaningful local names ('out_obs', 'in_obs', ...) are used
    # instead of 'y', 'xf', 'xc', etc.

    # observed inputs
    in_obs <- field %>% dplyr::select({{ inputs }})
    # computed inputs
    in_sim <- computed %>% dplyr::select({{ inputs }})
    # designed inputs for predictions
    in_pred <- designed %>% dplyr::select({{ inputs }})

    # observed outputs
    out_obs <- field %>% dplyr::select({{ outputs }})
    # computed outputs
    out_sim <- computed %>% dplyr::select({{ outputs }})

    # calibration parameters
    par <- computed %>% dplyr::select({{ params }})

    # min-max normalize observed, computed and designed inputs
    # NOTE: The normalization is based on the min and max of the combined
    #       observed and computed inputs, so all three data sets share one
    #       scale per column.
    in_comb <- dplyr::bind_rows(in_obs, in_sim)
    # Lambdas are used because passing extra arguments through `across(...)`
    # is deprecated in dplyr >= 1.1.0.
    in_comb_min <- in_comb %>% dplyr::summarise(
        dplyr::across(dplyr::everything(), ~ min(.x, na.rm = TRUE))
    )
    in_comb_max <- in_comb %>% dplyr::summarise(
        dplyr::across(dplyr::everything(), ~ max(.x, na.rm = TRUE))
    )

    in_obs_norm <- minmax_norm_df(in_obs, in_comb_min, in_comb_max)
    in_sim_norm <- minmax_norm_df(in_sim, in_comb_min, in_comb_max)
    in_pred_norm <- minmax_norm_df(in_pred, in_comb_min, in_comb_max)

    # min-max normalize calibration parameters, column by column
    par_norm <- par %>% dplyr::mutate(
        dplyr::across(
            dplyr::everything(),
            ~ minmax_norm(.x, min(.x, na.rm = TRUE), max(.x, na.rm = TRUE))
        )
    )

    # standardize observed and computed outputs
    # NOTE(review): each data set is standardized with its own mean and sd;
    #               confirm the downstream Stan model does not expect both on
    #               a common scale.
    out_sim_std <- out_sim %>% dplyr::mutate(
        dplyr::across(dplyr::everything(), zscore_norm)
    )
    out_obs_std <- out_obs %>% dplyr::mutate(
        dplyr::across(dplyr::everything(), zscore_norm)
    )

    # create data as list for input to Stan
    list(
        input = list(
            observed = in_obs_norm,
            computed = in_sim_norm,
            designed = in_pred_norm
        ),
        output = list(
            observed = out_obs_std,
            computed = out_sim_std
        ),
        param = par_norm
    )
}
# Bind magrittr's pipe operator to `%>%` so the functions in this file can use
# it without magrittr (or dplyr) being attached.
`%>%` <- magrittr::`%>%`
# Z-score standardize a numeric vector: subtract its mean and divide by its
# standard deviation.
#
# x:     numeric vector to standardize.
# na.rm: whether missing values are ignored when computing the mean and the
#        standard deviation (missing entries of `x` stay missing in the
#        result).
#
# Returns a numeric vector the same length as `x`.
zscore_norm <- function(x, na.rm = TRUE) {
    center <- mean(x, na.rm = na.rm)
    spread <- stats::sd(x, na.rm = na.rm)
    (x - center) / spread
}
# Min-max rescale `x` with the supplied bounds: `min` maps to 0 and `max`
# maps to 1. The bounds are taken as given (not computed from `x`), so values
# of `x` outside them land outside [0, 1].
#
# x:   numeric vector to rescale.
# min: lower bound used for the rescaling.
# max: upper bound used for the rescaling.
#
# Returns a numeric vector the same length as `x`.
minmax_norm <- function(x, min, max) {
    span <- max - min
    shifted <- x - min
    shifted / span
}
# Min-max rescale every column of `data`, looking up each column's bounds by
# name in `min` and `max`.
#
# data: data frame whose columns are rescaled.
# min:  per-column lower bounds, indexable by the column names of `data`
#       with `[[`.
# max:  per-column upper bounds, indexable the same way as `min`.
#
# Returns the rescaled columns bound together by `purrr::map_dfc()`, carrying
# the same column names as `data`.
minmax_norm_df <- function(data, min, max) {
    cols <- names(data)

    # Rescale one column, identified by name, with its matching bounds.
    rescale_col <- function(col) {
        minmax_norm(data[[col]], min[[col]], max[[col]])
    }

    # Naming the vector of column names makes the result keep those names.
    purrr::map_dfc(stats::setNames(cols, cols), rescale_col)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example workflow