Created
April 15, 2020 08:14
-
-
Save philipp-baumann/adabf863b4da54ebf20b1cca661d83d1 to your computer and use it in GitHub Desktop.
simplerspec: Average spectra in list-column by entries in grouping column
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' @title Average spectra in list-column by entries in grouping column | |
#' @description Average spectra in list-column of spectra tibble (`spc_tbl`) by | |
#' groups given in group column. | |
#' @param spc_tbl Tibble data frame containing at least the grouping column | |
#' given in argument `by` and input spectra given in list-column `column_in`. | |
#' @param by Character vector of length 1L or name/symbol that specifies the | |
#' column by which groups of spectra are averaged. Default is `"sample_id"`. | |
#' @param column_in Character vector of length 1L or or name/symbol that | |
#' specifies the list-column that contains the inputs spectra to be averaged. | |
#' Default is `"spc_rs"`, which are resampled spectra (i.e., resulting after | |
#' preceding `resample_spc()` step). | |
#' @return Spectra tibble data frame (class `"tbl_df"`, `"tbl"`, `"data.frame"`) | |
#' with a new list-column of column name `"spc_mean"` at the last position, | |
#' containing mean spectra with identical row replicates within the same | |
#' `by`-group. | |
#' @details For memory efficiency and subsequent modeling, consider slicing the | |
#' extra row copies of `spc_mean` resulting from `average_spc()` for example by | |
#' * `split(x = spc_tbl, f = spc_tbl$<by>) %>% lapply(., function(x) x x[1, ]) %>% do.call(., rbind)` | |
#' * `dplyr::group_by(spc_tbl, <by>) %>% dplyr::slice(1L)` | |
#' @import stats | |
#' @importFrom data.table data.table rbindlist setkey setDT := .SD | |
#' @importFrom rlang ensym quo_name | |
#' @export | |
average_spc <- function(spc_tbl, by = "sample_id", column_in = "spc_rs") { | |
# Avoid R CMD check note: `"...no visible binding for global variable..."` | |
spc_rs <- sample_id <- id <- NULL | |
# Quote the symbol or the string supplied by the second and third argument | |
column_in <- rlang::enquo(column_in) | |
by <- rlang::enquo(by) | |
# Combine rows of all resampled spectra in one data.table | |
spc <- data.table::rbindlist(dplyr::pull(spc_tbl, !!column_in)) | |
# Add `id` group column to input spectra | |
spc[, id := dplyr::pull(spc_tbl, !!by)] # spc_tbl[, by][[by]] | |
# Average spectra, use `id` column as index for grouping | |
data.table::setkey(spc, id) | |
spc_mean <- spc[, lapply(.SD, mean), by = id] | |
# Create new vector of group ID values from column `id` | |
group_id_mean <- spc_mean[, id] | |
# Delete sample_id column in data.table | |
spc_mean_noid <- spc_mean[, id := NULL] | |
# Create list of averaged spectra, one spectrum is one data.table | |
# Use best performing alternative: | |
# https://github.com/jennybc/row-oriented-workflows/blob/master/iterate-over-rows.md | |
spc_mean_list <- stats::setNames( | |
map(purrr::transpose(spc_mean_noid), data.table::as.data.table), | |
group_id_mean | |
) | |
# Convert averaged spectra and sample_id to tibble | |
spc_mean_tbl <- tibble::tibble( | |
!!by := group_id_mean, spc_mean = spc_mean_list | |
) | |
# Join mean spectra tibble spc_tbl_mean to spc_tbl | |
spc_tbl_out <- dplyr::left_join(spc_tbl, spc_mean_tbl, | |
by = rlang::quo_name(by)) | |
return(spc_tbl_out) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment