philipp-baumann · December 29, 2017 11:55
diff --git a/02_predict-from-spectra-example.R b/02_predict-from-spectra-example.R
 ################################################################################
 ## Task: Example script to predict spectra in simplerspec framework
 ################################################################################

 # Remove all R objects from memory
 rm(list = ls())

 # Load packages
 pkgs <- c("tidyverse", "simplerspec")
 lapply(pkgs, library, character.only = TRUE)

 ## Register parallel backend for using multiple cores ==========================

 # Allows to tune the models using parallel processing (e.g. use all available
 # cores of a CPU); caret package automatically detects the registered backend
 library(doParallel)
 # Make a cluster with all possible threads (more than physical cores)
 cl <- makeCluster(detectCores())
 # Register backend
 registerDoParallel(cl)
 # Return number of parallel workers
 getDoParWorkers() # 8 threads on MacBook Pro (Retina, 15-inch, Mid 2015);
 # Quadcore processor


 ## Read spectra from OPUS files ================================================

 # Create vector with list of OPUS files to be read
 lf <- dir(path = "data/spectra", full.names = TRUE)

 # Read into list
 spc_l <- read_opus_univ(fnames = lf, parallel = TRUE)
 # You can also read in single core modus (files are not read in parallel 
 # using all cores of the computer); ~ 6 times slower...
 # spc_l <- read_opus_univ(fnames = lf, parallel = FALSE)

 ## Gather and process spectra ==================================================

 # Spectral processing pipe
 spc_tbl <- spc_l %>%
  # Gather list of spectra data into tibble data frame
  gather_spc()

 # Investigate spectral tibble (data.frame)
 # Get raw spectrum from first sample
 dim(spc_tbl$spc[[1]])


 spc_tbl <- spc_tbl %>%
  # Resample spectra to new wavenumber interval
  resample_spc(wn_lower = 500, wn_upper = 3996, wn_interval = 2) %>%
  # Average replicate scans per sample_id
  average_spc() %>%
  # Preprocess spectra using Savitzky-Golay first derivative with a window size
  # of 21 points
  preprocess_spc(select = "sg_1_w21")
  

 ## Predict spectra based on YAMSYS reference models ============================

 # (1) Read model output from reference models ----------------------------------

 # Example: model output of total carbon (C) model
 pls_C <- readRDS("models/yamsys-soil-models/pls_C.Rds")

 # Check elements of pls_C list
 ls(pls_C) # list all element names
 pls_C$data # Spectral tibble
 pls_C$p_pc # Plot with PC1 and PC2 of spectral PCA
 pls_C$model # caret::train() output, can e.g. be used for prediction
 # Check class of model output
 class(pls_C$model) # object of class "train"
 pls_C$p_model # Plot model evaluation summary of calibration model

 # Create list of models; is subsequently used by 
 # simplerspec::predict_from_spc function
 models <- list(
  "pls_C" = pls_C
 )

 # (2) Predict new spectral data (preprocessed data) based on reference
 # calibration model ------------------------------------------------------------

 # Predict into tibble data frame
 prediction_df <- predict_from_spc(model_list = models, spc_tbl = spc_tbl)

 # Extract predictions as vector (column pls_C)
 prediction_df$pls_C

 # Check if is data.frame
 class(prediction_df)
 is.data.frame(prediction_df)


 # Save predictions as csv file -------------------------------------------------

 # Define helper function to negate is_list
 is_not_list <- function(x) ! purrr::is_list(x)

 # Pipe to save selected columns on disk (csv file)
 prediction_df %>% 
  # First only select columns that are no lists (lists cannot be written to 
  # csv direcly; would only be possible to store in JSON file)
  select_if(is_not_list) %>% 
  # now write all columns that are not list-columns as csv file
  write_csv(x = ., path = "predictions/test-soil-pred.csv")
	################################################################################
	## Task: Example script to predict spectra in simplerspec framework
	################################################################################

	# Remove all R objects from memory
	rm(list = ls())

	# Load packages
	pkgs <- c("tidyverse", "simplerspec")
	lapply(pkgs, library, character.only = TRUE)

	## Register parallel backend for using multiple cores ==========================

	# Allows to tune the models using parallel processing (e.g. use all available
	# cores of a CPU); caret package automatically detects the registered backend
	library(doParallel)
	# Make a cluster with all possible threads (more than physical cores)
	cl <- makeCluster(detectCores())
	# Register backend
	registerDoParallel(cl)
	# Return number of parallel workers
	getDoParWorkers() # 8 threads on MacBook Pro (Retina, 15-inch, Mid 2015);
	# Quadcore processor


	## Read spectra from OPUS files ================================================

	# Create vector with list of OPUS files to be read
	lf <- dir(path = "data/spectra", full.names = TRUE)

	# Read into list
	spc_l <- read_opus_univ(fnames = lf, parallel = TRUE)
	# You can also read in single core modus (files are not read in parallel
	# using all cores of the computer); ~ 6 times slower...
	# spc_l <- read_opus_univ(fnames = lf, parallel = FALSE)

	## Gather and process spectra ==================================================

	# Spectral processing pipe
	spc_tbl <- spc_l %>%
	# Gather list of spectra data into tibble data frame
	gather_spc()

	# Investigate spectral tibble (data.frame)
	# Get raw spectrum from first sample
	dim(spc_tbl$spc[[1]])


	spc_tbl <- spc_tbl %>%
	# Resample spectra to new wavenumber interval
	resample_spc(wn_lower = 500, wn_upper = 3996, wn_interval = 2) %>%
	# Average replicate scans per sample_id
	average_spc() %>%
	# Preprocess spectra using Savitzky-Golay first derivative with a window size
	# of 21 points
	preprocess_spc(select = "sg_1_w21")


	## Predict spectra based on YAMSYS reference models ============================

	# (1) Read model output from reference models ----------------------------------

	# Example: model output of total carbon (C) model
	pls_C <- readRDS("models/yamsys-soil-models/pls_C.Rds")

	# Check elements of pls_C list
	ls(pls_C) # list all element names
	pls_C$data # Spectral tibble
	pls_C$p_pc # Plot with PC1 and PC2 of spectral PCA
	pls_C$model # caret::train() output, can e.g. be used for prediction
	# Check class of model output
	class(pls_C$model) # object of class "train"
	pls_C$p_model # Plot model evaluation summary of calibration model

	# Create list of models; is subsequently used by
	# simplerspec::predict_from_spc function
	models <- list(
	"pls_C" = pls_C
	)

	# (2) Predict new spectral data (preprocessed data) based on reference
	# calibration model ------------------------------------------------------------

	# Predict into tibble data frame
	prediction_df <- predict_from_spc(model_list = models, spc_tbl = spc_tbl)

	# Extract predictions as vector (column pls_C)
	prediction_df$pls_C

	# Check if is data.frame
	class(prediction_df)
	is.data.frame(prediction_df)


	# Save predictions as csv file -------------------------------------------------

	# Define helper function to negate is_list
	is_not_list <- function(x) ! purrr::is_list(x)

	# Pipe to save selected columns on disk (csv file)
	prediction_df %>%
	# First only select columns that are no lists (lists cannot be written to
	# csv direcly; would only be possible to store in JSON file)
	select_if(is_not_list) %>%
	# now write all columns that are not list-columns as csv file
	write_csv(x = ., path = "predictions/test-soil-pred.csv")