Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save dgrapov/5166570 to your computer and use it in GitHub Desktop.
Save dgrapov/5166570 to your computer and use it in GitHub Desktop.
Orthogonal Signal Correction for PLS models (OPLS)
#see updated code base and some examples in the function "test"
# https://github.com/dgrapov/devium/blob/master/R/Devium%20PLS%20%20and%20OPLS.r
#Orthogonal Signal Correction for PLS models (OPLS)
#adapted from an example in the book <a href="http://www.springer.com/life+sciences/systems+biology+an+bioinfomatics/book/978-3-642-17840-5">"Chemometrics with R by Ron Wehrens"</a>
#this code requires the following packages:
need.packages<-c("pls", # to generate PLS models
"ggplot2" ) # to plot results
#I will use sample data set (glycans in humans) from a google spreadsheet which can be found here
#https://docs.google.com/spreadsheet/ccc?key=0Ap1AEMfo-fh9dHdxRkQtX08xQWdRNVB4VG5HZU9LLXc&usp=sharing
#as an example
#two main functions will be used, which are defined below
#1) OSC.correction --> to calculate OSC corrected models
#2) plot.OSC.results --> to plot OSC corrected results
#function to carry out orthogonal signal correction-PLS (~OPLS)
OSC.correction<-function(pls.y,pls.data,comp=5,OSC.comp=4,validation = "LOO",...){ # later open to all plsr options, ...
require(pls)
#initialize
OSC.results<-list()
OSC.results$data[[1]]<-pls.data
OSC.results$y[[1]]<-pls.y # also add a place to store plsr options for record keeping
# add a place to store plsr options for record keeping
#need to iteratively fit models for each OSC
for(i in 1:(OSC.comp+1)){
data<-OSC.results$data[[i]]
tmp.model<-plsr(OSC.results$y[[1]]~., data = data, ncomp = comp, validation = validation)#,...
ww<-tmp.model$loading.weights[,1]
pp<-tmp.model$loadings[,1]
w.ortho<- pp - crossprod(ww,pp)/crossprod(ww)*ww
t.ortho<- as.matrix(pls.data) %*% w.ortho
p.ortho<- crossprod(as.matrix(data),t.ortho)/ c(crossprod(t.ortho))
Xcorr<- data - tcrossprod(t.ortho,p.ortho)
#store results
OSC.results$RMSEP[[i]]<-matrix(RMSEP(tmp.model)$val,ncol=2,byrow=TRUE)
OSC.results$scores[[i]]<-tmp.model$scores
OSC.results$loadings[[i]]<-tmp.model$loadings
OSC.results$loading.weights[[i]]<-tmp.model$loading.weights
OSC.results$total.LVs[[i]]<-comp
OSC.results$OSC.LVs[[i]]<-i-1 # account for first model not having any OSC LVs
#initialize data for next round
OSC.results$data[[i+1]]<-as.data.frame(Xcorr)
}
return(OSC.results)
}
#function to plot OSC.results
plot.OSC.results<-function(obj,plot="RMSEP",groups=NULL){
require(ggplot2)
#plot = one of: c("RMSEP","scores","loadings","delta.weights")
#groups is a factor to show group visuyalization in scores plot
switch(plot,
RMSEP = .local<-function(obj){
#bind info and RMSEP
comps<-obj$total.LVs
ocomps<-obj$OSC.LVs
plot.obj<-obj$RMSEP
bound<-do.call("rbind",lapply(1:length(comps),function(i)
{
out<-as.data.frame(cbind(plot.obj[[i]][,1],c(0:comps[i]),paste(comps[i]," LVs and ",ocomps[i]," OSC LVs",sep="")))
colnames(out)<-c("RMSEP","component","model")
out
}))
bound[,1:2]<-as.numeric(as.matrix(bound[,1:2]))
#custom theme
.theme<- theme(
axis.line = element_line(colour = 'gray', size = .75),
panel.background = element_blank(),
plot.background = element_blank()
)
#plot
p<-ggplot(data=bound, aes(x=component, y=RMSEP, group=model,color=model)) + geom_line(size=1,alpha=.5) + geom_point(size=2)+.theme
print(p)
},
scores = .local<-function(obj){
comps<-obj$total.LVs
ocomps<-obj$OSC.LVs
plot.obj<-obj$scores
bound<-do.call("rbind",lapply(1:length(comps),function(i)
{
out<-as.data.frame(cbind(plot.obj[[i]][,1:2],unlist(groups),paste(comps[i]," LVs and ",ocomps[i]," OSC LVs",sep="")))
colnames(out)<-c("Comp1","Comp2","groups","model")
out
}))
bound[,1:2]<-as.numeric(as.matrix(bound[,1:2]))
#calculate convex hull for polygons for each group
data.obj <- split(bound, bound$model)
tmp.obj <- lapply(1:length(data.obj), function(i){
obj<-data.obj[[i]]
s2<-split(obj,obj[,3])
do.call(rbind,lapply(1:length(s2),function(j){
tmp<-s2[[j]]
tmp[chull(tmp[,1:2]),]
}))
})
chull.boundaries <- do.call("rbind", tmp.obj)
#custom theme
.theme<- theme(
axis.line = element_line(colour = 'gray', size = .75),
panel.background = element_blank(),
panel.border = element_rect(colour="gray",fill=NA),
plot.background = element_blank()
)
#make plot
p<-ggplot(data=bound, aes(x=Comp1, y=Comp2, group=groups,color=groups)) + #geom_density2d(aes(group=groups))+
geom_hline(aes(yintercept=0),color="gray60",linetype="dashed")+geom_vline(aes(xintercept=0),color=I("gray60"),linetype=2)+facet_grid(. ~ model)
p<-p+geom_polygon(data=chull.boundaries,aes(x=Comp1,y=Comp2,fill=groups),alpha=.5) +geom_point(size=2)+.theme
print(p)
},
loadings = .local<-function(obj){ # will only plot first component for each model
comps<-obj$total.LVs
ocomps<-obj$OSC.LVs
plot.obj<-obj$loadings
bound<-do.call("rbind",lapply(1:length(comps),function(i)
{
out<-as.data.frame(cbind(plot.obj[[i]][,1:2],rownames(plot.obj[[i]]),paste(comps[i]," LVs and ",ocomps[i]," OSC LVs",sep="")))
colnames(out)<-c("Comp1","Comp2","variable","model")
out
}))
bound[,1:2]<-as.numeric(as.matrix(bound[,1:2]))
#custom theme
.theme<- theme(
axis.line = element_line(colour = 'gray', size = .75),
panel.background = element_blank(),
legend.position = "none",
plot.background = element_blank()
)
#make plot
p<-ggplot(data=bound, aes(x=variable,y=Comp1, fill=variable)) + geom_bar(stat = "identity") + coord_flip() + #geom_density2d(aes(group=groups))+
facet_grid(. ~ model) +.theme
print(p)
},
delta.weights = .local<-function(obj){ # will only plot first component for each model
comps<-obj$total.LVs
ocomps<-obj$OSC.LVs
plot.obj<-obj$loading.weights
bound<-do.call("rbind",lapply(2:(length(ocomps)),function(i)
{
out<-as.data.frame(cbind(plot.obj[[1]][,1]-plot.obj[[i]][,1],names(plot.obj[[i]][,1]),paste(comps[i]," LVs and ",ocomps[i]," OSC LVs",sep="")))
colnames(out)<-c("delta_weight","variable","model")
out
}))
bound[,1]<-as.numeric(as.matrix(bound[,1]))
#theme
.theme<- theme(
axis.line = element_line(colour = 'gray', size = .75),
panel.background = element_blank(),
legend.position = "none",
plot.background = element_blank()
)
#make plot
p<-ggplot(data=bound, aes(x=variable,y=delta_weight, fill=variable)) + geom_bar(stat = "identity") + coord_flip() + #geom_density2d(aes(group=groups))+
facet_grid(. ~ model) +.theme
print(p)
}
)
.local(obj)
}
#now import data and Y using which ever way you preffer
# I use devium (https://github.com/dgrapov/devium)
# now format and scale data for modeling
imported.data<-data# copied from clipboard
pls.data<-data.frame(matrix(as.numeric(as.matrix(imported.data)),nrow(imported.data),ncol(imported.data))) # make sure its numeric
dimnames(pls.data)<-dimnames(imported.data)
#Y (object to predict)
pls.y<-as.numeric(as.character(unlist(Y))) # Disease_Status
#auto scaling of data
scaled.data<-as.data.frame(scale(pls.data,scale=apply(pls.data,2,sd),center=colMeans(pls.data)))
#compare a model with 0 or 1 OSC components and 10 total components
# use SIMPLS algorithm for calculating components and leave-one-out internal cross-validation for parameter estimation
mods<-OSC.correction(pls.y,pls.data,comp=10,OSC.comp=1,validation = "LOO",methods="oscorespls")
#visualize predictive error for trainning data (RMSEP)
plot.OSC.results(mods,plot="RMSEP",groups=groups)
# based on this we can see that 2 copmponents minimize the error for both 0 or 1 OSC models
# recalculate OSC model using 1 OSC and 2 total components ( 1 non - OSC)
mods<-OSC.correction(pls.y,pls.data,comp=2,OSC.comp=1,validation = "LOO",methods="oscorespls")
#visualize scores comparing 0 and 1 OSC models
#I set groups = pls.y to display polygons around each class
plot.OSC.results(mods,plot="delta.weights",groups=pls.y)
#now compare model variable loadings between 0 and 1 OSC models
plot.OSC.results(mods,plot="loadings")
#finally identify changes (delta) in variable model weights (beta weights) between 0 and 1 OSC models
plot.OSC.results(mods,plot="delta.weights")
@dgrapov
Copy link
Author

dgrapov commented Mar 22, 2019

  • Which parameter of the OSC.correction function can I use, to do a PLS model ?

The OSC correction algorithm proceeds as PLS --> OSC --> iterate for all latent variables. If you just want PLS you could try setting OSC.comp=0 or using the pls library directly.

  • Which parameter contains the OSC corrected values?

Depending on what you want the main output, for each component, contains:

RMSEP - root mean squared error of projection
scores - sample projection
loadings - variable projection
loading.weights - variable coefficient weights
data - OSC corrected data

  • In my analysis, I am using spectra after SVN and SG corrections and I have seen in your script the following sentence : #auto scaling of data
    scaled.data<-as.data.frame(scale(pls.data,scale=apply(pls.data,2,sd),center=colMeans(pls.data))). Is it also like a SVN normalization? Does it redundant with my corrections?

This step is often done before multivariate analysis (for each variable, subtract the mean and divide by the standard deviation). This is helpful for comparing differences between variables with large differences in magnitude e.g. compare a variable with range 1-10 to one with 100-1000. You can skip this step, but then the first 2 model dimensions will be overwhelmed by it.

@anaguilarar
Copy link

Many thanks for your code. It is very useful.
I only have one doubt and it is regarding the way that you calculate t_ortho (t.ortho<- as.matrix(pls.data) %% w.ortho). I wonder if this new t_ortho, what I understand, should be calculated on the corrected signal or previous residuals (as.matrix(data) %% w.ortho), or I'm wrong? Thank you again

@dgrapov
Copy link
Author

dgrapov commented Nov 28, 2023

The code is adapted from the one shown below.
See link above section: 11.4 Orthogonal Signal Correction and OPLS pg: 267

> gasolineSC <- gasoline
> gasolineSC$NIR <-
+ scale(gasolineSC$NIR, scale = FALSE,
+ center = colMeans(gasolineSC$NIR[gas.odd, ]))
> gasolineSC.pls <- plsr(octane ˜ ., data = gasolineSC, ncomp = 5,
+ subset = gas.odd, validation = "LOO")
> ww <- gasolineSC.pls$loading.weights[, 1]
> pp <- gasolineSC.pls$loadings[, 1]
> w.ortho <- pp - c(crossprod(ww, pp)/crossprod(ww)) * ww
> t.ortho <- gasolineSC$NIR[gas.odd, ] %*% w.ortho
> p.ortho <- crossprod(gasolineSC$NIR[gas.odd, ], t.ortho) /
+ c(crossprod(t.ortho))
> Xcorr <- gasolineSC$NIR[gas.odd, ] - tcrossprod(t.ortho, p.ortho)

# Next, a new PLS model is created using the corrected data matrix:
> gasolineSC.osc1 <- data.frame(octane = gasolineSC$octane[gas.odd],
+ NIR = Xcorr)
> gasolineSC.opls1 <- plsr(octane ˜ ., data = gasolineSC.osc1,
+ ncomp = 5, validation = "LOO")

# Removal of a second OSC component proceeds along the same lines:
> pp2 <- gasolineSC.opls1$loadings[, 1]
> w.ortho2 <- pp2 - c(crossprod(ww, pp2)/crossprod(ww)) * ww
> t.ortho2 <- Xcorr %*% w.ortho2
> p.ortho2 <- crossprod(Xcorr, t.ortho2) / c(crossprod(t.ortho2))
> Xcorr2 <- Xcorr - tcrossprod(t.ortho2, p.ortho2)
> gasolineSC.osc2 <- data.frame(octane = gasolineSC$octane[gas.odd],
+ NIR = Xcorr2)
> gasolineSC.opls2 <- plsr(octane ˜ ., data = gasolineSC.osc2,
+ ncomp = 5, validation = "LOO")

Its been a while since I looked at this code. A good unit test would be to compare an equivalent example data using my and the original code. Looking at this now it does look like a bug not to iterate the data used for that step in the loop. Let me know what you find :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment