Last active
November 28, 2023 03:28
-
-
Save dgrapov/5166570 to your computer and use it in GitHub Desktop.
Orthogonal Signal Correction for PLS models (OPLS)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#see updated code base and some examples in the function "test" | |
# https://github.com/dgrapov/devium/blob/master/R/Devium%20PLS%20%20and%20OPLS.r | |
#Orthogonal Signal Correction for PLS models (OPLS) | |
#adapted from an example in the book <a href="http://www.springer.com/life+sciences/systems+biology+an+bioinfomatics/book/978-3-642-17840-5">"Chemometrics with R by Ron Wehrens"</a> | |
#this code requires the following packages: | |
need.packages<-c("pls", # to generate PLS models | |
"ggplot2" ) # to plot results | |
#I will use sample data set (glycans in humans) from a google spreadsheet which can be found here | |
#https://docs.google.com/spreadsheet/ccc?key=0Ap1AEMfo-fh9dHdxRkQtX08xQWdRNVB4VG5HZU9LLXc&usp=sharing | |
#as an example | |
#two main functions will be used, which are defined below | |
#1) OSC.correction --> to calculate OSC corrected models | |
#2) plot.OSC.results --> to plot OSC corrected results | |
#function to carry out orthogonal signal correction-PLS (~OPLS) | |
OSC.correction<-function(pls.y,pls.data,comp=5,OSC.comp=4,validation = "LOO",...){ # later open to all plsr options, ... | |
require(pls) | |
#initialize | |
OSC.results<-list() | |
OSC.results$data[[1]]<-pls.data | |
OSC.results$y[[1]]<-pls.y # also add a place to store plsr options for record keeping | |
# add a place to store plsr options for record keeping | |
#need to iteratively fit models for each OSC | |
for(i in 1:(OSC.comp+1)){ | |
data<-OSC.results$data[[i]] | |
tmp.model<-plsr(OSC.results$y[[1]]~., data = data, ncomp = comp, validation = validation)#,... | |
ww<-tmp.model$loading.weights[,1] | |
pp<-tmp.model$loadings[,1] | |
w.ortho<- pp - crossprod(ww,pp)/crossprod(ww)*ww | |
t.ortho<- as.matrix(pls.data) %*% w.ortho | |
p.ortho<- crossprod(as.matrix(data),t.ortho)/ c(crossprod(t.ortho)) | |
Xcorr<- data - tcrossprod(t.ortho,p.ortho) | |
#store results | |
OSC.results$RMSEP[[i]]<-matrix(RMSEP(tmp.model)$val,ncol=2,byrow=TRUE) | |
OSC.results$scores[[i]]<-tmp.model$scores | |
OSC.results$loadings[[i]]<-tmp.model$loadings | |
OSC.results$loading.weights[[i]]<-tmp.model$loading.weights | |
OSC.results$total.LVs[[i]]<-comp | |
OSC.results$OSC.LVs[[i]]<-i-1 # account for first model not having any OSC LVs | |
#initialize data for next round | |
OSC.results$data[[i+1]]<-as.data.frame(Xcorr) | |
} | |
return(OSC.results) | |
} | |
#function to plot OSC.results | |
plot.OSC.results<-function(obj,plot="RMSEP",groups=NULL){ | |
require(ggplot2) | |
#plot = one of: c("RMSEP","scores","loadings","delta.weights") | |
#groups is a factor to show group visuyalization in scores plot | |
switch(plot, | |
RMSEP = .local<-function(obj){ | |
#bind info and RMSEP | |
comps<-obj$total.LVs | |
ocomps<-obj$OSC.LVs | |
plot.obj<-obj$RMSEP | |
bound<-do.call("rbind",lapply(1:length(comps),function(i) | |
{ | |
out<-as.data.frame(cbind(plot.obj[[i]][,1],c(0:comps[i]),paste(comps[i]," LVs and ",ocomps[i]," OSC LVs",sep=""))) | |
colnames(out)<-c("RMSEP","component","model") | |
out | |
})) | |
bound[,1:2]<-as.numeric(as.matrix(bound[,1:2])) | |
#custom theme | |
.theme<- theme( | |
axis.line = element_line(colour = 'gray', size = .75), | |
panel.background = element_blank(), | |
plot.background = element_blank() | |
) | |
#plot | |
p<-ggplot(data=bound, aes(x=component, y=RMSEP, group=model,color=model)) + geom_line(size=1,alpha=.5) + geom_point(size=2)+.theme | |
print(p) | |
}, | |
scores = .local<-function(obj){ | |
comps<-obj$total.LVs | |
ocomps<-obj$OSC.LVs | |
plot.obj<-obj$scores | |
bound<-do.call("rbind",lapply(1:length(comps),function(i) | |
{ | |
out<-as.data.frame(cbind(plot.obj[[i]][,1:2],unlist(groups),paste(comps[i]," LVs and ",ocomps[i]," OSC LVs",sep=""))) | |
colnames(out)<-c("Comp1","Comp2","groups","model") | |
out | |
})) | |
bound[,1:2]<-as.numeric(as.matrix(bound[,1:2])) | |
#calculate convex hull for polygons for each group | |
data.obj <- split(bound, bound$model) | |
tmp.obj <- lapply(1:length(data.obj), function(i){ | |
obj<-data.obj[[i]] | |
s2<-split(obj,obj[,3]) | |
do.call(rbind,lapply(1:length(s2),function(j){ | |
tmp<-s2[[j]] | |
tmp[chull(tmp[,1:2]),] | |
})) | |
}) | |
chull.boundaries <- do.call("rbind", tmp.obj) | |
#custom theme | |
.theme<- theme( | |
axis.line = element_line(colour = 'gray', size = .75), | |
panel.background = element_blank(), | |
panel.border = element_rect(colour="gray",fill=NA), | |
plot.background = element_blank() | |
) | |
#make plot | |
p<-ggplot(data=bound, aes(x=Comp1, y=Comp2, group=groups,color=groups)) + #geom_density2d(aes(group=groups))+ | |
geom_hline(aes(yintercept=0),color="gray60",linetype="dashed")+geom_vline(aes(xintercept=0),color=I("gray60"),linetype=2)+facet_grid(. ~ model) | |
p<-p+geom_polygon(data=chull.boundaries,aes(x=Comp1,y=Comp2,fill=groups),alpha=.5) +geom_point(size=2)+.theme | |
print(p) | |
}, | |
loadings = .local<-function(obj){ # will only plot first component for each model | |
comps<-obj$total.LVs | |
ocomps<-obj$OSC.LVs | |
plot.obj<-obj$loadings | |
bound<-do.call("rbind",lapply(1:length(comps),function(i) | |
{ | |
out<-as.data.frame(cbind(plot.obj[[i]][,1:2],rownames(plot.obj[[i]]),paste(comps[i]," LVs and ",ocomps[i]," OSC LVs",sep=""))) | |
colnames(out)<-c("Comp1","Comp2","variable","model") | |
out | |
})) | |
bound[,1:2]<-as.numeric(as.matrix(bound[,1:2])) | |
#custom theme | |
.theme<- theme( | |
axis.line = element_line(colour = 'gray', size = .75), | |
panel.background = element_blank(), | |
legend.position = "none", | |
plot.background = element_blank() | |
) | |
#make plot | |
p<-ggplot(data=bound, aes(x=variable,y=Comp1, fill=variable)) + geom_bar(stat = "identity") + coord_flip() + #geom_density2d(aes(group=groups))+ | |
facet_grid(. ~ model) +.theme | |
print(p) | |
}, | |
delta.weights = .local<-function(obj){ # will only plot first component for each model | |
comps<-obj$total.LVs | |
ocomps<-obj$OSC.LVs | |
plot.obj<-obj$loading.weights | |
bound<-do.call("rbind",lapply(2:(length(ocomps)),function(i) | |
{ | |
out<-as.data.frame(cbind(plot.obj[[1]][,1]-plot.obj[[i]][,1],names(plot.obj[[i]][,1]),paste(comps[i]," LVs and ",ocomps[i]," OSC LVs",sep=""))) | |
colnames(out)<-c("delta_weight","variable","model") | |
out | |
})) | |
bound[,1]<-as.numeric(as.matrix(bound[,1])) | |
#theme | |
.theme<- theme( | |
axis.line = element_line(colour = 'gray', size = .75), | |
panel.background = element_blank(), | |
legend.position = "none", | |
plot.background = element_blank() | |
) | |
#make plot | |
p<-ggplot(data=bound, aes(x=variable,y=delta_weight, fill=variable)) + geom_bar(stat = "identity") + coord_flip() + #geom_density2d(aes(group=groups))+ | |
facet_grid(. ~ model) +.theme | |
print(p) | |
} | |
) | |
.local(obj) | |
} | |
#now import data and Y using which ever way you preffer | |
# I use devium (https://github.com/dgrapov/devium) | |
# now format and scale data for modeling | |
imported.data<-data# copied from clipboard | |
pls.data<-data.frame(matrix(as.numeric(as.matrix(imported.data)),nrow(imported.data),ncol(imported.data))) # make sure its numeric | |
dimnames(pls.data)<-dimnames(imported.data) | |
#Y (object to predict) | |
pls.y<-as.numeric(as.character(unlist(Y))) # Disease_Status | |
#auto scaling of data | |
scaled.data<-as.data.frame(scale(pls.data,scale=apply(pls.data,2,sd),center=colMeans(pls.data))) | |
#compare a model with 0 or 1 OSC components and 10 total components | |
# use SIMPLS algorithm for calculating components and leave-one-out internal cross-validation for parameter estimation | |
mods<-OSC.correction(pls.y,pls.data,comp=10,OSC.comp=1,validation = "LOO",methods="oscorespls") | |
#visualize predictive error for trainning data (RMSEP) | |
plot.OSC.results(mods,plot="RMSEP",groups=groups) | |
# based on this we can see that 2 copmponents minimize the error for both 0 or 1 OSC models | |
# recalculate OSC model using 1 OSC and 2 total components ( 1 non - OSC) | |
mods<-OSC.correction(pls.y,pls.data,comp=2,OSC.comp=1,validation = "LOO",methods="oscorespls") | |
#visualize scores comparing 0 and 1 OSC models | |
#I set groups = pls.y to display polygons around each class | |
plot.OSC.results(mods,plot="delta.weights",groups=pls.y) | |
#now compare model variable loadings between 0 and 1 OSC models | |
plot.OSC.results(mods,plot="loadings") | |
#finally identify changes (delta) in variable model weights (beta weights) between 0 and 1 OSC models | |
plot.OSC.results(mods,plot="delta.weights") | |
Many thanks for your code. It is very useful.
I only have one doubt and it is regarding the way that you calculate t_ortho (t.ortho<- as.matrix(pls.data) %% w.ortho). I wonder if this new t_ortho, what I understand, should be calculated on the corrected signal or previous residuals (as.matrix(data) %% w.ortho), or I'm wrong? Thank you again
The code is adapted from the one shown below.
See link above section: 11.4 Orthogonal Signal Correction and OPLS pg: 267
> gasolineSC <- gasoline
> gasolineSC$NIR <-
+ scale(gasolineSC$NIR, scale = FALSE,
+ center = colMeans(gasolineSC$NIR[gas.odd, ]))
> gasolineSC.pls <- plsr(octane ˜ ., data = gasolineSC, ncomp = 5,
+ subset = gas.odd, validation = "LOO")
> ww <- gasolineSC.pls$loading.weights[, 1]
> pp <- gasolineSC.pls$loadings[, 1]
> w.ortho <- pp - c(crossprod(ww, pp)/crossprod(ww)) * ww
> t.ortho <- gasolineSC$NIR[gas.odd, ] %*% w.ortho
> p.ortho <- crossprod(gasolineSC$NIR[gas.odd, ], t.ortho) /
+ c(crossprod(t.ortho))
> Xcorr <- gasolineSC$NIR[gas.odd, ] - tcrossprod(t.ortho, p.ortho)
# Next, a new PLS model is created using the corrected data matrix:
> gasolineSC.osc1 <- data.frame(octane = gasolineSC$octane[gas.odd],
+ NIR = Xcorr)
> gasolineSC.opls1 <- plsr(octane ˜ ., data = gasolineSC.osc1,
+ ncomp = 5, validation = "LOO")
# Removal of a second OSC component proceeds along the same lines:
> pp2 <- gasolineSC.opls1$loadings[, 1]
> w.ortho2 <- pp2 - c(crossprod(ww, pp2)/crossprod(ww)) * ww
> t.ortho2 <- Xcorr %*% w.ortho2
> p.ortho2 <- crossprod(Xcorr, t.ortho2) / c(crossprod(t.ortho2))
> Xcorr2 <- Xcorr - tcrossprod(t.ortho2, p.ortho2)
> gasolineSC.osc2 <- data.frame(octane = gasolineSC$octane[gas.odd],
+ NIR = Xcorr2)
> gasolineSC.opls2 <- plsr(octane ˜ ., data = gasolineSC.osc2,
+ ncomp = 5, validation = "LOO")
Its been a while since I looked at this code. A good unit test would be to compare an equivalent example data using my and the original code. Looking at this now it does look like a bug not to iterate the data used for that step in the loop. Let me know what you find :)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The OSC correction algorithm proceeds as PLS --> OSC --> iterate for all latent variables. If you just want PLS you could try setting OSC.comp=0 or using the pls library directly.
Depending on what you want the main output, for each component, contains:
RMSEP - root mean squared error of projection
scores - sample projection
loadings - variable projection
loading.weights - variable coefficient weights
data - OSC corrected data
scaled.data<-as.data.frame(scale(pls.data,scale=apply(pls.data,2,sd),center=colMeans(pls.data))). Is it also like a SVN normalization? Does it redundant with my corrections?
This step is often done before multivariate analysis (for each variable, subtract the mean and divide by the standard deviation). This is helpful for comparing differences between variables with large differences in magnitude e.g. compare a variable with range 1-10 to one with 100-1000. You can skip this step, but then the first 2 model dimensions will be overwhelmed by it.