Created
January 10, 2018 21:53
-
-
Save dgrapov/cdae87fc973f7f1bb7108c2f261d8344 to your computer and use it in GitHub Desktop.
fast (?) implementations of tanimoto distance calculations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' @title fast_tanimoto | |
#' @param mat matrix or data frame of numeric values | |
#' @param output 'matrix' (default) or 'edge list' (non-redundant and undirected) | |
#' @param progress TRUE, show progress | |
#' @imports reshape2 | |
fast_tanimoto<-function(mat,output='matrix',progress=TRUE){ | |
mat[is.na(mat)]<-0 | |
#scoring function | |
score<-function(x){sum(x==2)/sum(x>0)} | |
if (progress == TRUE){ pb <- txtProgressBar(min = 0, max = nrow(mat), style = 3)} else {pb<-NULL} | |
out<-lapply(1:nrow(mat), function(i){ | |
tmp.mat<-sweep(mat,2,as.numeric(mat[i,]),"+") | |
if (progress == TRUE){setTxtProgressBar(pb, i)} | |
apply(tmp.mat,1,score) | |
}) | |
if (progress == TRUE){close(pb)} | |
res<-do.call("rbind",out) | |
rownames(res)<-rownames(mat) | |
if(output=='edge list'){ | |
res[upper.tri(res,diag=TRUE)]<-NA | |
na.omit(melt(res)) | |
} else { | |
return(res) | |
} | |
} | |
#' @title fast_tanimoto_par | |
#' @description parallel implementation of \code{\link{fast_tanimoto}} | |
#' @param mat matrix or data frame of numeric values | |
#' @param output 'matrix' (default) or 'edge list' (non-redundant and undirected) | |
#' @param progress TRUE, print progress to 'log.txt' | |
#' @imports reshape2, foreach, doMC | |
fast_tanimoto_par<-function(mat,output='matrix'){ | |
mat[is.na(mat)]<-0 | |
#scoring function | |
score<-function(x){sum(x==2)/sum(x>0)} | |
sink(paste0(getwd(),"/log.txt"), append=TRUE) | |
out<-foreach(i=1:nrow(mat)) %dopar% { | |
sink(paste0(getwd(),"/log.txt"), append=TRUE) | |
cat(paste("Starting iteration",i,"\n")) | |
sink() | |
tmp.mat<-sweep(mat,2,as.numeric(mat[i,]),"+") | |
apply(tmp.mat,1,score) | |
} | |
res<-do.call("rbind",out) #would have to convert each element to a data frame to use faster data.table::rbindlist | |
rownames(res)<-rownames(mat) | |
if(output=='edge list'){ | |
res[upper.tri(res,diag=TRUE)]<-NA | |
na.omit(melt(res)) | |
} else { | |
return(res) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment