Created
March 27, 2014 20:46
-
-
Save peterk87/9818256 to your computer and use it in GitHub Desktop.
R: Get SNP distance matrix from multiple MSAs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #! /usr/bin/Rscript --vanilla | |
| library(getopt) | |
| spec <- matrix(c( | |
| 'msa_dir_path','d',1,'character','MSA directory path (required)' | |
| ,'msa_file_ext','e',2,'character','MSA file extension (optional; default: "aln")' | |
| ,'out','o',2,'character','Output core SNP matrix CSV filename (optional; default: "core_distance_matrix.csv")' | |
| ,'n_cores','c',2,'integer','Number of cores to use for computation (optional; default: 2)' | |
| ,'dna_distance_model','m',2,'character','DNA distance model (default: "N"). | |
| Can be one of the following: | |
| "raw" | |
| "N" | |
| "TS" | |
| "TV" | |
| "JC69" | |
| "K80" | |
| "F81" | |
| "K81" | |
| "F84" | |
| "BH87" | |
| "T92" | |
| "TN93" | |
| "GG95" | |
| "logdet" | |
| "paralin" | |
| "indel" | |
| "indelblock" | |
| See ape::dist.dna help for more details.' | |
| ,'help','h',0,'logical','this help' | |
| ) | |
| ,ncol=5 | |
| ,byrow=TRUE) | |
| opt <- getopt(spec) | |
| # if help selected or MSA directory path not specified, show help and exit | |
| if(!is.null(opt$help) || is.null(opt$msa_dir_path)) | |
| { | |
| cat('Get core SNP matrix from multiple Multiple Sequence Alignments (MSA) | |
| This script requires the following libraries: | |
| - ape | |
| - snow | |
| - getopt | |
| \n') | |
| cat(paste(getopt(spec, usage=TRUE))) | |
| q(status=1) | |
| } | |
| library(snow, quietly=TRUE, warn.conflicts=FALSE) | |
| log <- function(...) | |
| { | |
| cat(timestamp(prefix='[',suffix='] ::: ', quiet=TRUE), ... , '\n') | |
| } | |
| # set defaults for parameters | |
| if(is.null(opt$msa_file_ext)) { opt$msa_file_ext <- 'aln' } | |
| if(is.null(opt$out)) { opt$out <- 'core_distance_matrix.csv' } | |
| if(is.null(opt$n_cores)) { opt$n_cores <- 2 } | |
| if(is.null(opt$dna_distance_model)) { opt$dna_distance_model <- 'N' } | |
| log('Getting core SNP matrix with the following parameters:') | |
| log('MSA directory:', opt$msa_dir_path) | |
| log('MSA file extension:', opt$msa_file_ext) | |
| log('DNA distance model:', opt$dna_distance_model) | |
| log('Number of cores:', opt$n_cores) | |
| log('Distance matrix ouput filename:', opt$out) | |
| MSA_FILE_REGEX_PATTERN <- paste0('\\.', opt$msa_file_ext, '$') | |
| if(opt$n_cores > 1) | |
| { | |
| log('Setting up', opt$n_cores,'cluster nodes') | |
| cl <- makeSOCKcluster(names=rep('localhost', opt$n_cores)) | |
| log('Cluster nodes setup!') | |
| } | |
| alns <- list.files(path=opt$msa_dir_path, pattern=MSA_FILE_REGEX_PATTERN, full.name=TRUE) | |
| log('Retrieved', length(alns), 'MSAs from', opt$msa_dir_path) | |
| dna_distance_model <- opt$dna_distance_model | |
| if(opt$n_cores > 1){ | |
| # Load ape library on each cluster node | |
| log('Loading ape library on each cluster node') | |
| invisible(clusterCall(cl, fun=function() library(ape))) | |
| log('Done! ape library loaded on each cluster node') | |
| # use parallel list apply to read each MSA and compute a pairwise distance | |
| # matrix for each using the 'N' model (number of non-gap SNPs) | |
| log('Setting DNA distance model', dna_distance_model,'on each cluster node') | |
| clusterExport(cl=cl, list='dna_distance_model') | |
| log('Done! DNA distance model', dna_distance_model,'set on each cluster node') | |
| log('Computing distance matrices in parallel for each of', length(alns),'MSAs') | |
| dms <- parLapply(cl, alns, function(aln) as.matrix(dist.dna(read.dna(aln, format='fasta'), model=dna_distance_model))) | |
| log('Done! Distance matrices computed!') | |
| }else{ | |
| library(ape) | |
| log('Computing distance matrices using 1 core for each of', length(alns),'MSAs') | |
| dms <- lapply(alns, function(aln) as.matrix(dist.dna(read.dna(aln, format='fasta'), model=dna_distance_model))) | |
| log( | |
| 'Done! Distance matrices computed!') | |
| } | |
| # the order of column/row names cannot be guaranteed between all matrices so | |
| # it's necessary to order all matrices the same | |
| # get the row names for the first distance matrix in the list of matrices | |
| log('Reordering distance matrices') | |
| dm1 <- dms[[1]] | |
| row_names_dm1 <- rownames(dm1) | |
| # get a list of matrices that have all been ordered in the same way | |
| dms_ordered <- lapply(dms, function(x) x[row_names_dm1, row_names_dm1]) | |
| log('Done! Distance matrices reordered') | |
| # define a general Reduce addition function | |
| add <- function(x) Reduce("+", x) | |
| # use the add reduce function to add all of the distance matrices together | |
| log('Adding distance matrices together') | |
| dm <- add(dms_ordered) | |
| log('Done! Distance matrices added together') | |
| # output the distance matrix as a CSV | |
| write.csv(dm, opt$out) | |
| log('Distance matrix written to', opt$out) | |
| log('Done!') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment