Skip to content

Instantly share code, notes, and snippets.

@peterk87
Created March 27, 2014 20:46
Show Gist options
  • Select an option

  • Save peterk87/9818256 to your computer and use it in GitHub Desktop.

Select an option

Save peterk87/9818256 to your computer and use it in GitHub Desktop.
R: Get SNP distance matrix from multiple MSAs
#! /usr/bin/Rscript --vanilla
library(getopt)
spec <- matrix(c(
'msa_dir_path','d',1,'character','MSA directory path (required)'
,'msa_file_ext','e',2,'character','MSA file extension (optional; default: "aln")'
,'out','o',2,'character','Output core SNP matrix CSV filename (optional; default: "core_distance_matrix.csv")'
,'n_cores','c',2,'integer','Number of cores to use for computation (optional; default: 2)'
,'dna_distance_model','m',2,'character','DNA distance model (default: "N").
Can be one of the following:
"raw"
"N"
"TS"
"TV"
"JC69"
"K80"
"F81"
"K81"
"F84"
"BH87"
"T92"
"TN93"
"GG95"
"logdet"
"paralin"
"indel"
"indelblock"
See ape::dist.dna help for more details.'
,'help','h',0,'logical','this help'
)
,ncol=5
,byrow=TRUE)
opt <- getopt(spec)
# if help selected or MSA directory path not specified, show help and exit
if(!is.null(opt$help) || is.null(opt$msa_dir_path))
{
cat('Get core SNP matrix from multiple Multiple Sequence Alignments (MSA)
This script requires the following libraries:
- ape
- snow
- getopt
\n')
cat(paste(getopt(spec, usage=TRUE)))
q(status=1)
}
library(snow, quietly=TRUE, warn.conflicts=FALSE)
log <- function(...)
{
cat(timestamp(prefix='[',suffix='] ::: ', quiet=TRUE), ... , '\n')
}
# set defaults for parameters
if(is.null(opt$msa_file_ext)) { opt$msa_file_ext <- 'aln' }
if(is.null(opt$out)) { opt$out <- 'core_distance_matrix.csv' }
if(is.null(opt$n_cores)) { opt$n_cores <- 2 }
if(is.null(opt$dna_distance_model)) { opt$dna_distance_model <- 'N' }
log('Getting core SNP matrix with the following parameters:')
log('MSA directory:', opt$msa_dir_path)
log('MSA file extension:', opt$msa_file_ext)
log('DNA distance model:', opt$dna_distance_model)
log('Number of cores:', opt$n_cores)
log('Distance matrix ouput filename:', opt$out)
MSA_FILE_REGEX_PATTERN <- paste0('\\.', opt$msa_file_ext, '$')
if(opt$n_cores > 1)
{
log('Setting up', opt$n_cores,'cluster nodes')
cl <- makeSOCKcluster(names=rep('localhost', opt$n_cores))
log('Cluster nodes setup!')
}
alns <- list.files(path=opt$msa_dir_path, pattern=MSA_FILE_REGEX_PATTERN, full.name=TRUE)
log('Retrieved', length(alns), 'MSAs from', opt$msa_dir_path)
dna_distance_model <- opt$dna_distance_model
if(opt$n_cores > 1){
# Load ape library on each cluster node
log('Loading ape library on each cluster node')
invisible(clusterCall(cl, fun=function() library(ape)))
log('Done! ape library loaded on each cluster node')
# use parallel list apply to read each MSA and compute a pairwise distance
# matrix for each using the 'N' model (number of non-gap SNPs)
log('Setting DNA distance model', dna_distance_model,'on each cluster node')
clusterExport(cl=cl, list='dna_distance_model')
log('Done! DNA distance model', dna_distance_model,'set on each cluster node')
log('Computing distance matrices in parallel for each of', length(alns),'MSAs')
dms <- parLapply(cl, alns, function(aln) as.matrix(dist.dna(read.dna(aln, format='fasta'), model=dna_distance_model)))
log('Done! Distance matrices computed!')
}else{
library(ape)
log('Computing distance matrices using 1 core for each of', length(alns),'MSAs')
dms <- lapply(alns, function(aln) as.matrix(dist.dna(read.dna(aln, format='fasta'), model=dna_distance_model)))
log(
'Done! Distance matrices computed!')
}
# the order of column/row names cannot be guaranteed between all matrices so
# it's necessary to order all matrices the same
# get the row names for the first distance matrix in the list of matrices
log('Reordering distance matrices')
dm1 <- dms[[1]]
row_names_dm1 <- rownames(dm1)
# get a list of matrices that have all been ordered in the same way
dms_ordered <- lapply(dms, function(x) x[row_names_dm1, row_names_dm1])
log('Done! Distance matrices reordered')
# define a general Reduce addition function
add <- function(x) Reduce("+", x)
# use the add reduce function to add all of the distance matrices together
log('Adding distance matrices together')
dm <- add(dms_ordered)
log('Done! Distance matrices added together')
# output the distance matrix as a CSV
write.csv(dm, opt$out)
log('Distance matrix written to', opt$out)
log('Done!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment