Created
July 20, 2012 14:24
-
-
Save haxney/3151005 to your computer and use it in GitHub Desktop.
Testing speed improvement in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env Rscript | |
##' Generate a Selected Ion Chromatogram (SIC) from a run in a database. | |
##' | |
##' Retrieves spectra from a MySQL database and generates an SIC using the | |
##' specified parameters. Database connection information *must* be passed in | |
##' using the following environment variables: | |
##' | |
##' - MZDB_HOST: Computer on which the database is running. | |
##' - MZDB_USER: Username for authentication. | |
##' - MZDB_PW: Password for user. | |
##' - MZDB_DB: Database to which to connect. | |
##' | |
##' There are no default values for these settings. | |
##' | |
##' Generating the SIC requires two inputs: the run and the mass window. The run | |
##' can be identified by its ID in the database (--run-id), its name as it | |
##' appears in the mzML (--mzml-name), or the full file path of the raw file | |
##' (--raw-file-path). | |
##' | |
##' The mass window can be defined as a centerpoint (--mass) and window size | |
##' (--mass-window) or as a lower and upper bound (--mass-lower and | |
##' --mass-upper, respectively). | |
##' | |
##' All mass units are in milli-mass units (MMU). | |
library(optparse) | |
library(RODBC) | |
library(plyr) | |
library(bitops) | |
##' Tries to resolve a run ID from each of the arguments. | |
##' | |
##' If `run_id` is given, then that is returned without any further | |
##' investigation. Otherwise, the database is searched for a run which matches | |
##' the `mzml_name` or `raw_file_path` given. | |
##' | |
##' If none match, NULL is returned. | |
GetRunId <- function(conn, run_id=NULL, mzml_name=NULL, raw_file_path=NULL) { | |
## Create a character vector of `(varname, shQuote(value))` if the value of | |
## `varname` is a character vector, otherwise return NULL. | |
BuildCondition <- function(varname) { | |
if (is.character(get(varname))) { | |
c(varname, shQuote(get(varname))) | |
} else { | |
NULL | |
} | |
} | |
if (!is.null(run_id) && is.integer(as.integer(run_id))) { | |
return(run_id) | |
} | |
if (is.character(mzml_name) || is.character(raw_file_path)) { | |
## Create a data.frame using BuildCondition() | |
conditions <- ldply(c("mzml_name", "raw_file_path"), BuildCondition) | |
## Join the conditions together, separating multiple conditions with "OR" | |
whereClause <- do.call("paste", c(conditions, sep=" = ", collapse=" OR ")) | |
return(sqlQuery(conn, paste('SELECT id FROM runs WHERE ', whereClause))) | |
} | |
return(NULL) | |
} | |
##' Returns a 2-element list of the lower and upper bounds of the mass window. | |
##' | |
##' The caller should specify the mass window either by giving a centerpoint and | |
##' width or lower and upper bounds. This function returns a vector of lower and | |
##' upper bounds, or NULL if no window was provided. | |
GetMassWindow <- function(center, window, lower, upper) { | |
if (is.numeric(center) && is.numeric(window)) { | |
return(list(lower=center - window, upper=center + window)) | |
} else if (is.numeric(lower) && is.numeric(upper)) { | |
return(c(lower=lower, upper=upper)) | |
} | |
return(NULL) | |
} | |
option_list <- | |
list( | |
make_option(c('-v', '--verbose'), action='store_true', default=FALSE, | |
help='Print extra output'), | |
make_option(c('-i', '--run-id'), type='integer', metavar='RUN_ID', | |
help='Database ID of the run to use. This is the value of the `id` column in the `runs` table.'), | |
make_option(c('-n', '--mzml-name'), type='character', metavar='NAME', | |
help='mzML ID of the run to use. Corresponds to the `mzml_name` column in the `runs` table.\ | |
This is typically the name of the file with the suffix removed.'), | |
make_option(c('-r', '--raw-file-path'), type='character', metavar='PATH', | |
help='Absolute path of the original raw file.'), | |
make_option(c('-c', '--mass-center'), type='double', metavar='CENTER', | |
help='Centerpoint of mass window to analyze'), | |
make_option(c('-w', '--mass-window'), type='double', metavar='WINDOW-WIDTH', | |
help='Width of mass window. Will select masses within "--mass" +/- "--window".'), | |
make_option(c('-m', '--mass-lower'), type='double', metavar='LOWER', | |
help='Lower bound of the mass window'), | |
make_option(c('-M', '--mass-upper'), type='double', metavar='UPPER', | |
help='Upper bound of the mass window'), | |
make_option(c('-b', '--batch-size'), type='integer', metavar='BATCHSIZE', default=100, | |
help='The number of records to fetch and process at a time.') | |
) | |
opts <- parse_args(OptionParser(option_list=option_list)) | |
verbose <- opts$verbose | |
batchSize <- opts[['batch-size']] | |
## MySQL ODBC connection parameters | |
FLAG_NO_CACHE <- 1048576 | |
FLAG_FORWARD_CURSOR <- 2097152 | |
connection_string <- | |
sprintf(paste("DRIVER={%s};", | |
"SERVER=%s;", | |
"DATABASE=%s;", | |
"USER=%s;", | |
"PASSWORD=%s;", | |
"PREFETCH=%s;", | |
"OPTION=%d;", | |
sep=""), | |
"MySQL", | |
Sys.getenv("MZDB_HOST"), | |
Sys.getenv("MZDB_DB"), | |
Sys.getenv('MZDB_USER'), | |
Sys.getenv('MZDB_PW'), | |
batchSize, | |
bitOr(FLAG_NO_CACHE, FLAG_FORWARD_CURSOR)) | |
## Driver fails to fetch results unless `rows_at_time` is 1. | |
conn <- odbcDriverConnect(connection_string, rows_at_time=1) | |
run_id <- GetRunId(conn, opts[['run-id']], opts[['mzml_name']], opts[['raw-file-path']]) | |
if (!is.integer(run_id)) { | |
stop('Cannot find a run from the options given.') | |
} | |
window_bounds <- GetMassWindow(opts[['mass-center']], | |
opts[['mass-window']], | |
opts[['mass-lower']], | |
opts[['mass-upper']]) | |
if (is.null(window_bounds)) { | |
stop('Invalid window bounds') | |
} | |
query <- sprintf(paste('SELECT array_length, mz_array, intensity_array FROM spectra', | |
'WHERE run_id = %d ORDER BY run_index ASC'), | |
run_id) | |
if (odbcQuery(conn, query) < 1) { | |
stop(paste("Query failed. Error:", odbcGetErrMsg(conn))) | |
} | |
#res <- odbcFetchRows(conn, max=batchSize, buffsize=batchSize) | |
##' Decode `mz_array` and `intensity_array` into a matrix. | |
##' | |
##' @param array_length | |
##' Number of items in the two arrays. | |
##' @param mz_array | |
##' Raw binary data containing `array_length` little-endian-encoded doubles. | |
##' @param mz_array | |
##' Raw binary data containing `array_length` little-endian-encoded doubles. | |
##' | |
##' @return | |
##' Two-dimensional matrix with columns `mz_array` and `intensity_array`. | |
DecodeSpectrum <- function(array_length, mz_array, intensity_array) { | |
sapply(list(mz_array=mz_array, intensity_array=intensity_array), | |
readBin, | |
what="double", | |
endian="little", | |
n=array_length, USE.NAMES=FALSE) | |
} | |
##' Sums up the intensities of the datapoints within the mass window | |
SumInWindow <- function(spectrum, lower, upper) { | |
sum(spectrum[spectrum[,1] > lower & spectrum[,1] < upper, 2]) | |
} | |
##' Process a list of spectra. | |
##' | |
##' Typically, this will receive a shortened list; not the full spectrum. | |
ProcessSegment <- function(spectra, window_bounds) { | |
## Decode a single spectrum and sum the intensities within the window. | |
SumDecode <- function (lower, upper, ...) { | |
SumInWindow(DecodeSpectrum(...), lower, upper) | |
} | |
do.call("mapply", c(SumDecode, window_bounds, spectra)) | |
} | |
##' Repeatedly fetch a segment from `conn` and parse the results. | |
ProcessAllSegments <- function(conn, window_bounds) { | |
nextSeg <- function() odbcFetchRows(conn, max=batchSize, buffsize=batchSize) | |
while ((res <- nextSeg())$stat == 1 && res$data[[1]] > 0) { | |
ProcessSegment(res$data, window_bounds) | |
} | |
} | |
Rprof() | |
ProcessAllSegments(conn, window_bounds) | |
Rprof(NULL) | |
summaryRprof() | |
odbcClose(conn) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment