-
-
Save sminot/34bc815fbb9976a40af47153e84769df to your computer and use it in GitHub Desktop.
Sample R script for generating a CSV with One Codex analysis results, one column per sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/Rscript | |
# Copyright Reference Genomics, Inc. 2016 | |
# Released under the MIT License | |
# | |
# Script for fetching analysis results from the One Codex API (v0) | |
# See https://docs.onecodex.com for full documentation on the REST API | |
# | |
# Script can be run from the command line with: | |
# `Rscript fetch_ocx_analyses.R <API_KEY> <FILE> [-d DB] [-b BEGINNING_DATE -e ENDING_DATE]` | |
# | |
library(httr) | |
library(plyr) | |
library(reshape2) | |
library(docopt) | |
# 1. Get the list of all analyses for this account
#
# Note: This returns all metagenomic classification results
# for all samples and reference database permutations
#
# Args:
#   api_key: One Codex API key (used as the basic-auth username)
# Returns: the parsed JSON response — a list of analysis records
# Stops with an informative error on a non-200 response, for
# consistency with the status checks in the other fetch helpers
# (previously a failed request silently returned the error body).
ListAnalyses <- function(api_key) {
  r <- GET(url="https://app.onecodex.com/api/v0/analyses",
           authenticate(api_key, ''))
  if (status_code(r) != 200) {
    stop(paste("Not able to fetch the list of analyses (HTTP status",
               status_code(r), ")"))
  }
  return(content(r))
}
# 2. Function to fetch the results of a given analyses
# Returns a dataframe with columns including:
# name, rank, tax_id, readcount, and readcount_w_children
#
# Note: We use the /extended_table route here
# which provides a few extra fields and records,
# but is otherwise similar to the documented /table
# REST route (including entries for taxonomic nodes
# to which 0 reads map, e.g., family nodes, as well as
# species-level abundance estimates where available)
#
# Args:
#   api_key: One Codex API key
#   analysis: one analysis record from ListAnalyses() (a list with
#     $id, $sample_filename, $sample_id, $reference_name, $reference_id)
# Returns a data.frame of per-taxon rows annotated with the analysis
# metadata, or an empty data.frame() on failure / empty results.
GetAnalysisResult <- function(api_key, analysis) {
  url <- paste("https://app.onecodex.com/api/v0/analyses",
               analysis$id,
               "extended_table",
               sep='/')
  r <- GET(url=url,
           authenticate(api_key, ''))
  if (status_code(r) != 200) {
    print(paste("Not able to fetch results for", analysis$sample_filename))
    return(data.frame())
  }
  # Convert JSON content into a data.frame
  r <- content(r)
  r <- ldply(r, data.frame)
  # Guard: an empty extended table would otherwise make the scalar
  # column assignments below error ("replacement has 1 row, data has 0")
  if (nrow(r) == 0) {
    print(paste("No results returned for", analysis$sample_filename))
    return(data.frame())
  }
  # Add the analysis$sample_filename name into the data.frame,
  # along with the rest of the analysis metadata (one value recycled
  # across all rows)
  r$sample_filename <- analysis$sample_filename
  r$id <- analysis$id
  r$sample_id <- analysis$sample_id
  r$reference_name <- analysis$reference_name
  r$reference_id <- analysis$reference_id
  # Total mapped reads for this analysis (NA if any readcount is NA)
  r$n_mapped_reads <- sum(r$readcount)
  return(r)
}
# 3. Function to get the information about a given sample,
# not the analytical results, but the upload date, etc.
#
# Args:
#   api_key: One Codex API key
#   analysis: analysis record (a list with $sample_id and
#     $sample_filename)
# Returns the parsed JSON sample record as a list, or an empty
# vector (c()) when the request fails.
GetSampleInfo <- function(api_key, analysis) {
  sample_url <- paste("https://app.onecodex.com/api/v0/samples",
                      analysis$sample_id,
                      sep='/')
  response <- GET(url=sample_url,
                  authenticate(api_key, ''))
  if (status_code(response) == 200) {
    # Convert JSON content into a list
    return(content(response))
  }
  print(paste("Not able to fetch sample information for",
              analysis$sample_filename))
  return(c())
}
# 4. Function to grab all analyses and combine them into a "wide"
# dataframe (optionally filtered by filename or ID; reference database
# name or ID; or a range of dates (inclusive) )
#
# Args:
#   api_key: One Codex API key
#   reference_dbs: reference database names and/or IDs to keep;
#     an empty vector disables this filter
#   date_range: Date vector c(beginning, ending) used to filter
#     samples by upload date (inclusive); empty vector disables it
# Returns a wide data.frame: one row per taxon (name/rank/tax_id),
# one readcount_w_children column per sample_id, with per-sample
# metadata rows prepended; returns NULL when nothing matched.
GetAggregatedAnalyses <- function(api_key,
                                  reference_dbs=c("One Codex Database"),
                                  date_range=c()) {
  analysis_list <- ListAnalyses(api_key)
  # Long-format accumulator; one row per (analysis, taxon)
  analysis_results <- data.frame(stringsAsFactors=FALSE)
  for (a in analysis_list) {
    # Skip analyses for non-specified reference databases as appropriate
    # (a database matches by either its display name or its ID)
    if (length(reference_dbs) > 0 &&
        !(a$reference_name %in% reference_dbs) &&
        !(a$reference_id %in% reference_dbs)) {
      next
    }
    # Skip analyses outside of the specified date range
    # NOTE(review): if GetSampleInfo() fails it returns c(), making
    # upload_date NULL; date_in_range() would then compare against a
    # zero-length date — confirm this path cannot occur in practice
    if (length(date_range) > 0){
      sample_info <- GetSampleInfo(api_key, a)
      upload_date <- sample_info$upload_date
      if (!date_in_range(upload_date, date_range)) {
        next
      }
    }
    # Add fetched rows to analysis_results dataframe
    result <- GetAnalysisResult(api_key, a)
    if (length(result) == 0) {
      next
    }
    if (length(date_range) > 0){
      print(paste("Fetched results for", a$sample_filename, upload_date))
    } else {
      print(paste("Fetched results for", a$sample_filename))
    }
    # Keep only the columns needed for the wide table and metadata rows
    output_fields <- c('name', 'rank', 'readcount_w_children',
                       'tax_id', 'sample_filename', 'id',
                       'sample_id', 'reference_name', 'reference_id',
                       'n_mapped_reads')
    result <- subset(result, select=output_fields)
    analysis_results <- rbind(analysis_results, result)
  }
  if (nrow(analysis_results) == 0) {
    print("No results found, exiting.")
    return()
  }
  # Reshape the results into a wide table:
  # one row per (name, rank, tax_id) and one column per sample_id;
  # mean() aggregates should a cell receive multiple values
  analysis_wide <- dcast(analysis_results,
                         name + rank + tax_id ~ sample_id,
                         value.var='readcount_w_children',
                         fun.aggregate = mean)
  # Taxa absent from a sample get 0 reads rather than NA
  analysis_wide[is.na(analysis_wide)] <- 0
  # Columns 1-3 are name/rank/tax_id; everything after is a sample
  # column (assumes at least one sample column exists)
  sample_ids <- names(analysis_wide)[4:length(names(analysis_wide))]
  # Rename 'id' to the clearer 'analysis_id' for the metadata rows
  names(analysis_results)[names(analysis_results) == 'id'] <- 'analysis_id'
  # Prepend one metadata row per field; because each iteration rbinds
  # on top, the finished rows appear in REVERSE order of this vector
  for (field in c('n_mapped_reads', 'reference_id', 'reference_name',
                  'analysis_id', 'sample_filename')) {
    field_wide <- data.frame(name=c(field),
                             rank=c(''),
                             tax_id=c(''),
                             stringsAsFactors=FALSE)
    for (s in sample_ids) {
      # The field value is constant per sample, so take it from the
      # first matching long-format row
      field_wide[s] <- head(subset(analysis_results, sample_id == s), 1)[field]
    }
    analysis_wide <- rbind(field_wide, analysis_wide)
  }
  return(analysis_wide)
}
# 5. Function to convert a string date YYYY-MM-DD to a vector of numerics
# e.g. "2016-05-22" -> c(2016, 5, 22); absent fields come back as NA,
# and non-numeric fields become NA (with a coercion warning).
as_date <- function(str) {
  fields <- lapply(strsplit(str, '-')[[1]], as.numeric)
  # Flatten to a numeric vector and keep exactly three elements
  do.call(c, fields)[1:3]
}
# 6. Function to check whether a date fits the YYYY-MM-DD format
#
# Returns TRUE for strings like "2016-05-22" with a plausible year
# (> 1900), month (1-12), and day (1-31); FALSE otherwise.
#
# Fix: malformed input (e.g. "garbage" or "2016-05") used to produce
# NA fields, and `if (NA && ...)` raised "missing value where
# TRUE/FALSE needed"; now any NA field simply yields FALSE. The parse
# is inlined (equivalent to as_date()) so the check is self-contained.
is_date <- function(str) {
  # Split on '-' and coerce each field; suppress the "NAs introduced
  # by coercion" warning for non-numeric fields
  fields <- suppressWarnings(as.numeric(strsplit(str, '-')[[1]]))[1:3]
  if (any(is.na(fields))) {
    return(FALSE)
  }
  return(fields[1] > 1900 &&
         fields[2] >= 1 && fields[2] <= 12 &&
         fields[3] >= 1 && fields[3] <= 31)
}
# 7. Function to check whether a date fits into a date range (inclusive)
#
# Args:
#   date: a "YYYY-MM-DD" string (or anything as.Date accepts)
#   date_range: a Date vector; element 1 is the (optional) beginning
#     date and element 2 the (optional) ending date
# Returns TRUE when date falls inside the inclusive range.
#
# Fix: atomic vectors return NA (not NULL) when indexed past their
# length, so the original `is.null(date_range[2])` never fired for a
# one-sided range and `if (date > NA)` errored; bounds are now checked
# with length() and is.na() instead.
date_in_range <- function(date, date_range) {
  date <- as.Date(date, format = "%Y-%m-%d")
  if (length(date_range) >= 1 && !is.na(date_range[1])) {
    if (date < date_range[1]) {
      return(FALSE)
    }
  }
  if (length(date_range) >= 2 && !is.na(date_range[2])) {
    if (date > date_range[2]) {
      return(FALSE)
    }
  }
  return(TRUE)
}
# The commands below allow this script to be run from the command line
#
# docopt usage string: two positional arguments (<API_KEY>, <FILE>)
# plus an optional database filter and an optional inclusive date range.
"Usage: fetch_ocx_analyses.R <API_KEY> <FILE> [-d DB] [-b BEGINNING_DATE -e ENDING_DATE]
API_KEY specify API key
FILE specify output file
-d DB, --database DB only fetch analysis that use this database [default: One Codex Database]
-b BEGINNING_DATE, --beginning-date BEGINNING_DATE
only fetch analysis on this date or later (e.g. 2016-05-22)
-e ENDING_DATE, --ending-date ENDING_DATE
only fetch analysis on this date or earlier (e.g. 2016-05-23)
-h --help show this help text" -> doc
# Parse the command-line arguments against the usage string above
docopt(doc) -> opts
print(paste("Downloading analysis using API KEY",
            opts$API_KEY,
            "using the",
            opts$database,
            "and saving to",
            opts$FILE))
# Check to see that the dates are valid, if specified
# NOTE(review): `if (opts$b)` tests the option VALUE as a logical;
# with docopt, -b takes an argument, so the value should be a
# character string (or NULL when absent), neither of which is a
# usable condition — `!is.null(...)` may be what was intended.
# Also verify that docopt exposes the value under the keys used here
# (opts$b / opts$BEGINNING_DATE) rather than only opts$beginning_date.
if(opts$b) {
  if (is_date(opts$BEGINNING_DATE)){
    print(paste("Downloading only samples on or after: ", opts$BEGINNING_DATE))
    # Convert the validated string to a Date for range comparisons
    opts$BEGINNING_DATE<-as.Date(opts$BEGINNING_DATE, format = "%Y-%m-%d")
  } else{
    # NOTE(review): error message is missing its closing parenthesis
    stop("Must specify valid beginning date (example: 2016-05-22")
  }
}
# Same validation/conversion for the optional ending date
# (see the NOTE(review) above about opts$e / opts$ENDING_DATE keys)
if(opts$e) {
  if (is_date(opts$ENDING_DATE)){
    print(paste("Downloading only samples on or before: ", opts$ENDING_DATE))
    opts$ENDING_DATE<-as.Date(opts$ENDING_DATE, format = "%Y-%m-%d")
  } else{
    # NOTE(review): error message is missing its closing parenthesis
    stop("Must specify valid ending date (example: 2016-05-23")
  }
}
# Fetch, filter, and aggregate all analyses, then write the wide
# table out as CSV. Note: c() of Dates drops NULL elements, so a
# one-sided range arrives in date_range as a length-1 vector.
analyses <- GetAggregatedAnalyses(opts$API_KEY,
                                  reference_dbs=c(opts$database),
                                  date_range=c(opts$BEGINNING_DATE, opts$ENDING_DATE))
write.table(analyses, file=opts$FILE, sep=',', row.names=FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment