Skip to content

Instantly share code, notes, and snippets.

@aaronwolen
Created September 12, 2012 17:21
Show Gist options
  • Save aaronwolen/3708334 to your computer and use it in GitHub Desktop.
Save aaronwolen/3708334 to your computer and use it in GitHub Desktop.
Generate data.frame of feature annotations using Bioconductor
#' Generate data.frame of feature annotations
#'
#' Use bioconductor annotation packages to create a data.frame of feature/probe
#' annotations.
#'
#' @param chip character string identifying chip model (e.g., "illuminaHumanv2")
#' @param features optional character vector of chip features (i.e., probeset ids)
#' @param vars character vector of desired annotations. These must match objects
#' provided by the annotation package (e.g., "CHR")
#' @param duplicate.values how should duplicate values be handled? The default
#' approach is to "remove" them, leaving only the first value. Setting this to
#' "keep" will cause duplicates to be returned as a "|" delimited string.
#'
#' @examples
#' fdata.exp <- annotate_chip(chip = "illuminaHumanv2",
#' vars = c("symbol", "chr", "chrloc", "chrlocend"))
annotate_chip <- function(chip, features, vars, duplicate.values = "remove") {
duplicate.values <- match.arg(duplicate.values, c("remove", "keep"))
# Check for annotation package
pkg <- paste(sub("\\.db$", "", chip), "db", sep = ".")
if(suppressWarnings(!require(pkg, quietly = TRUE, character.only = TRUE))) {
stop(paste(pkg, "annotation package must be installed."), call. = FALSE)
}
# Available annotation data
available <- ls(paste("package", pkg, sep = ":"))
available <- available[!available %in% c(chip, pkg) & !grepl("_db", available)]
# Error message if vars is unset or invalid
var_error <- function(available) {
rds <- tools::Rd_db(pkg)
names(rds) <- sub("\\.Rd$", "", basename(names(rds)))
rds <- rds[names(rds) %in% available]
names(rds) <- sub(chip, "", names(rds))
details <- lapply(rds, tools:::.Rd_get_metadata, "title")
details <- unlist(lapply(details, paste, collapse = "\n\t\t"))
cat(paste("\n", names(rds), "\n\t\t", details, "\n", sep = ""), "\n")
stop("The vars argument requires a vector containing one or more of the ",
"variables listed above.\n", call. = FALSE)
}
if(missing(vars)) {
var_error(available)
}
if(missing(features)) {
features <- keys(eval(parse(text = pkg)))
}
# Format requests
requests <- paste(chip, toupper(sub(chip, "", vars)), sep = "")
# Check what's available
unavailable <- setdiff(requests, available)
requests <- setdiff(requests, unavailable)
# Nice labels
annot.labels <- tolower(gsub(chip, "", requests))
names(annot.labels) <- requests
if(length(requests) == 0) {
var_error(available)
}
if(length(unavailable) > 0) {
warning("The following variables are unavailable in ", pkg,
paste(":\n", unavailable, collapse = "\n"), call. = FALSE)
}
annots <- sapply(requests, simplify = FALSE, function(req)
mget(features, eval(parse(text = req)), ifnotfound = NA))
# Remove duplicate entries for each probe-set
if(duplicate.values == "remove") {
annots <- lapply(annots, function(y)
lapply(y, function(z) z[1]))
} else {
annots <- lapply(annots, function(y)
lapply(y, function(z) paste(z, collapse = " | ")))
}
# Covnert to data.frame
fdata <- data.frame(do.call("cbind", lapply(annots, unlist)),
stringsAsFactors = F)
names(fdata) <- annot.labels[names(fdata)]
# Convert columns to proper type
for(c in names(fdata)) {
fdata[, c] <- type.convert(fdata[, c])
}
# Extrapolate chromosome strand from location
if( "chrloc" %in% names(fdata)) {
if( any(fdata$chrloc < 0) ) {
fdata$chrstrand <- ifelse(fdata$chrloc > 0, "+", "-")
fdata$chrloc <- abs(fdata$chrloc)
}
}
return(fdata)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment