Created
September 18, 2012 15:23
-
-
Save jwbargsten/3743732 to your computer and use it in GitHub Desktop.
Create TranscriptDb objects from GFF3 files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## source: http://permalink.gmane.org/gmane.science.biology.informatics.conductor/40247 | |
require(plyr) | |
require(rtracklayer) | |
require(GenomicRanges) | |
require(GenomicFeatures) | |
## determine the rank for all exons of a transcript | |
## the rank is reversed, if the strand is negative | |
.exon_rank_order <- function(exon.starts, strands) { order(exon.starts, decreasing=strands[1] == '-') } | |
makeTranscriptDbFromGFF3 <- function(file, allow.unknown.strand=TRUE, metadata=data.frame(name="genome", value="unknown_species")) { | |
data <- import(file, format="gff3", asRangedData=FALSE) | |
## omit everything with unknown strand | |
if(!allow.unknown.strand) | |
data <- data[which( strand(data) %in% c('+','-'))] | |
## get the corresponding subsets | |
data.mrna <- data[values(data)$type == 'mRNA'] | |
data.exon <- data[values(data)$type == 'exon'] | |
## build the transcripts data frame | |
transcripts <- data.frame( | |
tx_id=seq(along=data.mrna), | |
tx_name=as.character(values(data.mrna)$ID), | |
tx_chrom=as.character(seqnames(data.mrna)), | |
tx_strand=as.character(strand(data.mrna)), | |
tx_start=min(ranges(data.mrna)), | |
tx_end=max(ranges(data.mrna)) | |
) | |
## build the basic structure for the splicings data frame | |
## take care of exons with more than one parent (**1**) | |
exon.parentList <- as.list(values(data.exon)$Parent) | |
exon.parentList.idxmap <- unlist(lapply(seq(along=exon.parentList), function(x) { rep(x, length(exon.parentList[[x]])) })) | |
exons.raw <- data.exon[exon.parentList.idxmap] | |
exon.tmp <- data.frame( | |
exon_name=as.character(values(exons.raw)$ID), | |
exon_strand=as.character(strand(exons.raw)), | |
exon_start=start(ranges(exons.raw)), | |
exon_end=end(ranges(exons.raw)), | |
exon_parent=unlist(exon.parentList), | |
stringsAsFactors=FALSE | |
) | |
## determine the corresponding transcript indices ... | |
exon.tmp <- transform(exon.tmp, tx_idx=match(exon_parent, transcripts$tx_name)) | |
## ... and use them to assign the transcript ids and the strands | |
exon.tmp <- transform(exon.tmp, tx_id=transcripts$tx_id[tx_idx], tx_strand=transcripts$tx_strand[tx_idx]) | |
## every exon rank is based on the start coord. and the strand of the corresponding transcript (the parent feature) | |
exon.tmp <- ddply(exon.tmp, .(exon_strand, tx_id), transform, exon_rank = .exon_rank_order(exon_start, tx_strand)) | |
## extract the relevant columns for the splicings data frame | |
splicings <- exon.tmp[c("tx_id", "exon_rank", "exon_strand", "exon_start", "exon_end")] | |
## just bluntly assume that transcripts have not more than one parent (the gene), hence use "unlist()" directly | |
## and skip the stuff done in (**1**) | |
genes <- data.frame(tx_id=seq(along=data.mrna), gene_id=unlist(values(data.mrna)$Parent)) | |
## extract chromosome names | |
chrom <- levels(seqnames(data)) | |
db <- makeTranscriptDb( | |
transcripts, | |
splicings, | |
genes=genes, | |
metadata=metadata, | |
chrominfo=data.frame(chrom=chrom, length=rep(NA, length(chrom))) | |
) | |
db | |
} | |
## TEST IT | |
file <- system.file("tests", "genes.gff3",package="rtracklayer") | |
txdb <- makeTranscriptDbFromGFF3(file) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment