Created
May 24, 2020 09:46
-
-
Save ATpoint/129fb17a6c0ed07a4e63bdb943a6d37f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Script to create a transcriptome fasta file that contains both spliced and unspliced | |
## transcripts based on a GTF file from GENCODE. | |
## Also see the preprint https://doi.org/10.1101/2020.03.13.990069 from Charlotte Soneson on the matter of | |
## how preprocessing influences the outcome of velocity analysis. | |
## The below steps are inspired by the preprint and personal correspondence with CS. | |
ANALYSISDIR="./" | |
dir.create(paste0(ANALYSISDIR, "/Alevin_IDX")) | |
## eisaR: devtools::install_github("https://github.com/fmicompbio/eisaR", ref = "0de32247a0640336b73a92a52c8ad993c1012024") | |
library(eisaR) | |
library(seqinr) | |
library(Biostrings) | |
library(rtracklayer) | |
library(RCurl) | |
library(curl) | |
library(BSgenome.Mmusculus.UCSC.mm10) | |
################################################################################################################################ | |
## Get the GTF from GENCODE (we use v23 to be consistent with our bulk RNA-seq data): | |
URL.GTF = "ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M23/gencode.vM23.annotation.gtf.gz" | |
con = curl(URL.GTF, "r", new_handle(dirlistonly=TRUE)) | |
## First we extract the spliced transcripts | |
mm10.tx <- extractTxSeqs(gtf = URL.GTF, | |
type = "spliced", | |
genome = BSgenome.Mmusculus.UCSC.mm10) | |
## now extract introns with eisaR::extractIntronSeqs. | |
## flanklength is the read length of the R2 read, so 98 based on the 10X recommendations | |
mm10.introns <- extractIntronSeqs(gtf = URL.GTF, | |
genome = BSgenome.Mmusculus.UCSC.mm10, | |
type = "collapse", | |
flanklength = 98 - 1) | |
## We remove the version info from the Ensembl transcript names and | |
## add the "_Intron" suffix to separate it from the spliced transcripts | |
names(mm10.tx) <- gsub("\\..*", "", names(mm10.tx)) | |
names(mm10.introns) <- sapply(strsplit(names(mm10.introns), split = "-"), function(x){ | |
return(paste( gsub("\\..*", "", x[1]), gsub("I", "Intron", x[2]), sep = "_" )) | |
}) | |
## combine into one fasta file, write to disk, file is then ready for indexing with Salmon/Alevin | |
writeXStringSet(x = c(mm10.tx, mm10.introns), compress = FALSE, | |
filepath = paste0(ANALYSISDIR, "/Alevin_IDX/tx_withIntrons.fa")) | |
## Make a tx2gene map (for tximport) and | |
## a list of mitochondrial and rRNA gene | |
gtf <- import(con = URL.GTF, format = "GTF") | |
## select transcripts: | |
gtf.tx <- gtf[gtf$type == "transcript",] | |
## spliced transcripts. we make gene names of the form ENSMUSG_MGI, | |
## so a name where both the Ensembl gene id and the MGI gene name are included | |
spliced.tx2gene <- data.frame(Tx = gsub("\\..*", "", gtf.tx$transcript_id), | |
Gene = paste(gsub("\\..*", "", gtf.tx$gene_id), gtf.tx$gene_name, sep="_"), | |
stringsAsFactors = FALSE) | |
## unspliced transcripts: | |
unspliced.tx2gene <- data.frame(Tx = names(mm10.introns), | |
Gene = sapply(strsplit(names(mm10.introns), split = "_Intron"), function(x)paste0(x[1]))) | |
## combine: | |
unspliced.tx2gene <- merge(x = unspliced.tx2gene, | |
y = data.frame(spliced.tx2gene, | |
Tmp = sapply(strsplit(spliced.tx2gene$Gene, split = "_"), function(x)x[1])), | |
by.x = "Gene", by.y = "Tmp") | |
## final tx2gene data.frame: | |
tx2gene <- rbind(spliced.tx2gene, data.frame(Tx = unspliced.tx2gene$Tx.x, | |
Gene = paste0(unspliced.tx2gene$Gene.y, "_unspliced"))) | |
## to disk: | |
write.table(x = tx2gene, sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE, | |
file = paste0(ANALYSISDIR, "/Alevin_IDX/tgmap.txt")) | |
## Get the mtGenes and rRNA: | |
tmp.mt <- gtf[seqnames(gtf) == "chrM",] | |
tmp.mt <- unique(paste(gsub("\\..*", "", tmp.mt$gene_id), tmp.mt$gene_name, sep="_")) | |
tmp.rR <- gtf[gtf$gene_type == "rRNA",] | |
tmp.rR <- unique(paste(gsub("\\..*", "", tmp.rR$gene_id), tmp.rR$gene_name, sep="_")) | |
## Save as RDS: | |
if(!dir.exists(paste0(ANALYSISDIR, "/RDS"))) dir.create(paste0(ANALYSISDIR, "/RDS")) | |
saveRDS(object = list(mt_Genes = tmp.mt, | |
rRNA_Genes = tmp.rR), | |
file = (paste0(ANALYSISDIR, "/RDS/mt_rRNA_Genes.rds"))) | |
write.table(x = data.frame(mtGenes = tmp.mt), | |
sep = "\n", col.names = FALSE, row.names = FALSE, quote = FALSE, | |
file = paste0(ANALYSISDIR, "/Alevin_IDX/mtGenes.txt")) | |
write.table(x = data.frame(rRNA = tmp.rR), | |
sep = "\n", col.names = FALSE, row.names = FALSE, quote = FALSE, | |
file = paste0(ANALYSISDIR, "/Alevin_IDX/rrnaGenes.txt")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment