Last active
April 3, 2023 01:33
-
-
Save y9c/90da3e67ca6a31b73db7e7c7b6d77322 to your computer and use it in GitHub Desktop.
alternative polyadenylation (APA)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
source("./utils_APA.R") | |
## How ot use? | |
## | |
## Change the path of tsv and gtf file | |
## `chr_name` is the column name of chromosome in the tsv file, default is "Chromosome" | |
## `pos_name` is the column name of position in the tsv file, default is "End" | |
## `strand_name` is the column name of strand in the tsv file, default is "Strand" | |
tsv <- "./test_sites.tsv" | |
gtf <- "~/reference/feature/Homo_sapiens.GRCh38.genome.gtf" | |
parse_PAS(tsv, gtf, chr_name = "Chromosome", pos_name = "End", strand_name = "Strand") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(GenomicFeatures) | |
library(APAlyzer) | |
.annotatePASRegion <- function(pasdb, dbreigon, dfTXGeneMapping, flag) { | |
x <- unlist(dbreigon) | |
x$IndexID <- 1:length(x) | |
x$transcript_id <- names(x) | |
names(x) <- x$IndexID | |
acc2sym <- data.frame(names(x), x$transcript_id) | |
colnames(acc2sym) <- c("IndexID", "transcript_id") | |
acc2sym <- merge(acc2sym, dfTXGeneMapping, on = "transcript_id", all.x = TRUE) | |
x <- x[acc2sym$IndexID] | |
names(x) <- acc2sym$gene_id | |
dbreigon <- split(x, names(x)) | |
dbreigon <- GenomicRanges::reduce(dbreigon) | |
olp <- findOverlaps(pasdb, dbreigon) | |
if (length(olp) > 0) { | |
hit <- pasdb[queryHits(olp), ] | |
hit$LOCATION <- flag | |
hit$PASid <- paste(seqnames(hit), start(hit), strand(hit), sep = ":") | |
hit <- as.data.frame(hit, row.names=1:length(hit)) | |
} else { | |
hit <- "NoHit" | |
} | |
return(hit) | |
} | |
.annotatePAS_V2 <- function(pasdb, TXDB, dfTXGeneMapping) { | |
## a.5UTR | |
fiveUTRs <- fiveUTRsByTranscript(TXDB, use.names = TRUE) | |
fiveUTRhit <- .annotatePASRegion(pasdb, fiveUTRs, dfTXGeneMapping, "fiveUTR") | |
## b.intron | |
introns <- intronsByTranscript(TXDB, use.names = TRUE) | |
intronhit <- .annotatePASRegion(pasdb, introns, dfTXGeneMapping, "intron") | |
## c.3UTR | |
threeUTRs <- threeUTRsByTranscript(TXDB, use.names = TRUE) | |
threeUTRhit <- .annotatePASRegion(pasdb, threeUTRs, dfTXGeneMapping, "threeUTR") | |
## d.CDS | |
cds <- cdsBy(TXDB, by = "tx", use.names = TRUE) | |
cdshit <- .annotatePASRegion(pasdb, cds, dfTXGeneMapping, "coding") | |
## e.all Exons | |
exon <- exonsBy(TXDB, by = "tx", use.names = TRUE) | |
exonhit <- .annotatePASRegion(pasdb, exon, dfTXGeneMapping, "Exon") | |
## Creat header | |
dbhit <- as.data.frame(pasdb[1]) | |
dbhit$LOCATION <- "NONE" | |
dbhit$PASid <- "NONE" | |
for (hit in list(fiveUTRhit, intronhit, threeUTRhit, cdshit, exonhit)) { | |
if (class(hit) == "data.frame") { | |
dbhit <- rbind(dbhit, hit) | |
} | |
} | |
dbhit <- dbhit[dbhit$PASid != "NONE", ] | |
finaldf <- dbhit[, c("PASid", "LOCATION", "gene_id")] | |
colnames(finaldf)[3] <- "GENEID" | |
finaldf$uniID1 <- paste(finaldf$PASid, finaldf$GENEID, finaldf$LOCATION, sep = ":") | |
finaldf$uniID2 <- paste(finaldf$PASid, finaldf$GENEID, sep = ":") | |
finaldf <- finaldf[!duplicated(finaldf), ] | |
finaldf$ORDERID <- 10 | |
if (nrow(finaldf[which(finaldf$LOCATION == "fiveUTR"), ]) > 0) { | |
finaldf[which(finaldf$LOCATION == "fiveUTR"), ]$ORDERID <- 1 | |
} | |
if (nrow(finaldf[which(finaldf$LOCATION == "intron"), ]) > 0) { | |
finaldf[which(finaldf$LOCATION == "intron"), ]$ORDERID <- 2 | |
} | |
if (nrow(finaldf[which(finaldf$LOCATION == "coding"), ]) > 0) { | |
finaldf[which(finaldf$LOCATION == "coding"), ]$ORDERID <- 3 | |
} | |
if (nrow(finaldf[which(finaldf$LOCATION == "threeUTR"), ]) > 0) { | |
finaldf[which(finaldf$LOCATION == "threeUTR"), ]$ORDERID <- 4 | |
} | |
finaldf <- finaldf[with(finaldf, order(uniID2, ORDERID)), ] | |
finaldf <- finaldf[!duplicated(finaldf$uniID2), ] | |
return(finaldf) | |
} | |
.GTF2refUTRraw <- function(TXDB, finaldf) { | |
####### CDS end ############ | |
cds <- cdsBy(TXDB, "gene") | |
x <- unlist(cds) | |
cds1 <- split(x, names(x)) | |
test <- as.data.frame(cds1) | |
index <- which(test$end - test$start > 1) | |
allcds <- test[index, ] | |
indexP <- which(allcds$strand == "+") | |
indexN <- which(allcds$strand == "-") | |
allcds$cdsend <- "" | |
allcds[indexP, ]$cdsend <- allcds[indexP, ]$end | |
allcds[indexN, ]$cdsend <- allcds[indexN, ]$start | |
MAX <- aggregate(cdsend ~ group_name, allcds, max) | |
MIN <- aggregate(cdsend ~ group_name, allcds, min) | |
names(MAX)[2] <- "max" | |
names(MIN)[2] <- "min" | |
maxmin <- merge(x = MAX, y = MIN, by = "group_name") | |
allcdssub <- allcds[, c("group_name", "strand")] | |
allcdssub <- allcdssub[!duplicated(allcdssub$group_name), ] | |
allcdsminmax <- merge(x = allcdssub, y = maxmin, by = "group_name") | |
indexP <- which(allcdsminmax$strand == "+") | |
indexN <- which(allcdsminmax$strand == "-") | |
allcdsminmax$cdsend <- "" | |
allcdsminmax[indexP, ]$cdsend <- allcdsminmax[indexP, ]$max | |
allcdsminmax[indexN, ]$cdsend <- allcdsminmax[indexN, ]$min | |
names(allcdsminmax)[1] <- "GENEID" | |
names(allcdsminmax)[2] <- "Strand" | |
allcdsminmax <- allcdsminmax[, c("GENEID", "Strand", "cdsend")] | |
##### 3UTR PAS ###### | |
df3UTR <- finaldf[which(finaldf$LOCATION == "threeUTR"), ] | |
df3UTR <- merge(df3UTR, allcdsminmax[, c("GENEID", "cdsend")], by = c("GENEID")) | |
## Filter 3UTR PAS by CDS | |
df3UTR <- df3UTR[which(((df3UTR$strand == "+") & (df3UTR$PAS > df3UTR$cdsend)) | ((df3UTR$strand == "-") & (df3UTR$PAS < df3UTR$cdsend))), ] | |
df3UTR_P <- df3UTR[which(df3UTR$strand == "+"), ] | |
df3UTR_P <- df3UTR_P[with(df3UTR_P, order(GENEID, PAS)), ] | |
df3UTR_N <- df3UTR[which(df3UTR$strand == "-"), ] | |
df3UTR_N <- df3UTR_N[with(df3UTR_N, order(GENEID, -PAS)), ] | |
df3UTR <- rbind(df3UTR_P, df3UTR_N) | |
df3UTR_count <- df3UTR %>% | |
group_by(GENEID) %>% | |
mutate(col = 1:n()) %>% ## create featureID | |
mutate(count = n()) | |
df3UTR_count <- data.frame(df3UTR_count) | |
df3UTR_count$pA_type <- "M" | |
if (nrow(df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count == 1), ]) > 0) { | |
df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count == 1), ]$pA_type <- "S" | |
} | |
if (nrow(df3UTR_count[which(df3UTR_count$col == df3UTR_count$count & df3UTR_count$count > 1), ]) > 0) { | |
df3UTR_count[which(df3UTR_count$col == df3UTR_count$count & df3UTR_count$count > 1), ]$pA_type <- "L" | |
} | |
if (nrow(df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count > 1), ]) > 0) { | |
df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count > 1), ]$pA_type <- "F" | |
} | |
dfF <- df3UTR_count[df3UTR_count$pA_type == "F", ][, c("GENEID", "Chr", "strand", "PAS", "gene_name")] | |
dfL <- df3UTR_count[df3UTR_count$pA_type == "L", ][, c("GENEID", "Chr", "strand", "PAS", "gene_name")] | |
colnames(dfF) <- c("GENEID", "Chrom", "Strand", "First", "gene_symbol") | |
colnames(dfL) <- c("GENEID", "Chrom", "Strand", "Last", "gene_symbol") | |
dfFL <- merge(dfF, dfL, by = c("GENEID", "Chrom", "Strand", "gene_symbol")) | |
dfFL <- dfFL[!duplicated(dfFL$GENEID), ] | |
dfFLCDE <- merge(dfFL, allcdsminmax, by = c("GENEID", "Strand")) | |
dfFLCDE <- dfFLCDE[which(((dfFLCDE$Strand == "+") & (dfFLCDE$First > dfFLCDE$cdsend)) | ((dfFLCDE$Strand == "-") & (dfFLCDE$First < dfFLCDE$cdsend))), ] | |
dfFLCDE$First <- as.integer(dfFLCDE$First) | |
dfFLCDE$cdsend <- as.integer(dfFLCDE$cdsend) | |
dfFLCDE$distance <- dfFLCDE$First - dfFLCDE$cdsend | |
dfFLCDE[which(dfFLCDE$Strand == "-"), ]$distance <- dfFLCDE[which(dfFLCDE$Strand == "-"), ]$cdsend - dfFLCDE[which(dfFLCDE$Strand == "-"), ]$First | |
index1 <- which(dfFLCDE$Strand == "+" & dfFLCDE$distance < 100) | |
dfFLCDE[index1, ]$cdsend <- dfFLCDE[index1, ]$cdsend - (100 - dfFLCDE[index1, ]$distance) | |
index2 <- which(dfFLCDE$Strand == "-" & dfFLCDE$distance < 100) | |
dfFLCDE[index2, ]$cdsend <- dfFLCDE[index2, ]$cdsend + (100 - dfFLCDE[index2, ]$distance) | |
refUTRraw <- dfFLCDE[, c("gene_symbol", "Chrom", "Strand", "First", "Last", "cdsend")] | |
refUTRraw <- refUTRraw[!is.na(refUTRraw$gene_symbol), ] | |
return(refUTRraw) | |
} | |
.GTF2IPA <- function(EDB, TXDB, finaldf) { | |
############ intron and SS ################ | |
introns <- intronsByTranscript(TXDB, use.names = TRUE) | |
dfintrons <- as.data.frame(introns) | |
colnames(dfintrons) <- c("group", "tx_id", "Chrom", "Start", "End", "width", "Strand") | |
# tx2gene <- mcols(transcripts(EDB, columns=c("tx_id", "gene_id",'gene_name'))) | |
# colnames(tx2gene)=c("tx_id", "GENEID",'gene_symbol') | |
tx2gene <- as.data.frame(EDB[which(EDB$type == "transcript"), ])[, c("transcript_id", "gene_id", "gene_name")] | |
colnames(tx2gene) <- c("tx_id", "GENEID", "gene_symbol") | |
tx2gene <- tx2gene[!duplicated(tx2gene), ] | |
dfintrons <- merge(dfintrons, tx2gene, by = "tx_id") | |
dfintrons <- dfintrons[, c("GENEID", "gene_symbol", "Chrom", "Start", "End", "Strand", "width")] | |
dfintrons <- dfintrons[!duplicated(dfintrons), ] | |
## ANNO 5SS and 3SS## | |
dfintrons$SS5 <- dfintrons$Start | |
dfintrons$SS3 <- dfintrons$End | |
dfintrons[which(dfintrons$Strand == "-"), ]$SS5 <- dfintrons[which(dfintrons$Strand == "-"), ]$End | |
dfintrons[which(dfintrons$Strand == "-"), ]$SS3 <- dfintrons[which(dfintrons$Strand == "-"), ]$Start | |
## Combine 5SS and 3SS## | |
dfSS5 <- dfintrons[, c("GENEID", "gene_symbol", "Chrom", "Strand", "SS5")] | |
dfSS3 <- dfintrons[, c("GENEID", "gene_symbol", "Chrom", "Strand", "SS3")] | |
comname <- c("GENEID", "gene_symbol", "Chrom", "Strand", "upstreamSS") | |
colnames(dfSS5) <- comname | |
colnames(dfSS3) <- comname | |
dfSS5$type <- "SS5" | |
dfSS3$type <- "SS3" | |
dfupSS <- rbind(dfSS5, dfSS3) | |
dfupSS$SSID <- paste0(dfupSS$GENEID, dfupSS$gene_symbol, dfupSS$Chrom, dfupSS$Strand, dfupSS$upstreamSS) | |
dfupSS <- dfupSS[!duplicated(dfupSS$SSID), ] | |
############ build condidate downstream SS ################## | |
dfdnSS3 <- dfSS3 | |
dfdnSS3$SSID <- paste0(dfdnSS3$GENEID, dfdnSS3$gene_symbol, dfdnSS3$Chrom, dfdnSS3$Strand, dfdnSS3$upstreamSS) | |
dfdnSS3 <- dfdnSS3[!duplicated(dfdnSS3$SSID), ] | |
colnames(dfdnSS3) <- c("GENEID", "gene_symbol", "Chrom", "Strand", "downstreamSS", "type", "SSID") | |
# dfdnSS3$Chrom=paste0('chr',dfdnSS3$Chrom) | |
# dfupSS$Chrom=paste0('chr',dfupSS$Chrom) | |
dfupSS <- dfupSS[!is.na(dfupSS$gene_symbol), ] | |
dfdnSS3 <- dfdnSS3[!is.na(dfdnSS3$gene_symbol), ] | |
######### IPA and SS ############ | |
dfIPA <- finaldf[(finaldf$LOCATION == "intron"), ] | |
dfIPA <- dfIPA[, c("PASid", "Chr", "PAS", "strand", "GENEID", "gene_name")] | |
colnames(dfIPA) <- c("PASid", "Chrom", "Pos", "Strand", "GENEID", "gene_symbol") | |
dfIPA <- dfIPA[!duplicated(dfIPA), ] | |
dfIPA <- dfIPA[!is.na(dfIPA$gene_symbol), ] | |
DFIPASS <- list(dfIPA, dfupSS, dfdnSS3) | |
names(DFIPASS) <- c("dfIPA", "dfupSS", "dfdnSS3") | |
return(DFIPASS) | |
} | |
.GTF2LE <- function(TXDB, dfIPAXXX, dfupSSXXX, dfdnSS3XXX) { | |
dfIPAXXX <- as.data.frame(dfIPAXXX) | |
dfupSSXXX <- as.data.frame(dfupSSXXX) | |
dfdnSS3XXX <- as.data.frame(dfdnSS3XXX) | |
############ Hit upstream SS ################## | |
dfHIT_up <- merge(dfIPAXXX, dfupSSXXX, by = c("GENEID", "gene_symbol", "Chrom", "Strand")) | |
dfHIT_up$dis <- dfHIT_up$Pos - dfHIT_up$upstreamSS | |
dfHIT_up[which(dfHIT_up$Strand == "-"), ]$dis <- dfHIT_up[which(dfHIT_up$Strand == "-"), ]$upstreamSS - dfHIT_up[which(dfHIT_up$Strand == "-"), ]$Pos | |
dfHIT_up$HITID <- paste0(dfHIT_up$GENEID, dfHIT_up$gene_symbol, dfHIT_up$PASid) | |
dfHIT_up <- dfHIT_up[which(dfHIT_up$dis > 100), ] | |
dfHIT_up <- dfHIT_up[with(dfHIT_up, order(HITID, dis)), ] | |
dfHIT_up <- dfHIT_up[!duplicated(dfHIT_up$HITID), ] | |
dfHIT_up <- dfHIT_up[, c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "upstreamSS", "type", "dis")] | |
colnames(dfHIT_up) <- c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "upstreamSS", "type", "upstream_dis") | |
############ Hit downstream SS ################## | |
dfHIT_dn <- merge(dfIPAXXX, dfdnSS3XXX, by = c("GENEID", "gene_symbol", "Chrom", "Strand")) | |
dfHIT_dn$dis <- dfHIT_dn$downstreamSS - dfHIT_dn$Pos | |
dfHIT_dn[which(dfHIT_dn$Strand == "-"), ]$dis <- dfHIT_dn[which(dfHIT_dn$Strand == "-"), ]$Pos - dfHIT_dn[which(dfHIT_dn$Strand == "-"), ]$downstreamSS | |
dfHIT_dn$HITID <- paste0(dfHIT_dn$GENEID, dfHIT_dn$gene_symbol, dfHIT_dn$PASid) | |
dfHIT_dn <- dfHIT_dn[which(dfHIT_dn$dis > 100), ] | |
dfHIT_dn <- dfHIT_dn[with(dfHIT_dn, order(HITID, dis)), ] | |
dfHIT_dn <- dfHIT_dn[!duplicated(dfHIT_dn$HITID), ] | |
dfHIT_dn <- dfHIT_dn[, c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "downstreamSS", "dis")] | |
colnames(dfHIT_dn) <- c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "downstreamSS", "downstream_dis") | |
############ combine IPA SS ################## | |
dfHIT_combine <- merge(dfHIT_up, dfHIT_dn, by = c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos")) | |
dfHIT_combine <- dfHIT_combine[!duplicated(dfHIT_combine), ] | |
dfHIT_combine$IPA_up_start <- dfHIT_combine$upstreamSS | |
dfHIT_combine$IPA_up_end <- dfHIT_combine$Pos | |
dfHIT_combine$IPA_dn_start <- dfHIT_combine$Pos | |
dfHIT_combine$IPA_dn_end <- dfHIT_combine$downstreamSS | |
dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_up_start <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$Pos | |
dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_up_end <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$upstreamSS | |
dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_dn_start <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$downstreamSS | |
dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_dn_end <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$Pos | |
dfIPAREF <- dfHIT_combine[, c("PASid", "GENEID", "gene_symbol", "Chrom", "Strand", "Pos", "upstreamSS", "downstreamSS")] | |
dfIPAREF <- as.data.frame(dfIPAREF) | |
dfIPAREF <- dfIPAREF[!duplicated(dfIPAREF), ] | |
dfIPAsim <- dfHIT_combine[, c("PASid", "gene_symbol", "Chrom", "Strand", "Pos", "upstreamSS", "downstreamSS")] | |
dfIPAsim <- as.data.frame(dfIPAsim) | |
dfIPAsim <- dfIPAsim[!duplicated(dfIPAsim), ] | |
dfIPAsim <- dfIPAsim[!is.na(dfIPAsim$gene_symbol), ] | |
############ TES ############## | |
dfgenic <- as.data.frame(genes(TXDB)) | |
dfgenic <- dfgenic[, c(6, 1:3, 5)] | |
names(dfgenic) <- c("GENEID", "Chr", "Start", "End", "Strand") | |
# find the TES | |
dfMinMAX <- dfgenic | |
dfMinMAX$TES <- "" | |
indexN <- which(dfMinMAX$Strand == "-") | |
indexP <- which(dfMinMAX$Strand == "+") | |
dfMinMAX[indexP, ]$TES <- dfMinMAX[indexP, ]$End | |
dfMinMAX[indexN, ]$TES <- dfMinMAX[indexN, ]$Start | |
dfTES <- dfMinMAX[, c("GENEID", "Chr", "TES", "Strand")] | |
dfTES <- dfTES[!duplicated(dfTES), ] | |
### I exon### | |
exondb <- exonsBy(TXDB, by = "gene") | |
exondb <- GenomicRanges::reduce(exondb) | |
exonsdf <- as.data.frame(exondb) | |
exonsdfP <- exonsdf[which(exonsdf$strand == "+"), ] | |
exonsdfN <- exonsdf[which(exonsdf$strand == "-"), ] | |
exonsdfN <- exonsdfN[with(exonsdfN, order(group_name, -start)), ] | |
exonsdf <- rbind(exonsdfP, exonsdfN) | |
exonsdf <- exonsdf[, -c(1)] | |
data_EXrename <- exonsdf %>% | |
group_by(group_name) %>% | |
mutate(col = 1:n()) %>% ## create exonID | |
mutate(count = n()) | |
exonsdf <- data.frame(data_EXrename) | |
names(exonsdf)[7:8] <- c("EXid", "EXcount") | |
dfLexon <- exonsdf[which(exonsdf$EXid == exonsdf$EXcount), ] | |
dfLexon$LEstart <- dfLexon$start | |
dfLexon[which(dfLexon$strand == "-"), ]$LEstart <- dfLexon[which(dfLexon$strand == "-"), ]$end | |
dfLexon <- dfLexon[, c("group_name", "seqnames", "strand", "LEstart")] | |
colnames(dfLexon) <- c("GENEID", "Chr", "Strand", "LEstart") | |
dfLE <- merge(dfLexon, dfTES, by = c("GENEID", "Chr", "Strand")) | |
dfLE <- dfLE[!duplicated(dfLE), ] | |
dfIPAused <- dfIPAREF[, c("GENEID", "gene_symbol")] | |
dfIPAused <- dfIPAused[!duplicated(dfIPAused), ] | |
dfLE <- merge(dfLE, dfIPAused, by = "GENEID") | |
dfLE <- dfLE[!duplicated(dfLE), ] | |
# dfLE$Chr=paste0('chr',dfLE$Chr) | |
dfLE <- dfLE[, c("gene_symbol", "Chr", "Strand", "LEstart", "TES")] | |
colnames(dfLE) <- c("gene_symbol", "Chrom", "Strand", "LEstart", "TES") | |
dfLE <- dfLE[!is.na(dfLE$gene_symbol), ] | |
dfLE_dfIPA <- list(dfIPAsim, dfLE) | |
names(dfLE_dfIPA) <- c("dfIPAsim", "dfLE") | |
return(dfLE_dfIPA) | |
} | |
.get_input_pos <- function(x, y) { | |
y2 <- y | |
start(y2) <- start(x) | |
end(y2) <- end(x) | |
return(y2) | |
} | |
parse_PAS <- function(tsv, gtf, chr_name = "Chromosome", pos_name = "End", strand_name = "Strand", remove_suffix = FALSE) { | |
# parse PAS sites | |
print("Read PAS sites from input tsv file") | |
df_input <- read.table(tsv, sep = "\t", header = TRUE) | |
if (remove_suffix) { | |
chr_col <- sub("chr", "", df_input[[chr_name]]) | |
} else { | |
chr_col <- df_input[[chr_name]] | |
} | |
gr_input <- GRanges( | |
seqnames = chr_col, | |
ranges = IRanges(start = df_input[[pos_name]], end = df_input[[pos_name]]), | |
strand = df_input[[strand_name]] | |
) | |
print("Read gene model from gtf file") | |
EDB <- rtracklayer::import(gtf) | |
TXDB <- makeTxDbFromGRanges(EDB) | |
print("Annotate PAS sites with gene model") | |
ol <- findOverlaps(gr_input, EDB, ignore.strand = FALSE, type = "within", select = "all") | |
gr_annot <- unlist(mendoapply(.get_input_pos, split(gr_input[queryHits(ol)], 1:length(ol)), y = split(EDB[subjectHits(ol)], 1:length(ol)))) | |
dfPAS <- as.data.frame(gr_annot[which(gr_annot$type == "transcript"), ][, c("transcript_id", "gene_id", "gene_name")]) | |
dfPAS$PAS <- dfPAS$start | |
dfTXGeneMapping <- dfPAS[, c("transcript_id", "gene_id")] | |
dfTXGeneMapping <- dfTXGeneMapping[!duplicated(dfTXGeneMapping), ] | |
dfTXGeneMapping <- dfTXGeneMapping[!is.na(dfTXGeneMapping$transcript_id), ] | |
pasdb <- makeGRangesFromDataFrame(dfPAS, keep.extra.columns = TRUE) | |
finaldf <- .annotatePAS_V2(pasdb, TXDB, dfTXGeneMapping) | |
dfPAS$Chr <- dfPAS$seqnames | |
dfPAS$PASid <- paste(dfPAS$seqnames, dfPAS$PAS, dfPAS$strand, sep = ":") | |
dfinfo <- dfPAS[, c("PASid", "Chr", "strand", "PAS", "gene_id", "gene_name")] | |
colnames(dfinfo) <- c("PASid", "Chr", "strand", "PAS", "GENEID", "gene_name") | |
finaldf <- merge(finaldf, dfinfo, by = c("PASid", "GENEID")) | |
# output PAS objects | |
print("Extracting and filtering 3'UTR PASs") | |
refUTRraw <- .GTF2refUTRraw(TXDB, finaldf) | |
print("Extracting IPAs") | |
dfIPAALL <- .GTF2IPA(EDB, TXDB, finaldf) | |
print("Extracting 3' last exons") | |
dfIPAXXX <- dfIPAALL$dfIPA | |
dfupSSXXX <- dfIPAALL$dfupSS | |
dfdnSS3XXX <- dfIPAALL$dfdnSS3 | |
dfLE_dfIPA <- .GTF2LE(TXDB, dfIPAXXX, dfupSSXXX, dfdnSS3XXX) | |
print("Finalizing references") | |
PASREF <- list(refUTRraw, dfLE_dfIPA$dfIPAsim, dfLE_dfIPA$dfLE) | |
names(PASREF) <- c("refUTRraw", "dfIPA", "dfLE") | |
return(PASREF) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment