get rid of the digits (gene version) in the end for the gene names (gencode v19)
cat STAR_WT-30393468_htseq.cnt| sed -E 's/\.[0-9]+//' > WT_htseq.cnt
transcript to gene mapping file:
library(EnsDb.Hsapiens.v75)
edb <- EnsDb.Hsapiens.v75
Tx.ensemble <- transcripts(edb,
columns = c("tx_id", "gene_id", "gene_name"),
return.type = "DataFrame")
nrow(Tx.ensemble)
tx2gene<- Tx.ensemble[,c(1,2)]
write.table(tx2gene, "~/playground/gene-level-effective-length/tx2gene.tsv", col.names = F, row.names = F, sep="\t", quote=F)
add #! /bin/env python
to the head of the python script Rob wrote.
#! /bin/env python
import argparse
import pandas as pd
import numpy as np
import sys
def main(args):
gtable = pd.read_table(args.ginput).set_index('Name')
ttable = pd.read_table(args.tinput).set_index('Name')
tgmap = pd.read_table(args.tgmap, names=['t', 'g']).set_index('t')
gene_lengths = {}
j = 0
# Map over all gene groups (a gene and its associated transcripts)
for g, txps in tgmap.groupby('g').groups.iteritems():
if j % 500 == 1:
print("Processed {} genes".format(j))
j += 1
# The set of transcripts present in our salmon index
tset = []
for t in txps:
if t in ttable.index:
tset.append(t)
# If at least one of the transcripts was present
if len(tset) > 0:
# The denominator is the sum of all TPMs
totlen = ttable.loc[tset,'TPM'].sum()
# Turn the relative TPMs into a proper partition of unity
if totlen > 0:
tpm_fracs = ttable.loc[tset, 'TPM'].values / ttable.loc[tset,'TPM'].sum()
else:
tpm_fracs = np.ones(len(tset)) / float(len(tset))
# Compute the gene's effective length as the abundance-weight
# sum of the transcript lengths
elen = 0.0
for i,t in enumerate(tset):
elen += tpm_fracs[i] * ttable.loc[t, 'EffectiveLength']
gene_lengths[g] = elen
# Give the table an effective length field
gtable['EffectiveLength'] = gtable.apply(lambda r : gene_lengths[r.name] if r.name in gene_lengths else 1.0, axis=1)
# Write it to the output file
gtable.to_csv(args.output, sep='\t', index_label='Name')
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compute gene-level effective lengths")
parser.add_argument('--ginput', type=str, help='gene level input table')
parser.add_argument('--tinput', type=str, help='transcript level input table')
parser.add_argument('--tgmap', type=str, help='transcript -> gene mapping')
parser.add_argument('--output', type=str, help='output table with extra column')
args = parser.parse_args()
main(args)
chmod u + x glef.py
/glef.py --ginput WT_htseq.cnt --tinput WT_quant.sf --tgmap tx2gene.tsv --output WT_htseq_effective_length.tsv
compare effecitve length derived from transcript level and salmon output gene level
WT.count.df<- read.table("~/playground/gene-level-effective-length/WT_htseq_effective_length.tsv",
sep ="\t", header=T, stringsAsFactors = F)
head(counts.df)
counts.effective.df<- left_join(counts.df, WT.count.df, by=c("gene_name"="Name"))
## scatter plot of effective length from gene level output of salmon and effective length derived from transcript level salmon output
plot(log2(counts.effective.df$EffectiveLength.x+1), log2(counts.effective.df$EffectiveLength.y+1), xlab= "log2 effective gene length from salmon gene level output", ylab="log2 effective gene length derived from transcript level")
abline(a=0, b=1, col="red")
## convert to TPM use transcript derived effective gene length
WT_count<- as.matrix(counts.effective.df[,2])
count2Tpm <- function(counts, effLen)
{
rate <- log(counts) - log(effLen)
denom <- log(sum(exp(rate)))
exp(rate - denom + log(1e6))
}
WT.TPM.from.HTSeq<- count2Tpm(WT_count, counts.effective.df$EffectiveLength.y)
WT.TPM.from.HTSeq<- cbind(as.data.frame(WT.TPM.from.HTSeq), gene_name=counts.effective.df$gene_name)
colnames(WT.TPM.from.HTSeq)<- c("WT.htseq.TPM", "gene_name")
head(TPM.from.HTSeq)
dim(salmon.TPM)
WT.TPM.HTseq.salmon<- left_join(WT.TPM.from.HTSeq, salmon.TPM)
WT.TPM.HTseq.salmon.kallisto<- left_join(WT.TPM.HTseq.salmon, kallisto.TPM)
## HTSeq TPM versus salmon TPM
ggplot(WT.TPM.HTseq.salmon.kallisto,aes(x=log2(WT.htseq.TPM+1), y=log2(salmon.WT+1))) + geom_point() + geom_smooth(method="lm") + geom_abline(slope=1, intercept = 0, color="red") + annotate("text", x=13, y=20, label= "spearman cor = 0.85") + ggtitle("HTseq TPM versus salmon TPM")
cor(log2(WT.TPM.HTseq.salmon$WT.htseq.TPM+1), log2(WT.TPM.HTseq.salmon$salmon.WT+1), method="spearman")
## HTSeq TPM versus kallisto TPM
ggplot(WT.TPM.HTseq.salmon.kallisto,aes(x=log2(WT.htseq.TPM+1), y=log2(kallisto.WT+1))) + geom_point() + geom_smooth(method="lm") + geom_abline(slope=1, intercept = 0, color="red") + annotate("text", x=13, y=20, label= "spearman cor = 0.80") + ggtitle("HTseq TPM versus kallisto TPM")
cor(log2(WT.TPM.HTseq.salmon.kallisto$WT.htseq.TPM+1), log2(WT.TPM.HTseq.salmon.kallisto$kallisto.WT+1), method="spearman")