Skip to content

Instantly share code, notes, and snippets.

@Puriney
Last active December 22, 2015 22:29
Show Gist options
  • Save Puriney/6539838 to your computer and use it in GitHub Desktop.
Save Puriney/6539838 to your computer and use it in GitHub Desktop.
Python: Splicing Analysis
# Copied From: https://svn.transvar.org/repos/genomes/trunk/pub/src/spliceAnal.py
#!/usr/bin/env python
"""
This file contains code for analyzing novel splicing events from
FindJunctions or Tophat junction features.
If running this interactively, use the writeJunctions method to create
a file that classifies junctions according to whether they are new or
novel, internal or external to a known gene, and their frequency of
occurence in RNA-Seq data sets.
To use this, first run FindJunctions or TopHat to generate junction
files for each sample.
Next, use BEDTools (http://code.google.com/p/bedtools/) to make
required input files.
For example, here is how we used BEDTools to make files for analysis of
alternative splicing in three junction files from the pollen paper:
#!/bin/bash
# get TAIR10_mRNA gene models from IGB QL site
G=TAIR10_mRNA.bed.gz
U=http://www.igbquickload.org/quickload/A_thaliana_Jun_2009
if [ ! -e $G ];
then
wget $U/$G
fi
# convert TAIR10_mRNA to BED12
gunzip -c TAIR10_mRNA.bed.gz | cut -f1-12 > TAIR10_mRNA.bed
# create merged gene loci file
mergeBed -i TAIR10_mRNA.bed -s -nms > TAIR10_mRNA_loci.bed
# get seedling and pollen junctions that overlap with TAIR10 gene models
cat TAIR10_mRNA.bed pollen.FJ.bed drought_control_1.FJ.bed drought_control_2.FJ.bed | mergeBed -s -nms > TAIR10_mRNA_loci_pollen_seedling.bed
After running this, you will have three files:
TAIR10_mRNA_loci.bed - contains coordinates of gene regions created
from merging gene models. Field 4 will contains the names of the gene
models that were merged.
TAIR10_mRNA.bed - contains the gene models, minus fields 13 and 14,
which are the degene name and description field.
TAIR10_mRNA_loci_pollen_seedling.bed - contains merged gene models and
junctions. This the file that let's use map junctions onto genes. A
junction feature that extends outside the gene regions defined in
TAIR10_mRNA_loci.bed and uniquely maps to the gene probably represents
a new 5' or 3' exon and intron.
Limitations:
Gene models annotated as coming from the same gene should share the same prefix,
e.g.,
AT1G12345.1 and AT1G12345.2 are two annotated alternative transcripts arising
from locus AT1G12345.
To handle different naming schemes, we can add an option to provide a locus
to gene model mapping file.
"""
import Mapping.FeatureModel as fm
import Utils.General as utils
import sys,optparse
def getIntrons(file):
"""
Function: Extract introns from the given BED format file.
Args : file - name of BED file to read (BED12 or BED14)
Returns : dictionary with keys constructed from intron location, strand
"""
fh = utils.readfile(file)
d={}
while 1:
line = fh.readline()
if not line:
fh.close()
break
toks=line.rstrip().split('\t')
seq_name = toks[0]
chromStart = int(toks[1])
strand=toks[5]
gene_model_name=toks[3]
# UCSC beds put , at end of fields
# TopHat doesn't
if toks[10].endswith(','):
toks[10]=toks[10][0:-1]
if toks[11].endswith(','):
toks[11]=toks[11][0:-1]
blockSizes=map(lambda x:int(x),toks[10].split(','))
blockStarts=map(lambda x:int(x),toks[11].split(','))
i = 0
for blockStart in blockStarts[:-1]:
blockSize=blockSizes[i]
intron_start=blockStart+blockSize+chromStart
intron_end=blockStarts[i+1]+chromStart
key='J:%s:%i-%i:%s'%(seq_name,intron_start,intron_end,strand)
if not d.has_key(key):
d[key]=[seq_name,intron_start,
intron_end,strand,gene_model_name]
else:
d[key].append(gene_model_name)
i = i + 1
# 33,147
# 127,875 introns for TAIR10 mRNA features
# 124,567
return d
def checkIntrons(d):
"""
Function: Usually all instances of an intron are from the same locus.
Check this.
Args : d - output of getIntrons
Returns : a list of keys from d that violate this assumption
There are 340 annotated in TAIR10 mRNA data set.
"""
to_return = []
for key in d.keys():
val=d[key]
gene_ids={}
for gene_id in val[4:]:
gene_ids[gene_id.split('.')[0]]=1
if len(gene_ids.keys()) > 1:
to_return.append(key)
return to_return
def makeJuncsFile(fn='TAIR10_mRNA_loci_pollen_seedling.bed',
introns=None,
junctions=None,
out='rnaseq_juncs.txt',
genes_file='TAIR10_mRNA_loci.bed'):
"""
Function: Make a file containing locus and gapped read counts for
predicted junctions. Use this in R to assess novel
and known junctions and their support.
Args : fn - output from BEDTools
genes_file - output from BEDTools
introns - output from getIntrons
junctions - output from getJunctions
out - write to stdout or to this file
Returns : dictionary where keys are junctions and values contain junction type,
number of gapped reads per sample, and locus
only junctions that overlap known gene models are reported
"""
fh = open(genes_file)
genes = {}
# get locus coordinates
while 1:
line = fh.readline()
if not line:
fh.close()
break
toks=line.rstrip().split('\t')
names=toks[3].split(';')
locus_names = map(lambda x:x.split('.')[0],names)
locus_name = locus_names[0]
if len(filter(lambda x:not x == locus_name,locus_names))>0:
# skip these pathological cases (203 in TAIR10)
continue
seqname = toks[0]
start = int(toks[1])
end = int(toks[2])
strand = toks[3]
genes[locus_name]=[seqname,start,end,strand]
# get mapping of loci to genes
fh = open(fn)
d = {}
while 1:
line = fh.readline()
if not line:
fh.close()
break
toks=line.rstrip().split('\t')
names=toks[3].split(';')
model_names = filter(lambda x:not x.startswith("J:"),names)
junction_names = filter(lambda x:x.startswith("J:"),names)
if len(model_names)==0 or len(junction_names)==0:
continue
locus_names = map(lambda x:x.split('.')[0],model_names)
locus_name = locus_names[0]
if len(filter(lambda x:not x == locus_name,locus_names))>0:
# also skip these pathological cases
continue
seqname=toks[0]
gene_start = genes[locus_name][1]
gene_end = genes[locus_name][2]
# categorize the junctions according to whether they are internal
# to the gene, known (and internal), or new
for junction_name in junction_names:
toks = junction_name.split(':')
(start,end)=map(lambda x:int(x),toks[2].split('-'))
if not(start >= gene_start and end <= gene_end):
typ="External"
elif introns.has_key(junction_name):
typ="Known"
else:
typ = "New"
d[junction_name]=[locus_name,typ]
# get gapped read counts per sample
samples = junctions.keys()
for junction_name in d.keys():
for sample in samples:
if junctions[sample].has_key(junction_name):
score = junctions[sample][junction_name].getVal('score')
else:
score = 0
sys.stderr.write("Can't find junction: %s\n"%junction_name)
d[junction_name].append([sample,score])
if out:
fh = open(out,'w')
else:
fh=sys.stdout
heads = ['name','seqname','start','end','strand','locus','type']+samples
header = '\t'.join(heads)+'\n'
fh.write(header)
for junction_name in d.keys():
toks=junction_name.split(':')
seqname=toks[1]
(start,end)=toks[2].split('-')
strand=toks[3]
vals = d[junction_name]
locus=vals[0]
typ = vals[1]
line = '\t'.join([junction_name,seqname,start,end,strand,locus,typ])
for pair in vals[2:]:
line = line + '\t' + str(pair[1])
fh.write(line+'\n')
if out:
fh.close()
return d # useful if running this interactively
def getJunctions(files=None,
samples=None):
"""
Function: Get dictionary containing junction features
Args : files - comma-separated string of junction file names (e.g., pollen.FJ.bed)
samples - comma-separated string of sample names corresponding to the junction file names
Returns : double dictionary, top-level keys are sample names,
next-level keys are junction names, values are
SeqFeature objects representing the junctions with key "score"
equal to number of supporting gapped reads
"""
files=files.split(',')
samples=samples.split(',')
d = {}
i = 0
for f in files:
fh = utils.readfile(f)
sample=samples[i]
while 1:
line = fh.readline()
if not line:
fh.close()
break
toks=line.rstrip().split('\t')
seq_name = toks[0]
gstart = int(toks[1])
gend = int(toks[2])
name=toks[3]
score = int(float(toks[4]))
strand = toks[5]
# UCSC beds put , at end of fields
# TopHat doesn't
if toks[10].endswith(','):
toks[10]=toks[10][0:-1]
if toks[11].endswith(','):
toks[11]=toks[11][0:-1]
blockSizes=map(lambda x:int(x),toks[10].split(','))
blockStarts=map(lambda x:int(x),toks[11].split(','))
start=gstart+blockStarts[0]+blockSizes[0]
end=gstart+blockStarts[1]
key
= 'J:%s:%i-%i:%s'%(seq_name,start,end,strand)
if strand == '+':
strand = 1
else:
strand = -1
junc = fm.SeqFeature(display_id=name,
start=start,
length=end-start,
strand=strand,
seqname=seq_name)
junc.setKeyVal('score',score)
if not d.has_key(sample):
d[sample]={}
d[sample][key]=junc
i = i + 1
return d
def main(model_file=None,
locus_file=None,
all_file=None,
junc_files=None,
sample_names=None,
out_file=None):
introns=getIntrons(model_file)
junctions=getJunctions(files=junc_files,
samples=sample_names)
makeJuncsFile(fn=all_file,
introns=introns,
junctions=junctions,
out=out_file,
genes_file=locus_file)
if __name__ == '__main__':
usage = '%prog [options]'
parser=optparse.OptionParser(usage)
parser.add_option('-m',
'--model_file',
help="Name of file gene model annotations (e.g., TAIR10_mRNA.bed.gz) [required]",
dest="model_file",
default=None)
parser.add_option('-l','--locus_file',
help="Name of file with merged gene models (e.g., TAIR10_mRNA_loci.bed) [required]",
dest="locus_file",
default=None)
parser.add_option('-a','--all_file',
help="Name of file with merged junction and gene model features (e.g., TAIR10_mRNA_loci_pollen_seedling.bed) [required]",
dest="all_file",
default=None)
parser.add_option('-j','--junction_files',
help="Comma separated list of junction files used to create file with merged junction and gene model features. [required]",
dest="junc_files",
default=None)
parser.add_option('-s','--sample_names',
help="Comma separated list of sample names for each junction file. Must be listed in same order as --junction_files [required]",
dest="sample_names",
default=None)
parser.add_option('-o','--out_file',
help="Name of file to write. If not provided, writes to stdout. [optional]",
dest="out_file",
default=None)
(options,args)=parser.parse_args()
if not (options.model_file and options.locus_file and options.all_file and options.junc_files and options.sample_names):
parser.print_help()
sys.exit(0)
main(model_file=options.model_file,locus_file=options.locus_file,
all_file=options.all_file,junc_files=options.junc_files,
sample_names=options.sample_names,out_file=options.out_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment