Last active
December 22, 2015 22:29
-
-
Save Puriney/6539838 to your computer and use it in GitHub Desktop.
Python: Splicing Analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copied From: https://svn.transvar.org/repos/genomes/trunk/pub/src/spliceAnal.py | |
#!/usr/bin/env python | |
""" | |
This file contains code for analyzing novel splicing events from | |
FindJunctions or Tophat junction features. | |
If running this interactively, use the writeJunctions method to create | |
a file that classifies junctions according to whether they are new or | |
novel, internal or external to a known gene, and their frequency of | |
occurence in RNA-Seq data sets. | |
To use this, first run FindJunctions or TopHat to generate junction | |
files for each sample. | |
Next, use BEDTools (http://code.google.com/p/bedtools/) to make | |
required input files. | |
For example, here is how we used BEDTools to make files for analysis of | |
alternative splicing in three junction files from the pollen paper: | |
#!/bin/bash | |
# get TAIR10_mRNA gene models from IGB QL site | |
G=TAIR10_mRNA.bed.gz | |
U=http://www.igbquickload.org/quickload/A_thaliana_Jun_2009 | |
if [ ! -e $G ]; | |
then | |
wget $U/$G | |
fi | |
# convert TAIR10_mRNA to BED12 | |
gunzip -c TAIR10_mRNA.bed.gz | cut -f1-12 > TAIR10_mRNA.bed | |
# create merged gene loci file | |
mergeBed -i TAIR10_mRNA.bed -s -nms > TAIR10_mRNA_loci.bed | |
# get seedling and pollen junctions that overlap with TAIR10 gene models | |
cat TAIR10_mRNA.bed pollen.FJ.bed drought_control_1.FJ.bed drought_control_2.FJ.bed | mergeBed -s -nms > TAIR10_mRNA_loci_pollen_seedling.bed | |
After running this, you will have three files: | |
TAIR10_mRNA_loci.bed - contains coordinates of gene regions created | |
from merging gene models. Field 4 will contains the names of the gene | |
models that were merged. | |
TAIR10_mRNA.bed - contains the gene models, minus fields 13 and 14, | |
which are the degene name and description field. | |
TAIR10_mRNA_loci_pollen_seedling.bed - contains merged gene models and | |
junctions. This the file that let's use map junctions onto genes. A | |
junction feature that extends outside the gene regions defined in | |
TAIR10_mRNA_loci.bed and uniquely maps to the gene probably represents | |
a new 5' or 3' exon and intron. | |
Limitations: | |
Gene models annotated as coming from the same gene should share the same prefix, | |
e.g., | |
AT1G12345.1 and AT1G12345.2 are two annotated alternative transcripts arising | |
from locus AT1G12345. | |
To handle different naming schemes, we can add an option to provide a locus | |
to gene model mapping file. | |
""" | |
import Mapping.FeatureModel as fm | |
import Utils.General as utils | |
import sys,optparse | |
def getIntrons(file): | |
""" | |
Function: Extract introns from the given BED format file. | |
Args : file - name of BED file to read (BED12 or BED14) | |
Returns : dictionary with keys constructed from intron location, strand | |
""" | |
fh = utils.readfile(file) | |
d={} | |
while 1: | |
line = fh.readline() | |
if not line: | |
fh.close() | |
break | |
toks=line.rstrip().split('\t') | |
seq_name = toks[0] | |
chromStart = int(toks[1]) | |
strand=toks[5] | |
gene_model_name=toks[3] | |
# UCSC beds put , at end of fields | |
# TopHat doesn't | |
if toks[10].endswith(','): | |
toks[10]=toks[10][0:-1] | |
if toks[11].endswith(','): | |
toks[11]=toks[11][0:-1] | |
blockSizes=map(lambda x:int(x),toks[10].split(',')) | |
blockStarts=map(lambda x:int(x),toks[11].split(',')) | |
i = 0 | |
for blockStart in blockStarts[:-1]: | |
blockSize=blockSizes[i] | |
intron_start=blockStart+blockSize+chromStart | |
intron_end=blockStarts[i+1]+chromStart | |
key='J:%s:%i-%i:%s'%(seq_name,intron_start,intron_end,strand) | |
if not d.has_key(key): | |
d[key]=[seq_name,intron_start, | |
intron_end,strand,gene_model_name] | |
else: | |
d[key].append(gene_model_name) | |
i = i + 1 | |
# 33,147 | |
# 127,875 introns for TAIR10 mRNA features | |
# 124,567 | |
return d | |
def checkIntrons(d): | |
""" | |
Function: Usually all instances of an intron are from the same locus. | |
Check this. | |
Args : d - output of getIntrons | |
Returns : a list of keys from d that violate this assumption | |
There are 340 annotated in TAIR10 mRNA data set. | |
""" | |
to_return = [] | |
for key in d.keys(): | |
val=d[key] | |
gene_ids={} | |
for gene_id in val[4:]: | |
gene_ids[gene_id.split('.')[0]]=1 | |
if len(gene_ids.keys()) > 1: | |
to_return.append(key) | |
return to_return | |
def makeJuncsFile(fn='TAIR10_mRNA_loci_pollen_seedling.bed', | |
introns=None, | |
junctions=None, | |
out='rnaseq_juncs.txt', | |
genes_file='TAIR10_mRNA_loci.bed'): | |
""" | |
Function: Make a file containing locus and gapped read counts for | |
predicted junctions. Use this in R to assess novel | |
and known junctions and their support. | |
Args : fn - output from BEDTools | |
genes_file - output from BEDTools | |
introns - output from getIntrons | |
junctions - output from getJunctions | |
out - write to stdout or to this file | |
Returns : dictionary where keys are junctions and values contain junction type, | |
number of gapped reads per sample, and locus | |
only junctions that overlap known gene models are reported | |
""" | |
fh = open(genes_file) | |
genes = {} | |
# get locus coordinates | |
while 1: | |
line = fh.readline() | |
if not line: | |
fh.close() | |
break | |
toks=line.rstrip().split('\t') | |
names=toks[3].split(';') | |
locus_names = map(lambda x:x.split('.')[0],names) | |
locus_name = locus_names[0] | |
if len(filter(lambda x:not x == locus_name,locus_names))>0: | |
# skip these pathological cases (203 in TAIR10) | |
continue | |
seqname = toks[0] | |
start = int(toks[1]) | |
end = int(toks[2]) | |
strand = toks[3] | |
genes[locus_name]=[seqname,start,end,strand] | |
# get mapping of loci to genes | |
fh = open(fn) | |
d = {} | |
while 1: | |
line = fh.readline() | |
if not line: | |
fh.close() | |
break | |
toks=line.rstrip().split('\t') | |
names=toks[3].split(';') | |
model_names = filter(lambda x:not x.startswith("J:"),names) | |
junction_names = filter(lambda x:x.startswith("J:"),names) | |
if len(model_names)==0 or len(junction_names)==0: | |
continue | |
locus_names = map(lambda x:x.split('.')[0],model_names) | |
locus_name = locus_names[0] | |
if len(filter(lambda x:not x == locus_name,locus_names))>0: | |
# also skip these pathological cases | |
continue | |
seqname=toks[0] | |
gene_start = genes[locus_name][1] | |
gene_end = genes[locus_name][2] | |
# categorize the junctions according to whether they are internal | |
# to the gene, known (and internal), or new | |
for junction_name in junction_names: | |
toks = junction_name.split(':') | |
(start,end)=map(lambda x:int(x),toks[2].split('-')) | |
if not(start >= gene_start and end <= gene_end): | |
typ="External" | |
elif introns.has_key(junction_name): | |
typ="Known" | |
else: | |
typ = "New" | |
d[junction_name]=[locus_name,typ] | |
# get gapped read counts per sample | |
samples = junctions.keys() | |
for junction_name in d.keys(): | |
for sample in samples: | |
if junctions[sample].has_key(junction_name): | |
score = junctions[sample][junction_name].getVal('score') | |
else: | |
score = 0 | |
sys.stderr.write("Can't find junction: %s\n"%junction_name) | |
d[junction_name].append([sample,score]) | |
if out: | |
fh = open(out,'w') | |
else: | |
fh=sys.stdout | |
heads = ['name','seqname','start','end','strand','locus','type']+samples | |
header = '\t'.join(heads)+'\n' | |
fh.write(header) | |
for junction_name in d.keys(): | |
toks=junction_name.split(':') | |
seqname=toks[1] | |
(start,end)=toks[2].split('-') | |
strand=toks[3] | |
vals = d[junction_name] | |
locus=vals[0] | |
typ = vals[1] | |
line = '\t'.join([junction_name,seqname,start,end,strand,locus,typ]) | |
for pair in vals[2:]: | |
line = line + '\t' + str(pair[1]) | |
fh.write(line+'\n') | |
if out: | |
fh.close() | |
return d # useful if running this interactively | |
def getJunctions(files=None, | |
samples=None): | |
""" | |
Function: Get dictionary containing junction features | |
Args : files - comma-separated string of junction file names (e.g., pollen.FJ.bed) | |
samples - comma-separated string of sample names corresponding to the junction file names | |
Returns : double dictionary, top-level keys are sample names, | |
next-level keys are junction names, values are | |
SeqFeature objects representing the junctions with key "score" | |
equal to number of supporting gapped reads | |
""" | |
files=files.split(',') | |
samples=samples.split(',') | |
d = {} | |
i = 0 | |
for f in files: | |
fh = utils.readfile(f) | |
sample=samples[i] | |
while 1: | |
line = fh.readline() | |
if not line: | |
fh.close() | |
break | |
toks=line.rstrip().split('\t') | |
seq_name = toks[0] | |
gstart = int(toks[1]) | |
gend = int(toks[2]) | |
name=toks[3] | |
score = int(float(toks[4])) | |
strand = toks[5] | |
# UCSC beds put , at end of fields | |
# TopHat doesn't | |
if toks[10].endswith(','): | |
toks[10]=toks[10][0:-1] | |
if toks[11].endswith(','): | |
toks[11]=toks[11][0:-1] | |
blockSizes=map(lambda x:int(x),toks[10].split(',')) | |
blockStarts=map(lambda x:int(x),toks[11].split(',')) | |
start=gstart+blockStarts[0]+blockSizes[0] | |
end=gstart+blockStarts[1] | |
key | |
= 'J:%s:%i-%i:%s'%(seq_name,start,end,strand) | |
if strand == '+': | |
strand = 1 | |
else: | |
strand = -1 | |
junc = fm.SeqFeature(display_id=name, | |
start=start, | |
length=end-start, | |
strand=strand, | |
seqname=seq_name) | |
junc.setKeyVal('score',score) | |
if not d.has_key(sample): | |
d[sample]={} | |
d[sample][key]=junc | |
i = i + 1 | |
return d | |
def main(model_file=None, | |
locus_file=None, | |
all_file=None, | |
junc_files=None, | |
sample_names=None, | |
out_file=None): | |
introns=getIntrons(model_file) | |
junctions=getJunctions(files=junc_files, | |
samples=sample_names) | |
makeJuncsFile(fn=all_file, | |
introns=introns, | |
junctions=junctions, | |
out=out_file, | |
genes_file=locus_file) | |
if __name__ == '__main__': | |
usage = '%prog [options]' | |
parser=optparse.OptionParser(usage) | |
parser.add_option('-m', | |
'--model_file', | |
help="Name of file gene model annotations (e.g., TAIR10_mRNA.bed.gz) [required]", | |
dest="model_file", | |
default=None) | |
parser.add_option('-l','--locus_file', | |
help="Name of file with merged gene models (e.g., TAIR10_mRNA_loci.bed) [required]", | |
dest="locus_file", | |
default=None) | |
parser.add_option('-a','--all_file', | |
help="Name of file with merged junction and gene model features (e.g., TAIR10_mRNA_loci_pollen_seedling.bed) [required]", | |
dest="all_file", | |
default=None) | |
parser.add_option('-j','--junction_files', | |
help="Comma separated list of junction files used to create file with merged junction and gene model features. [required]", | |
dest="junc_files", | |
default=None) | |
parser.add_option('-s','--sample_names', | |
help="Comma separated list of sample names for each junction file. Must be listed in same order as --junction_files [required]", | |
dest="sample_names", | |
default=None) | |
parser.add_option('-o','--out_file', | |
help="Name of file to write. If not provided, writes to stdout. [optional]", | |
dest="out_file", | |
default=None) | |
(options,args)=parser.parse_args() | |
if not (options.model_file and options.locus_file and options.all_file and options.junc_files and options.sample_names): | |
parser.print_help() | |
sys.exit(0) | |
main(model_file=options.model_file,locus_file=options.locus_file, | |
all_file=options.all_file,junc_files=options.junc_files, | |
sample_names=options.sample_names,out_file=options.out_file) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment