Puriney · December 22, 2015 22:29
diff --git a/spliceAnal.py b/spliceAnal.py
 # Copied From: https://svn.transvar.org/repos/genomes/trunk/pub/src/spliceAnal.py

 #!/usr/bin/env python
 """
 This file contains code for analyzing novel splicing events from
 FindJunctions or Tophat junction features.

 If running this interactively, use the writeJunctions method to create
 a file that classifies junctions according to whether they are new or
 novel, internal or external to a known gene, and their frequency of
 occurence in RNA-Seq data sets.

 To use this, first run FindJunctions or TopHat to generate junction
 files for each sample.

 Next, use BEDTools (http://code.google.com/p/bedtools/) to make
 required input files.

 For example, here is how we used BEDTools to make files for analysis of
 alternative splicing in three junction files from the pollen paper:

 #!/bin/bash
 # get TAIR10_mRNA gene models from IGB QL site
 G=TAIR10_mRNA.bed.gz
 U=http://www.igbquickload.org/quickload/A_thaliana_Jun_2009
 if [ ! -e $G ];
 then 
    wget $U/$G
 fi

 # convert TAIR10_mRNA to BED12
 gunzip -c TAIR10_mRNA.bed.gz | cut -f1-12 > TAIR10_mRNA.bed

 # create merged gene loci file
 mergeBed -i TAIR10_mRNA.bed -s -nms > TAIR10_mRNA_loci.bed

 # get seedling and pollen junctions that overlap with TAIR10 gene models
 cat TAIR10_mRNA.bed pollen.FJ.bed drought_control_1.FJ.bed drought_control_2.FJ.bed | mergeBed -s -nms > TAIR10_mRNA_loci_pollen_seedling.bed

 After running this, you will have three files:

 TAIR10_mRNA_loci.bed - contains coordinates of gene regions created
 from merging gene models. Field 4 will contains the names of the gene
 models that were merged.

 TAIR10_mRNA.bed - contains the gene models, minus fields 13 and 14,
 which are the degene name and description field.

 TAIR10_mRNA_loci_pollen_seedling.bed - contains merged gene models and
 junctions. This the file that let's use map junctions onto genes. A
 junction feature that extends outside the gene regions defined in
 TAIR10_mRNA_loci.bed and uniquely maps to the gene probably represents
 a new 5' or 3' exon and intron.

 Limitations:

 Gene models annotated as coming from the same gene should share the same prefix,
 e.g.,

 AT1G12345.1 and AT1G12345.2 are two annotated alternative transcripts arising
 from locus AT1G12345.

 To handle different naming schemes, we can add an option to provide a locus
 to gene model mapping file. 
 """

 import Mapping.FeatureModel as fm
 import Utils.General as utils
 import sys,optparse


 def getIntrons(file):
    """
    Function: Extract introns from the given BED format file.
    Args    : file - name of BED file to read (BED12 or BED14)
    Returns : dictionary with keys constructed from intron location, strand
    """
    fh = utils.readfile(file)
    d={}
    while 1:
        line = fh.readline()
        if not line:
            fh.close()
            break
        toks=line.rstrip().split('\t')
        seq_name = toks[0]
        chromStart = int(toks[1])
        strand=toks[5]
        gene_model_name=toks[3]
        # UCSC beds put , at end of fields
        # TopHat doesn't
        if toks[10].endswith(','):
            toks[10]=toks[10][0:-1]
        if toks[11].endswith(','):
            toks[11]=toks[11][0:-1]
        blockSizes=map(lambda x:int(x),toks[10].split(','))
        blockStarts=map(lambda x:int(x),toks[11].split(','))
        i = 0
        for blockStart in blockStarts[:-1]:
            blockSize=blockSizes[i]
            intron_start=blockStart+blockSize+chromStart
            intron_end=blockStarts[i+1]+chromStart
            key='J:%s:%i-%i:%s'%(seq_name,intron_start,intron_end,strand)
            if not d.has_key(key):
                d[key]=[seq_name,intron_start,
                        intron_end,strand,gene_model_name]
            else:
                d[key].append(gene_model_name)
            i = i + 1
    # 33,147
    # 127,875 introns for TAIR10 mRNA features
    # 124,567
    return d

 def checkIntrons(d):
    """
    Function: Usually all instances of an intron are from the same locus.
              Check this.
    Args    : d - output of getIntrons
    Returns : a list of keys from d that violate this assumption

    There are 340 annotated in TAIR10 mRNA data set.
    """
    to_return = []
    for key in d.keys():
        val=d[key]
        gene_ids={}
        for gene_id in val[4:]:
            gene_ids[gene_id.split('.')[0]]=1
        if len(gene_ids.keys()) > 1:
            to_return.append(key)
    return to_return


 def makeJuncsFile(fn='TAIR10_mRNA_loci_pollen_seedling.bed',
                  introns=None,
                  junctions=None,
                  out='rnaseq_juncs.txt',
                  genes_file='TAIR10_mRNA_loci.bed'):
    """
    Function: Make a file containing locus and gapped read counts for
              predicted junctions. Use this in R to assess novel
              and known junctions and their support.
    Args    : fn - output from BEDTools
              genes_file - output from BEDTools
              introns - output from getIntrons
              junctions - output from getJunctions
              out - write to stdout or to this file
    Returns : dictionary where keys are junctions and values contain junction type,
              number of gapped reads per sample, and locus
              only junctions that overlap known gene models are reported
    """
    fh = open(genes_file)
    genes = {}
    # get locus coordinates
    while 1:
        line = fh.readline()
        if not line:
            fh.close()
            break
        toks=line.rstrip().split('\t')
        names=toks[3].split(';')
        locus_names = map(lambda x:x.split('.')[0],names)
        locus_name = locus_names[0]
        if len(filter(lambda x:not x == locus_name,locus_names))>0:
            # skip these pathological cases (203 in TAIR10)
            continue
        seqname = toks[0]
        start = int(toks[1])
        end = int(toks[2])
        strand = toks[3]
        genes[locus_name]=[seqname,start,end,strand]
    # get mapping of loci to genes
    fh = open(fn)
    d = {}
    while 1:
        line = fh.readline()
        if not line:
            fh.close()
            break
        toks=line.rstrip().split('\t')
        names=toks[3].split(';')
        model_names = filter(lambda x:not x.startswith("J:"),names)
        junction_names = filter(lambda x:x.startswith("J:"),names)
        if len(model_names)==0 or len(junction_names)==0:
            continue
        locus_names = map(lambda x:x.split('.')[0],model_names)
        locus_name = locus_names[0]
        if len(filter(lambda x:not x == locus_name,locus_names))>0:
            # also skip these pathological cases
            continue
        seqname=toks[0]
        gene_start = genes[locus_name][1]
        gene_end = genes[locus_name][2]
        # categorize the junctions according to whether they are internal
        # to the gene, known (and internal), or new
        for junction_name in junction_names:
            toks = junction_name.split(':')
            (start,end)=map(lambda x:int(x),toks[2].split('-'))
            if not(start >= gene_start and end <= gene_end):
                typ="External"
            elif introns.has_key(junction_name):
                typ="Known"
            else:
                typ = "New"
            d[junction_name]=[locus_name,typ]
    # get gapped read counts per sample
    samples = junctions.keys()
    for junction_name in d.keys():
        for sample in samples:
            if junctions[sample].has_key(junction_name):
                score = junctions[sample][junction_name].getVal('score')
            else:
                score = 0
                sys.stderr.write("Can't find junction: %s\n"%junction_name)
            d[junction_name].append([sample,score])
    if out:
        fh = open(out,'w')
    else:
        fh=sys.stdout
    heads = ['name','seqname','start','end','strand','locus','type']+samples
    header = '\t'.join(heads)+'\n'
    fh.write(header)
    for junction_name in d.keys():
        toks=junction_name.split(':')
        seqname=toks[1]
        (start,end)=toks[2].split('-')
        strand=toks[3]
        vals = d[junction_name]
        locus=vals[0]
        typ = vals[1]
        line = '\t'.join([junction_name,seqname,start,end,strand,locus,typ])
        for pair in vals[2:]:
            line = line + '\t' + str(pair[1])
        fh.write(line+'\n')
    if out:
        fh.close()
    return d # useful if running this interactively

 def getJunctions(files=None,
                 samples=None):
    """
    Function: Get dictionary containing junction features
    Args    : files - comma-separated string of junction file names (e.g., pollen.FJ.bed)
              samples - comma-separated string of sample names corresponding to the junction file names 
    Returns : double dictionary, top-level keys are sample names,
              next-level keys are junction names, values are
              SeqFeature objects representing the junctions with key "score"
              equal to number of supporting gapped reads
    """
    files=files.split(',')
    samples=samples.split(',')
    d = {}
    i = 0
    for f in files:
        fh = utils.readfile(f)
        sample=samples[i]
        while 1:
            line = fh.readline()
            if not line:
                fh.close()
                break
            toks=line.rstrip().split('\t')
            seq_name = toks[0]
            gstart = int(toks[1])
            gend = int(toks[2])
            name=toks[3]
            score = int(float(toks[4])) 
            strand = toks[5]
            # UCSC beds put , at end of fields
            # TopHat doesn't
            if toks[10].endswith(','):
                toks[10]=toks[10][0:-1]
            if toks[11].endswith(','):
                toks[11]=toks[11][0:-1]
            blockSizes=map(lambda x:int(x),toks[10].split(','))
            blockStarts=map(lambda x:int(x),toks[11].split(','))
            start=gstart+blockStarts[0]+blockSizes[0]
            end=gstart+blockStarts[1]
            key
            = 'J:%s:%i-%i:%s'%(seq_name,start,end,strand)
            if strand == '+':
                strand = 1
            else:
                strand = -1
            junc = fm.SeqFeature(display_id=name,
                                start=start,
                                length=end-start,
                                strand=strand,
                                seqname=seq_name)
            junc.setKeyVal('score',score)
            if not d.has_key(sample):
                d[sample]={}
            d[sample][key]=junc
        i = i + 1 
    return d

 def main(model_file=None,
         locus_file=None,
         all_file=None,
         junc_files=None,
         sample_names=None,
         out_file=None):
    introns=getIntrons(model_file)
    junctions=getJunctions(files=junc_files,
                           samples=sample_names)
    makeJuncsFile(fn=all_file,
                  introns=introns,
                  junctions=junctions,
                  out=out_file,
                  genes_file=locus_file)

 if __name__ == '__main__':
    usage = '%prog [options]'
    parser=optparse.OptionParser(usage)
    parser.add_option('-m',
                      '--model_file',
                      help="Name of file gene model annotations (e.g., TAIR10_mRNA.bed.gz) [required]",
                      dest="model_file",
                      default=None)
    parser.add_option('-l','--locus_file',
                      help="Name of file with merged gene models (e.g., TAIR10_mRNA_loci.bed) [required]",
                      dest="locus_file",
                      default=None)
    parser.add_option('-a','--all_file',
                      help="Name of file with merged junction and gene model features (e.g., TAIR10_mRNA_loci_pollen_seedling.bed) [required]",
                      dest="all_file",
                      default=None)
    parser.add_option('-j','--junction_files',
                       help="Comma separated list of junction files used to create  file with merged junction and gene model features. [required]",
                       dest="junc_files",
                       default=None)
    parser.add_option('-s','--sample_names',
                       help="Comma separated list of sample names for each junction file. Must be listed in same order as --junction_files [required]",
                       dest="sample_names",
                       default=None)
    parser.add_option('-o','--out_file',
                      help="Name of file to write. If not provided, writes to stdout. [optional]",
                      dest="out_file",
                      default=None)
    (options,args)=parser.parse_args()
    if not (options.model_file and options.locus_file and options.all_file and options.junc_files and options.sample_names):
       parser.print_help()
       sys.exit(0)
    main(model_file=options.model_file,locus_file=options.locus_file,
         all_file=options.all_file,junc_files=options.junc_files,
         sample_names=options.sample_names,out_file=options.out_file)
	# Copied From: https://svn.transvar.org/repos/genomes/trunk/pub/src/spliceAnal.py

	#!/usr/bin/env python
	"""
	This file contains code for analyzing novel splicing events from
	FindJunctions or Tophat junction features.

	If running this interactively, use the writeJunctions method to create
	a file that classifies junctions according to whether they are new or
	novel, internal or external to a known gene, and their frequency of
	occurence in RNA-Seq data sets.

	To use this, first run FindJunctions or TopHat to generate junction
	files for each sample.

	Next, use BEDTools (http://code.google.com/p/bedtools/) to make
	required input files.

	For example, here is how we used BEDTools to make files for analysis of
	alternative splicing in three junction files from the pollen paper:

	#!/bin/bash
	# get TAIR10_mRNA gene models from IGB QL site
	G=TAIR10_mRNA.bed.gz
	U=http://www.igbquickload.org/quickload/A_thaliana_Jun_2009
	if [ ! -e $G ];
	then
	wget $U/$G
	fi

	# convert TAIR10_mRNA to BED12
	gunzip -c TAIR10_mRNA.bed.gz \| cut -f1-12 > TAIR10_mRNA.bed

	# create merged gene loci file
	mergeBed -i TAIR10_mRNA.bed -s -nms > TAIR10_mRNA_loci.bed

	# get seedling and pollen junctions that overlap with TAIR10 gene models
	cat TAIR10_mRNA.bed pollen.FJ.bed drought_control_1.FJ.bed drought_control_2.FJ.bed \| mergeBed -s -nms > TAIR10_mRNA_loci_pollen_seedling.bed

	After running this, you will have three files:

	TAIR10_mRNA_loci.bed - contains coordinates of gene regions created
	from merging gene models. Field 4 will contains the names of the gene
	models that were merged.

	TAIR10_mRNA.bed - contains the gene models, minus fields 13 and 14,
	which are the degene name and description field.

	TAIR10_mRNA_loci_pollen_seedling.bed - contains merged gene models and
	junctions. This the file that let's use map junctions onto genes. A
	junction feature that extends outside the gene regions defined in
	TAIR10_mRNA_loci.bed and uniquely maps to the gene probably represents
	a new 5' or 3' exon and intron.

	Limitations:

	Gene models annotated as coming from the same gene should share the same prefix,
	e.g.,

	AT1G12345.1 and AT1G12345.2 are two annotated alternative transcripts arising
	from locus AT1G12345.

	To handle different naming schemes, we can add an option to provide a locus
	to gene model mapping file.
	"""

	import Mapping.FeatureModel as fm
	import Utils.General as utils
	import sys,optparse


	def getIntrons(file):
	"""
	Function: Extract introns from the given BED format file.
	Args : file - name of BED file to read (BED12 or BED14)
	Returns : dictionary with keys constructed from intron location, strand
	"""
	fh = utils.readfile(file)
	d={}
	while 1:
	line = fh.readline()
	if not line:
	fh.close()
	break
	toks=line.rstrip().split('\t')
	seq_name = toks[0]
	chromStart = int(toks[1])
	strand=toks[5]
	gene_model_name=toks[3]
	# UCSC beds put , at end of fields
	# TopHat doesn't
	if toks[10].endswith(','):
	toks[10]=toks[10][0:-1]
	if toks[11].endswith(','):
	toks[11]=toks[11][0:-1]
	blockSizes=map(lambda x:int(x),toks[10].split(','))
	blockStarts=map(lambda x:int(x),toks[11].split(','))
	i = 0
	for blockStart in blockStarts[:-1]:
	blockSize=blockSizes[i]
	intron_start=blockStart+blockSize+chromStart
	intron_end=blockStarts[i+1]+chromStart
	key='J:%s:%i-%i:%s'%(seq_name,intron_start,intron_end,strand)
	if not d.has_key(key):
	d[key]=[seq_name,intron_start,
	intron_end,strand,gene_model_name]
	else:
	d[key].append(gene_model_name)
	i = i + 1
	# 33,147
	# 127,875 introns for TAIR10 mRNA features
	# 124,567
	return d

	def checkIntrons(d):
	"""
	Function: Usually all instances of an intron are from the same locus.
	Check this.
	Args : d - output of getIntrons
	Returns : a list of keys from d that violate this assumption

	There are 340 annotated in TAIR10 mRNA data set.
	"""
	to_return = []
	for key in d.keys():
	val=d[key]
	gene_ids={}
	for gene_id in val[4:]:
	gene_ids[gene_id.split('.')[0]]=1
	if len(gene_ids.keys()) > 1:
	to_return.append(key)
	return to_return


	def makeJuncsFile(fn='TAIR10_mRNA_loci_pollen_seedling.bed',
	introns=None,
	junctions=None,
	out='rnaseq_juncs.txt',
	genes_file='TAIR10_mRNA_loci.bed'):
	"""
	Function: Make a file containing locus and gapped read counts for
	predicted junctions. Use this in R to assess novel
	and known junctions and their support.
	Args : fn - output from BEDTools
	genes_file - output from BEDTools
	introns - output from getIntrons
	junctions - output from getJunctions
	out - write to stdout or to this file
	Returns : dictionary where keys are junctions and values contain junction type,
	number of gapped reads per sample, and locus
	only junctions that overlap known gene models are reported
	"""
	fh = open(genes_file)
	genes = {}
	# get locus coordinates
	while 1:
	line = fh.readline()
	if not line:
	fh.close()
	break
	toks=line.rstrip().split('\t')
	names=toks[3].split(';')
	locus_names = map(lambda x:x.split('.')[0],names)
	locus_name = locus_names[0]
	if len(filter(lambda x:not x == locus_name,locus_names))>0:
	# skip these pathological cases (203 in TAIR10)
	continue
	seqname = toks[0]
	start = int(toks[1])
	end = int(toks[2])
	strand = toks[3]
	genes[locus_name]=[seqname,start,end,strand]
	# get mapping of loci to genes
	fh = open(fn)
	d = {}
	while 1:
	line = fh.readline()
	if not line:
	fh.close()
	break
	toks=line.rstrip().split('\t')
	names=toks[3].split(';')
	model_names = filter(lambda x:not x.startswith("J:"),names)
	junction_names = filter(lambda x:x.startswith("J:"),names)
	if len(model_names)==0 or len(junction_names)==0:
	continue
	locus_names = map(lambda x:x.split('.')[0],model_names)
	locus_name = locus_names[0]
	if len(filter(lambda x:not x == locus_name,locus_names))>0:
	# also skip these pathological cases
	continue
	seqname=toks[0]
	gene_start = genes[locus_name][1]
	gene_end = genes[locus_name][2]
	# categorize the junctions according to whether they are internal
	# to the gene, known (and internal), or new
	for junction_name in junction_names:
	toks = junction_name.split(':')
	(start,end)=map(lambda x:int(x),toks[2].split('-'))
	if not(start >= gene_start and end <= gene_end):
	typ="External"
	elif introns.has_key(junction_name):
	typ="Known"
	else:
	typ = "New"
	d[junction_name]=[locus_name,typ]
	# get gapped read counts per sample
	samples = junctions.keys()
	for junction_name in d.keys():
	for sample in samples:
	if junctions[sample].has_key(junction_name):
	score = junctions[sample][junction_name].getVal('score')
	else:
	score = 0
	sys.stderr.write("Can't find junction: %s\n"%junction_name)
	d[junction_name].append([sample,score])
	if out:
	fh = open(out,'w')
	else:
	fh=sys.stdout
	heads = ['name','seqname','start','end','strand','locus','type']+samples
	header = '\t'.join(heads)+'\n'
	fh.write(header)
	for junction_name in d.keys():
	toks=junction_name.split(':')
	seqname=toks[1]
	(start,end)=toks[2].split('-')
	strand=toks[3]
	vals = d[junction_name]
	locus=vals[0]
	typ = vals[1]
	line = '\t'.join([junction_name,seqname,start,end,strand,locus,typ])
	for pair in vals[2:]:
	line = line + '\t' + str(pair[1])
	fh.write(line+'\n')
	if out:
	fh.close()
	return d # useful if running this interactively

	def getJunctions(files=None,
	samples=None):
	"""
	Function: Get dictionary containing junction features
	Args : files - comma-separated string of junction file names (e.g., pollen.FJ.bed)
	samples - comma-separated string of sample names corresponding to the junction file names
	Returns : double dictionary, top-level keys are sample names,
	next-level keys are junction names, values are
	SeqFeature objects representing the junctions with key "score"
	equal to number of supporting gapped reads
	"""
	files=files.split(',')
	samples=samples.split(',')
	d = {}
	i = 0
	for f in files:
	fh = utils.readfile(f)
	sample=samples[i]
	while 1:
	line = fh.readline()
	if not line:
	fh.close()
	break
	toks=line.rstrip().split('\t')
	seq_name = toks[0]
	gstart = int(toks[1])
	gend = int(toks[2])
	name=toks[3]
	score = int(float(toks[4]))
	strand = toks[5]
	# UCSC beds put , at end of fields
	# TopHat doesn't
	if toks[10].endswith(','):
	toks[10]=toks[10][0:-1]
	if toks[11].endswith(','):
	toks[11]=toks[11][0:-1]
	blockSizes=map(lambda x:int(x),toks[10].split(','))
	blockStarts=map(lambda x:int(x),toks[11].split(','))
	start=gstart+blockStarts[0]+blockSizes[0]
	end=gstart+blockStarts[1]
	key
	= 'J:%s:%i-%i:%s'%(seq_name,start,end,strand)
	if strand == '+':
	strand = 1
	else:
	strand = -1
	junc = fm.SeqFeature(display_id=name,
	start=start,
	length=end-start,
	strand=strand,
	seqname=seq_name)
	junc.setKeyVal('score',score)
	if not d.has_key(sample):
	d[sample]={}
	d[sample][key]=junc
	i = i + 1
	return d

	def main(model_file=None,
	locus_file=None,
	all_file=None,
	junc_files=None,
	sample_names=None,
	out_file=None):
	introns=getIntrons(model_file)
	junctions=getJunctions(files=junc_files,
	samples=sample_names)
	makeJuncsFile(fn=all_file,
	introns=introns,
	junctions=junctions,
	out=out_file,
	genes_file=locus_file)

	if __name__ == '__main__':
	usage = '%prog [options]'
	parser=optparse.OptionParser(usage)
	parser.add_option('-m',
	'--model_file',
	help="Name of file gene model annotations (e.g., TAIR10_mRNA.bed.gz) [required]",
	dest="model_file",
	default=None)
	parser.add_option('-l','--locus_file',
	help="Name of file with merged gene models (e.g., TAIR10_mRNA_loci.bed) [required]",
	dest="locus_file",
	default=None)
	parser.add_option('-a','--all_file',
	help="Name of file with merged junction and gene model features (e.g., TAIR10_mRNA_loci_pollen_seedling.bed) [required]",
	dest="all_file",
	default=None)
	parser.add_option('-j','--junction_files',
	help="Comma separated list of junction files used to create file with merged junction and gene model features. [required]",
	dest="junc_files",
	default=None)
	parser.add_option('-s','--sample_names',
	help="Comma separated list of sample names for each junction file. Must be listed in same order as --junction_files [required]",
	dest="sample_names",
	default=None)
	parser.add_option('-o','--out_file',
	help="Name of file to write. If not provided, writes to stdout. [optional]",
	dest="out_file",
	default=None)
	(options,args)=parser.parse_args()
	if not (options.model_file and options.locus_file and options.all_file and options.junc_files and options.sample_names):
	parser.print_help()
	sys.exit(0)
	main(model_file=options.model_file,locus_file=options.locus_file,
	all_file=options.all_file,junc_files=options.junc_files,
	sample_names=options.sample_names,out_file=options.out_file)