Last active
January 30, 2024 16:31
-
-
Save willtownes/ebb87e7b224dd206a3d2 to your computer and use it in GitHub Desktop.
Splits a refGene.txt file into multiple .bed files for various genome features (exon,intron, etc), suitable for input to bedtools coverage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Python Script to read in a reference genome of refseq IDs and output several tab-delimited BED (text) files suitable for use with bedtools coverage for counting ChIP-seq reads that map to various gene features. | |
All output files have the structure expected by bedtools, namely, | |
CHROM POSITION1 POSITION2 REFSEQ_ID | |
Possible output files include: | |
1. distal promoter (transcription start [-5KB,-1KB]) KB means kilobase pairs, not kilobyte | |
2. proximal promoter (transcription start [-1KB,1KB]) | |
3. gene body (anywhere between transcription start and transcription end) | |
4. transcript (anywhere in an exon)- outputs each exon as a separate line | |
5. first 1/3 transcript- outputs each exon as a separate line | |
6. middle 1/3 transcript- outputs each exon as a separate line | |
7. last 1/3 transcript- outputs each exon as a separate line | |
8. introns- outputs each intron as a separate line | |
""" | |
def splitExons(exons): | |
'''given a list of tuples [(a,b),(c,d)...] representing intervals of exon locations, return a dictionary of tuple lists with the exons split out among first, mid, and last thirds of the overall transcript length. Exons that straddle the split point(s), will be split into separate exon intervals. Returns a dictionary of form: | |
{"first":[(a,b),...],"mid":[(c,d),...],"last":[(e,f),...]}''' | |
l = [i[1]-i[0] for i in exons] #all values should be positive! | |
length = sum(l) | |
Q = length/3 #boundary between first and mid | |
R = 2*length/3 #boundary between mid and high | |
first = [] | |
mid = [] | |
last = [] | |
acc = 0 #accumulated length | |
for i in xrange(len(exons)): | |
acc_new = acc + l[i] | |
if acc_new <= Q: #entirely in the first third | |
first.append(exons[i]) | |
elif Q<acc_new<=R and acc >Q: #entirely in the middle third | |
mid.append(exons[i]) | |
elif acc_new>R and acc>R: #entirely in the last third | |
last.append(exons[i]) | |
elif Q<acc_new<=R and acc<=Q: #straddling first and mid thirds | |
cut_loc = exons[i][0]+Q-acc | |
first.append((exons[i][0],cut_loc)) | |
mid.append((cut_loc,exons[i][1])) | |
elif acc_new>R and Q<acc<=R: #straddling mid and last thirds | |
cut_loc = exons[i][0]+R-acc | |
mid.append((exons[i][0],cut_loc)) | |
last.append((cut_loc,exons[i][1])) | |
elif acc_new>R and acc<=Q: #straddling all | |
cut_loc_1 = exons[i][0]+Q-acc | |
cut_loc_2 = exons[i][1]-(acc_new-R) | |
first.append((exons[i][0],cut_loc_1)) | |
mid.append((cut_loc_1,cut_loc_2)) | |
last.append((cut_loc_2,exons[i][1])) | |
else: raise Exception("Unable to process exon %d: %s"%(i,exons[i])) | |
acc = acc_new | |
return {"first":first,"mid":mid,"last":last} | |
def refgene_stream(genome_filename="refGene.txt"): | |
"""Given a refseq genome file (for example download from: | |
http://hgdownload.soe.ucsc.edu/downloads.html | |
streams out dicts of information about each line of the file in following format: | |
"chrom":chromosome number | |
"refseq_id":refseq ID | |
"txstart":transcription start site | |
"txend": transcription end site | |
"exons": list of (start,end) tuples for all exons in the refseq_id/gene | |
"introns": list of (start,end) tuples for all introns in the refseq_id/gene, should be one less than the number of exons. | |
No uniqueness checks are performed. | |
""" | |
with open(genome_filename,mode="r") as genome_file: | |
for line in genome_file: | |
vals=line.split('\t') | |
#print("len(vals)=",len(vals)) #debugging | |
exonstarts = [int(i) for i in vals[9][:-1].split(",")] #:-1 to get rid of trailing comma | |
exonends = [int(i) for i in vals[10][:-1].split(",")] | |
exons = zip(exonstarts,exonends) | |
exon_bins = splitExons(exons) | |
introns = [(exons[i-1][1],exons[i][0]) for i in xrange(1,len(exons))] | |
yield {"chrom":vals[2],"refseq_id":vals[1],"txstart":int(vals[4]),"txend":int(vals[5]),"introns":introns,"exons":exons,"exon_bins":exon_bins} | |
def distalPromoter(g): | |
'''given a gene/refseq dictionary emitted by refgene_stream, output info about the distal promoter as a tuple''' | |
tx = g["txstart"] | |
#note this can return an interval of form (0,0) if the gene is very close to the end of the chromosome! | |
yield (g["chrom"],str(max(0,tx-5000)),str(max(0,tx-1000)),g["refseq_id"]) | |
def proximalPromoter(g): | |
'''given a gene/refseq dictionary emitted by refgene_stream, output info about the proximal promoter as a tuple''' | |
tx = g["txstart"] | |
yield (g["chrom"],str(max(0,tx-1000)),str(tx+1000),g["refseq_id"]) | |
def geneBody(g): | |
'''given a gene/refseq dictionary emitted by refgene_stream, output info about the gene body as a tuple''' | |
yield (g["chrom"],str(g["txstart"]),str(g["txend"]),g["refseq_id"]) | |
def exonStream(g,subset="all"): | |
'''given a gene/refseq dictionary emitted by refgene_stream, stream out info about each exon. | |
subset argument can take on several values (case sensitive!) | |
subset="all": emit all exons | |
subset="first": emit all exons in the first third of the transcript | |
subset="mid": emit all exons in the middle third of the transcript | |
subset="last": emit all exons in the last third of the transcript. | |
For options "first","mid","last", if an exon lies on the first/mid or mid/last boundary, the exon is partitioned into two exons and assigned to the separate boundaries''' | |
#if subset not in ("all","first","mid","last"): | |
# raise ValueError('Invalid Subset, correct values include "all","first","mid","last"') | |
if subset=="all": | |
exons = g["exons"] | |
else: | |
exons = g["exon_bins"][subset] | |
for i in xrange(len(exons)): | |
yield (g["chrom"],str(exons[i][0]),str(exons[i][1]),g["refseq_id"])#+"_exon"+str(i)) | |
def exonsAll(g): | |
return exonStream(g,subset="all") | |
def exonsFirst(g): | |
return exonStream(g,subset="first") | |
def exonsMid(g): | |
return exonStream(g,subset="mid") | |
def exonsLast(g): | |
return exonStream(g,subset="last") | |
def introns(g): | |
'''given a gene/refseq dictionary emitted by refgene_stream, stream out info about each intron.''' | |
introns = g["introns"] | |
for i in xrange(len(introns)): | |
yield (g["chrom"],str(introns[i][0]),str(introns[i][1]),g["refseq_id"])#+"_intron"+str(i)) | |
def genome2bed(genome,ofilename,func=geneBody): | |
"""write out BED (txt, tab delim) files for a feature (func is a function to stream out the features desired). | |
genome can be a stream (for example from parsing refGene.txt via refgene_stream().""" | |
with open(ofilename,"wb") as ofile: | |
for g in genome: | |
for res in func(g): | |
ofile.write("\t".join(res)+"\n") | |
if __name__=="__main__": | |
#genome = list(refgene_stream("refGene_test.txt")) | |
#assert len(genome[0]["introns"])==len(genome[0]["exons"])-1 | |
genome = list(refgene_stream("refGene.txt")) #big object, but shouldn't take more than a few seconds to load | |
methods = [distalPromoter,proximalPromoter,geneBody,exonsAll,exonsFirst,exonsMid,exonsLast,introns] | |
ofiles = ["refGene%d.txt"%i for i in xrange(1,9)] | |
with open("file_index_refGene.txt","w") as out: | |
for i in xrange(8): | |
out.write(ofiles[i]+"\t"+methods[i].func_name+"\n") | |
genome2bed(genome,ofiles[i],methods[i]) | |
# refGene1.txt distalPromoter | |
# refGene2.txt proximalPromoter | |
# refGene3.txt geneBody | |
# refGene4.txt exonsAll | |
# refGene5.txt exonsFirst | |
# refGene6.txt exonsMid | |
# refGene7.txt exonsLast | |
# refGene8.txt introns |
Thanks for the note. I wrote this so long ago I don't even remember what it was for (maybe a homework assignment?). Future users should consider the gist deprecated. If you have a correct implementation please feel free to link to it so others won't be led astray by my mistakes.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I don't think this code is correct. The refGene format supplies the left and right coordinates of transcription, but the transcription start site depends on the strand (column 3). This script always sets the promoter around the left end of transcription, not necessarily the transcription start site.