Created
July 31, 2017 10:39
-
-
Save kaizhao86/20ab35d3c8dcf0b08e9b45169d325ce6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import Bio.SeqIO,gzip,fnmatch,itertools,math,numpy,os,re,sqlite3,subprocess,sys,tarfile,xlsxwriter | |
| """ | |
| This program reads in our list of known mouse enhancers (from publications), Gli3-binding sites, | |
| and H3k27ac marks. For the k27ac marks, we merge adjacent CHIP-seq binding positions | |
| """ | |
| conn = sqlite3.connect("limb-nouveau.sqlite3") | |
| #Connect to local sqlite3 database | |
| c=conn.cursor() | |
| PATH = "/home/zhaok/Data/20140702_limb/" | |
| ############SET UP DATABASES AND FILES TO LOAD############################# | |
| #Delete existing tables so we can re-write the information | |
| #'mouseGeneRegions' contains locations for each mouse gene of interest | |
| #'Enhancers' table contains a record for each known or putative mouse enhancer (incl k27ac marks) | |
| #For each instance a mouse enhancer is within 100kb up- or down-stream of a gene of interest | |
| #there is an entry in the 'enhancersByGene' table, along with relative & absolute potions | |
| c.execute("DROP TABLE IF EXISTS mouseGeneRegions") | |
| c.execute("CREATE TABLE mouseGeneRegions(id INTEGER PRIMARY KEY, geneNames, chrom INTEGER, "+ | |
| "geneStart INTEGER, geneEnd INTEGER)") | |
| c.execute("DROP TABLE IF EXISTS enhancers") | |
| c.execute("CREATE TABLE enhancers(id INTEGER PRIMARY KEY, chrom INTEGER, enStart INTEGER, "+ | |
| "enEnd INTEGER, phastcons FLOAT,stage,enhancerType,exonCoverage FLOAT)") | |
| c.execute("DROP TABLE IF EXISTS enhancersByGene") | |
| c.execute("CREATE TABLE enhancersByGene(id INTEGER PRIMARY KEY, enhancerID INTEGER,"+ | |
| "mouseGeneID INTEGER,geneNames,relLoc,absLoc,dist Integer)") | |
| #These three tables are used to generate homologous putative enhancer regions | |
| c.execute("DROP TABLE IF EXISTS homologRegions") | |
| c.execute("CREATE TABLE homologRegions(id INTEGER PRIMARY KEY,enhancerid,mafsid,speciesbuild,"+ | |
| "contig,start INTEGER,end INTEGER,mouseContig,mouseStart INTEGER,mouseEnd INTEGER,score FLOAT,size INTEGER,matchDir)") | |
| c.execute("DROP TABLE IF EXISTS mergedHomologs") | |
| c.execute("CREATE TABLE mergedHomologs(id INTEGER PRIMARY KEY,enhancerid,hregionid5 INTEGER,hregionid3 INTEGER,speciesbuild,contig,start INTEGER,end INTEGER,largestDiff INTEGER,seq)") | |
| c.execute("DROP TABLE IF EXISTS mafs") | |
| c.execute("CREATE TABLE mafs(id INTEGER PRIMARY KEY, enhancerid INTEGER, maf)") | |
| ############################################################### | |
| ########################################################################### | |
| #ENHANCERS! | |
| ###########Find putative enhancer marks within 100kb of a gene of interest###################### | |
| ###Import gene-of-interest (for mouse) list from tab-delimited file | |
| for line in open("genesOfInterestMerged.tab","rU"): | |
| (genename,chrom,geneStart,geneEnd)=line.rstrip().split("\t") | |
| c.execute("INSERT INTO mouseGeneRegions(geneNames, chrom, geneStart, geneEnd)VALUES(?,?,?,?)", | |
| (genename,chrom,geneStart,geneEnd)) | |
| ###IMPORT GLI3 marks from Vokes et al. paper | |
| enhancerType="Gli3"; stage=2 | |
| firstLine = True | |
| for line in open("enhancers/vokesGliData"): | |
| if firstLine: #ignore header line | |
| firstLine=False;continue | |
| bsl=line.rstrip().split("\t") | |
| try: | |
| bchrom=int(bsl[1][3:]);bstart=int(bsl[2]);bend=int(bsl[3]) | |
| c.execute("INSERT INTO enhancers(chrom,enStart,enEnd,stage,enhancerType)VALUES(?,?,?,?,?)", | |
| (chrom,bstart,bend,stage,"gli3")) | |
| except(ValueError): #Ignore chromosome Y and MT (value error because they're not integers) | |
| #end we're not interested in them anyway | |
| pass | |
| ###Import list of known enhancers (for mouse) | |
| for line in open("knownenhancers.tab","rU"): | |
| sl=line.rstrip().split("\t") #no header in this file | |
| (name,chrom,enStart,enEnd,ref) = sl | |
| c.execute("INSERT INTO enhancers(chrom,enStart,enEnd,enhancerType)VALUES(?,?,?,?)", | |
| (int(chrom[3:]),enStart,enEnd,"knownEnhancer_"+name)) | |
| #Keeps only chromesome number, casts as float. | |
| conn.commit() #force commit into database | |
| ########################################################################################## | |
| ####Import h3k27ac marks into temporary table | |
| #Temporary table will be formatted similarly to the bed files | |
| #Then we'll scan the table to merge nearby marks | |
| c.execute("DROP TABLE IF EXISTS acMarks") | |
| c.execute("CREATE TEMPORARY TABLE acMarks(id INTEGER PRIMARY KEY, chrom INTEGER,enStart INTEGER, enEnd INTEGER,stage)") | |
| stages = [[2,"enhancers/GSE42413_e10.5_3_h3k27acEnrichedRegions0.00001.bed"], | |
| [3,"enhancers/GSE42413_e11.5_2_h3k27acEnrichedRegions0.00001.bed"]] | |
| for (stage,bedfile) in stages: #read the two bed files, for stage_2 and stage_3 of limb dev. | |
| for line in open(bedfile): | |
| bsl=line.rstrip().split("\t") | |
| try: | |
| bchrom=int(bsl[0][3:]);bstart=int(bsl[1]);bend=int(bsl[2]) | |
| c.execute("INSERT INTO acMarks(chrom,enStart,enEnd,stage)VALUES(?,?,?,?)",(bchrom,bstart,bend,stage)) | |
| except(ValueError): | |
| pass | |
| conn.commit() | |
| #Index the temporary ac mark table | |
| c.execute("DROP INDEX IF EXISTS iacm1");c.execute("CREATE INDEX iacm1 ON acMarks(chrom,enStart,enEnd,stage)") | |
| #iterate through the temporary table from the bed file | |
| #to merge adjacent H3k27ac marks | |
| halfWindowSize=25;windowSize=50 | |
| #iterate by chromosome | |
| for (chrom,) in c.execute("SELECT DISTINCT chrom FROM acMarks").fetchall(): | |
| #we make an empty numpy array, where each element represents a 25bp (half window) | |
| #to do that, we get the last bp for this chromosome | |
| #keep stage 2 and stage 3 marks in the same array | |
| #so we can determine overlaps for stage 2 and stage 3 | |
| highestBp = c.execute("SELECT max(enEnd) FROM acMarks WHERE chrom=?",(chrom,)).fetchone()[0] | |
| marks=numpy.zeros(math.ceil((highestBp+1)/halfWindowSize)) | |
| for (enStart,enEnd) in c.execute("SELECT enStart,enEnd FROM acMarks WHERE chrom=? AND stage=2",(chrom,)).fetchall(): | |
| xStart=int(math.floor(enStart/25));xEnd=int(math.ceil(enEnd/25)) | |
| #xStart and xEnd are array-translated position indices | |
| marks[xStart:xEnd]=[2]*(xEnd-xStart) #write "2" in the array at indices corresponding to the start | |
| #and stop position for this acetylation mark | |
| for (enStart,enEnd) in c.execute("SELECT enStart,enEnd FROM acMarks WHERE chrom=? AND stage=3",(chrom,)).fetchall(): | |
| xStart=int(math.floor(enStart/25));xEnd=int(math.ceil(enEnd/25)) | |
| #xStart and xEnd are array-translated position indices | |
| marks[xStart:xEnd]=marks[xStart:xEnd]+[3]*(xEnd-xStart) #add "3" to the elements that represented this segment | |
| #look for runs of acetylation marks | |
| nextRunStart=0 | |
| lmarks = list(int(i) for i in marks) | |
| #iterate through the chromosome to scan start and ends of runs | |
| while True: | |
| for i in xrange(nextRunStart,len(marks)): #ac mark start scanning | |
| #if we find an acetylation mark, we note where it starts | |
| if lmarks[i]!=0: | |
| runStart=i | |
| runValue=lmarks[i] | |
| break #found a mark, now it's time to look it's end | |
| #scan for the end of the acetylation mark in this run | |
| for i in xrange(runStart+1,len(marks)): | |
| if lmarks[i]!=runValue: | |
| runEnd=i-1 | |
| break #found the end of the mark! | |
| convertedRunStart=25*(runStart-1);convertedRunEnd=25*(runEnd+1) | |
| #change the array indices back to bp positions | |
| #then insert into db | |
| c.execute("INSERT INTO enhancers(chrom,enStart,enEnd,stage,enhancerType)VALUES(?,?,?,?,?)",(chrom,convertedRunStart,convertedRunEnd,"2_3","H3k27ac_type"+str(runValue))) | |
| nextRunStart=runEnd+1 #start scanning for next run of ac marks after the end of this mark | |
| ####Now merged into enhancersByGene######### | |
| #Do a cross join of the mouseGeneRegions and enhancers tables | |
| #Find enhancers within 100kb of each mouse gene | |
| for (enhancersId,mouseGeneId,geneStart,geneEnd,geneNames,enStart,enEnd) in c.execute("SELECT"+ | |
| " enhancers.id,mg.id,geneStart,geneEnd,geneNames,enStart,enEnd FROM mouseGeneRegions as mg,"+ | |
| "enhancers WHERE mg.chrom = enhancers.chrom AND "+ | |
| "(abs(enStart-geneStart)<100000 or abs(enEnd-geneEnd)<100000)").fetchall(): | |
| if enEnd<geneStart: | |
| relLoc="Upstream";absLoc="5'";dist=geneStart-enEnd | |
| elif geneEnd<enStart: | |
| relLoc="Downstream";absLoc="3'";dist=enStart-geneEnd | |
| else: | |
| relLoc="Overlap with gene of interest";absLoc="";dist=0 | |
| #insert the enhancer-gene pair into the db | |
| c.execute("INSERT INTO enhancersByGene(enhancerID,mouseGeneID,geneNames,relLoc,absLoc,dist)"+ | |
| "VALUES(?,?,?,?,?,?)",(enhancersId,mouseGeneId,geneNames,relLoc,absLoc,dist)) | |
| #This crazy CREATE TABLE by SELECT statement creates a table that contains only the enhancers near genes | |
| c.execute("DROP TABLE IF EXISTS enhancersOfInterest") | |
| c.execute("CREATE TABLE enhancersOfInterest AS SELECT enhancers.id as enhancerID,"+ | |
| "chrom,enStart,enEnd FROM enhancers WHERE enhancers.id IN (SELECT enhancerID FROM enhancersByGene)") | |
| c.execute("ALTER TABLE enhancersOfInterest ADD phastcons") #add conservation & exonCoverage fields to the table | |
| c.execute("ALTER TABLE enhancersOfInterest ADD exonCoverage") | |
| ###############Phastcons#################################################### | |
| #Write our enhancers of interest into a bed file, so we can use UCSC's | |
| #bigWigAverageOverBed program to get phastcons averages for the enhancers of interest | |
| #their program is faster than anything we could write in python | |
| bedoutfh=open("temp.bed",'w') | |
| for (id,chrom,enStart,enEnd) in c.execute("SELECT DISTINCT enhancerID,chrom,enStart,enEnd FROM enhancersOfInterest").fetchall(): | |
| bedoutfh.write("chr%i\t%i\t%i\t%i\n"%(chrom,enStart,enEnd,id)) | |
| bedoutfh.close() | |
| #run the external bigWigAverageOverBed file using the phastcons track | |
| rv=subprocess.call(["phastcons/bigWigAverageOverBed","phastcons/mm10.60way.phastCons.bw","temp.bed","pcOut.tab"]) | |
| if rv!=0: | |
| raise Exception("Phastcon run issue") | |
| #now read in the enhancersOfInterest table now with phastcons averages and update | |
| #the table with the phastcons scores | |
| for line in open("phastcons/pcOut.tab","rU"): | |
| (id,size,covered,sumVal,mean0,mean)= line.rstrip().split("\t") | |
| c.execute("UPDATE enhancersOfInterest SET phastcons = ? WHERE enhancerID = ?",(mean,id)) | |
| ################################################################################### | |
| conn.commit() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment