Skip to content

Instantly share code, notes, and snippets.

@kaizhao86
Created July 31, 2017 10:39
Show Gist options
  • Select an option

  • Save kaizhao86/20ab35d3c8dcf0b08e9b45169d325ce6 to your computer and use it in GitHub Desktop.

Select an option

Save kaizhao86/20ab35d3c8dcf0b08e9b45169d325ce6 to your computer and use it in GitHub Desktop.
import Bio.SeqIO,gzip,fnmatch,itertools,math,numpy,os,re,sqlite3,subprocess,sys,tarfile,xlsxwriter
"""
This program reads in our list of known mouse enhancers (from publications), Gli3-binding sites,
and H3k27ac marks. For the k27ac marks, we merge adjacent CHIP-seq binding positions
"""
conn = sqlite3.connect("limb-nouveau.sqlite3")
#Connect to local sqlite3 database
c=conn.cursor()
PATH = "/home/zhaok/Data/20140702_limb/"
############SET UP DATABASES AND FILES TO LOAD#############################
#Delete existing tables so we can re-write the information
#'mouseGeneRegions' contains locations for each mouse gene of interest
#'Enhancers' table contains a record for each known or putative mouse enhancer (incl k27ac marks)
#For each instance a mouse enhancer is within 100kb up- or down-stream of a gene of interest
#there is an entry in the 'enhancersByGene' table, along with relative & absolute potions
c.execute("DROP TABLE IF EXISTS mouseGeneRegions")
c.execute("CREATE TABLE mouseGeneRegions(id INTEGER PRIMARY KEY, geneNames, chrom INTEGER, "+
"geneStart INTEGER, geneEnd INTEGER)")
c.execute("DROP TABLE IF EXISTS enhancers")
c.execute("CREATE TABLE enhancers(id INTEGER PRIMARY KEY, chrom INTEGER, enStart INTEGER, "+
"enEnd INTEGER, phastcons FLOAT,stage,enhancerType,exonCoverage FLOAT)")
c.execute("DROP TABLE IF EXISTS enhancersByGene")
c.execute("CREATE TABLE enhancersByGene(id INTEGER PRIMARY KEY, enhancerID INTEGER,"+
"mouseGeneID INTEGER,geneNames,relLoc,absLoc,dist Integer)")
#These three tables are used to generate homologous putative enhancer regions
c.execute("DROP TABLE IF EXISTS homologRegions")
c.execute("CREATE TABLE homologRegions(id INTEGER PRIMARY KEY,enhancerid,mafsid,speciesbuild,"+
"contig,start INTEGER,end INTEGER,mouseContig,mouseStart INTEGER,mouseEnd INTEGER,score FLOAT,size INTEGER,matchDir)")
c.execute("DROP TABLE IF EXISTS mergedHomologs")
c.execute("CREATE TABLE mergedHomologs(id INTEGER PRIMARY KEY,enhancerid,hregionid5 INTEGER,hregionid3 INTEGER,speciesbuild,contig,start INTEGER,end INTEGER,largestDiff INTEGER,seq)")
c.execute("DROP TABLE IF EXISTS mafs")
c.execute("CREATE TABLE mafs(id INTEGER PRIMARY KEY, enhancerid INTEGER, maf)")
###############################################################
###########################################################################
#ENHANCERS!
###########Find putative enhancer marks within 100kb of a gene of interest######################
###Import gene-of-interest (for mouse) list from tab-delimited file
for line in open("genesOfInterestMerged.tab","rU"):
(genename,chrom,geneStart,geneEnd)=line.rstrip().split("\t")
c.execute("INSERT INTO mouseGeneRegions(geneNames, chrom, geneStart, geneEnd)VALUES(?,?,?,?)",
(genename,chrom,geneStart,geneEnd))
###IMPORT GLI3 marks from Vokes et al. paper
enhancerType="Gli3"; stage=2
firstLine = True
for line in open("enhancers/vokesGliData"):
if firstLine: #ignore header line
firstLine=False;continue
bsl=line.rstrip().split("\t")
try:
bchrom=int(bsl[1][3:]);bstart=int(bsl[2]);bend=int(bsl[3])
c.execute("INSERT INTO enhancers(chrom,enStart,enEnd,stage,enhancerType)VALUES(?,?,?,?,?)",
(chrom,bstart,bend,stage,"gli3"))
except(ValueError): #Ignore chromosome Y and MT (value error because they're not integers)
#end we're not interested in them anyway
pass
###Import list of known enhancers (for mouse)
for line in open("knownenhancers.tab","rU"):
sl=line.rstrip().split("\t") #no header in this file
(name,chrom,enStart,enEnd,ref) = sl
c.execute("INSERT INTO enhancers(chrom,enStart,enEnd,enhancerType)VALUES(?,?,?,?)",
(int(chrom[3:]),enStart,enEnd,"knownEnhancer_"+name))
#Keeps only chromesome number, casts as float.
conn.commit() #force commit into database
##########################################################################################
####Import h3k27ac marks into temporary table
#Temporary table will be formatted similarly to the bed files
#Then we'll scan the table to merge nearby marks
c.execute("DROP TABLE IF EXISTS acMarks")
c.execute("CREATE TEMPORARY TABLE acMarks(id INTEGER PRIMARY KEY, chrom INTEGER,enStart INTEGER, enEnd INTEGER,stage)")
stages = [[2,"enhancers/GSE42413_e10.5_3_h3k27acEnrichedRegions0.00001.bed"],
[3,"enhancers/GSE42413_e11.5_2_h3k27acEnrichedRegions0.00001.bed"]]
for (stage,bedfile) in stages: #read the two bed files, for stage_2 and stage_3 of limb dev.
for line in open(bedfile):
bsl=line.rstrip().split("\t")
try:
bchrom=int(bsl[0][3:]);bstart=int(bsl[1]);bend=int(bsl[2])
c.execute("INSERT INTO acMarks(chrom,enStart,enEnd,stage)VALUES(?,?,?,?)",(bchrom,bstart,bend,stage))
except(ValueError):
pass
conn.commit()
#Index the temporary ac mark table
c.execute("DROP INDEX IF EXISTS iacm1");c.execute("CREATE INDEX iacm1 ON acMarks(chrom,enStart,enEnd,stage)")
#iterate through the temporary table from the bed file
#to merge adjacent H3k27ac marks
halfWindowSize=25;windowSize=50
#iterate by chromosome
for (chrom,) in c.execute("SELECT DISTINCT chrom FROM acMarks").fetchall():
#we make an empty numpy array, where each element represents a 25bp (half window)
#to do that, we get the last bp for this chromosome
#keep stage 2 and stage 3 marks in the same array
#so we can determine overlaps for stage 2 and stage 3
highestBp = c.execute("SELECT max(enEnd) FROM acMarks WHERE chrom=?",(chrom,)).fetchone()[0]
marks=numpy.zeros(math.ceil((highestBp+1)/halfWindowSize))
for (enStart,enEnd) in c.execute("SELECT enStart,enEnd FROM acMarks WHERE chrom=? AND stage=2",(chrom,)).fetchall():
xStart=int(math.floor(enStart/25));xEnd=int(math.ceil(enEnd/25))
#xStart and xEnd are array-translated position indices
marks[xStart:xEnd]=[2]*(xEnd-xStart) #write "2" in the array at indices corresponding to the start
#and stop position for this acetylation mark
for (enStart,enEnd) in c.execute("SELECT enStart,enEnd FROM acMarks WHERE chrom=? AND stage=3",(chrom,)).fetchall():
xStart=int(math.floor(enStart/25));xEnd=int(math.ceil(enEnd/25))
#xStart and xEnd are array-translated position indices
marks[xStart:xEnd]=marks[xStart:xEnd]+[3]*(xEnd-xStart) #add "3" to the elements that represented this segment
#look for runs of acetylation marks
nextRunStart=0
lmarks = list(int(i) for i in marks)
#iterate through the chromosome to scan start and ends of runs
while True:
for i in xrange(nextRunStart,len(marks)): #ac mark start scanning
#if we find an acetylation mark, we note where it starts
if lmarks[i]!=0:
runStart=i
runValue=lmarks[i]
break #found a mark, now it's time to look it's end
#scan for the end of the acetylation mark in this run
for i in xrange(runStart+1,len(marks)):
if lmarks[i]!=runValue:
runEnd=i-1
break #found the end of the mark!
convertedRunStart=25*(runStart-1);convertedRunEnd=25*(runEnd+1)
#change the array indices back to bp positions
#then insert into db
c.execute("INSERT INTO enhancers(chrom,enStart,enEnd,stage,enhancerType)VALUES(?,?,?,?,?)",(chrom,convertedRunStart,convertedRunEnd,"2_3","H3k27ac_type"+str(runValue)))
nextRunStart=runEnd+1 #start scanning for next run of ac marks after the end of this mark
####Now merged into enhancersByGene#########
#Do a cross join of the mouseGeneRegions and enhancers tables
#Find enhancers within 100kb of each mouse gene
for (enhancersId,mouseGeneId,geneStart,geneEnd,geneNames,enStart,enEnd) in c.execute("SELECT"+
" enhancers.id,mg.id,geneStart,geneEnd,geneNames,enStart,enEnd FROM mouseGeneRegions as mg,"+
"enhancers WHERE mg.chrom = enhancers.chrom AND "+
"(abs(enStart-geneStart)<100000 or abs(enEnd-geneEnd)<100000)").fetchall():
if enEnd<geneStart:
relLoc="Upstream";absLoc="5'";dist=geneStart-enEnd
elif geneEnd<enStart:
relLoc="Downstream";absLoc="3'";dist=enStart-geneEnd
else:
relLoc="Overlap with gene of interest";absLoc="";dist=0
#insert the enhancer-gene pair into the db
c.execute("INSERT INTO enhancersByGene(enhancerID,mouseGeneID,geneNames,relLoc,absLoc,dist)"+
"VALUES(?,?,?,?,?,?)",(enhancersId,mouseGeneId,geneNames,relLoc,absLoc,dist))
#This crazy CREATE TABLE by SELECT statement creates a table that contains only the enhancers near genes
c.execute("DROP TABLE IF EXISTS enhancersOfInterest")
c.execute("CREATE TABLE enhancersOfInterest AS SELECT enhancers.id as enhancerID,"+
"chrom,enStart,enEnd FROM enhancers WHERE enhancers.id IN (SELECT enhancerID FROM enhancersByGene)")
c.execute("ALTER TABLE enhancersOfInterest ADD phastcons") #add conservation & exonCoverage fields to the table
c.execute("ALTER TABLE enhancersOfInterest ADD exonCoverage")
###############Phastcons####################################################
#Write our enhancers of interest into a bed file, so we can use UCSC's
#bigWigAverageOverBed program to get phastcons averages for the enhancers of interest
#their program is faster than anything we could write in python
bedoutfh=open("temp.bed",'w')
for (id,chrom,enStart,enEnd) in c.execute("SELECT DISTINCT enhancerID,chrom,enStart,enEnd FROM enhancersOfInterest").fetchall():
bedoutfh.write("chr%i\t%i\t%i\t%i\n"%(chrom,enStart,enEnd,id))
bedoutfh.close()
#run the external bigWigAverageOverBed file using the phastcons track
rv=subprocess.call(["phastcons/bigWigAverageOverBed","phastcons/mm10.60way.phastCons.bw","temp.bed","pcOut.tab"])
if rv!=0:
raise Exception("Phastcon run issue")
#now read in the enhancersOfInterest table now with phastcons averages and update
#the table with the phastcons scores
for line in open("phastcons/pcOut.tab","rU"):
(id,size,covered,sumVal,mean0,mean)= line.rstrip().split("\t")
c.execute("UPDATE enhancersOfInterest SET phastcons = ? WHERE enhancerID = ?",(mean,id))
###################################################################################
conn.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment