kaizhao86 · July 31, 2017 10:39
diff --git a/limb2enhancers.py b/limb2enhancers.py
 import Bio.SeqIO,gzip,fnmatch,itertools,math,numpy,os,re,sqlite3,subprocess,sys,tarfile,xlsxwriter
 """
 This program reads in our list of known mouse enhancers (from publications), Gli3-binding sites, 
 and H3k27ac marks. For the k27ac marks, we merge adjacent CHIP-seq binding positions
 """

 conn = sqlite3.connect("limb-nouveau.sqlite3")
  #Connect to local sqlite3 database
 c=conn.cursor()

 PATH = "/home/zhaok/Data/20140702_limb/"
 ############SET UP DATABASES AND FILES TO LOAD#############################
 #Delete existing tables so we can re-write the information

 #'mouseGeneRegions' contains locations for each mouse gene of interest
 #'Enhancers' table contains a record for each known or putative mouse enhancer (incl k27ac marks)
 #For each instance a mouse enhancer is within 100kb up- or down-stream of a gene of interest
 #there is an entry in the 'enhancersByGene' table, along with relative & absolute potions
 c.execute("DROP TABLE IF EXISTS mouseGeneRegions")
 c.execute("CREATE TABLE mouseGeneRegions(id INTEGER PRIMARY KEY, geneNames, chrom INTEGER, "+
  "geneStart INTEGER, geneEnd INTEGER)")
 c.execute("DROP TABLE IF EXISTS enhancers")
 c.execute("CREATE TABLE enhancers(id INTEGER PRIMARY KEY, chrom INTEGER, enStart INTEGER, "+
  "enEnd INTEGER, phastcons FLOAT,stage,enhancerType,exonCoverage FLOAT)")
 c.execute("DROP TABLE IF EXISTS enhancersByGene")
 c.execute("CREATE TABLE enhancersByGene(id INTEGER PRIMARY KEY, enhancerID INTEGER,"+
  "mouseGeneID INTEGER,geneNames,relLoc,absLoc,dist Integer)")

 #These three tables are used to generate homologous putative enhancer regions

 c.execute("DROP TABLE IF EXISTS homologRegions")
 c.execute("CREATE TABLE homologRegions(id INTEGER PRIMARY KEY,enhancerid,mafsid,speciesbuild,"+
  "contig,start INTEGER,end INTEGER,mouseContig,mouseStart INTEGER,mouseEnd INTEGER,score FLOAT,size INTEGER,matchDir)")
 c.execute("DROP TABLE IF EXISTS mergedHomologs")
 c.execute("CREATE TABLE mergedHomologs(id INTEGER PRIMARY KEY,enhancerid,hregionid5 INTEGER,hregionid3 INTEGER,speciesbuild,contig,start INTEGER,end INTEGER,largestDiff INTEGER,seq)")
 c.execute("DROP TABLE IF EXISTS mafs")
 c.execute("CREATE TABLE mafs(id INTEGER PRIMARY KEY, enhancerid INTEGER, maf)")
 ###############################################################

 ###########################################################################
 #ENHANCERS!

 ###########Find putative enhancer marks within 100kb of a gene of interest######################
 ###Import gene-of-interest (for mouse) list from tab-delimited file
 for line in open("genesOfInterestMerged.tab","rU"):
  (genename,chrom,geneStart,geneEnd)=line.rstrip().split("\t")
  c.execute("INSERT INTO mouseGeneRegions(geneNames, chrom, geneStart, geneEnd)VALUES(?,?,?,?)",
  (genename,chrom,geneStart,geneEnd))

 ###IMPORT GLI3 marks from Vokes et al. paper
 enhancerType="Gli3"; stage=2
 firstLine = True
 for line in open("enhancers/vokesGliData"):
  if firstLine: #ignore header line
    firstLine=False;continue
  bsl=line.rstrip().split("\t")
  try:
    bchrom=int(bsl[1][3:]);bstart=int(bsl[2]);bend=int(bsl[3])
    c.execute("INSERT INTO enhancers(chrom,enStart,enEnd,stage,enhancerType)VALUES(?,?,?,?,?)",
      (chrom,bstart,bend,stage,"gli3"))
  except(ValueError): #Ignore chromosome Y and MT (value error because they're not integers)
                      #end we're not interested in them anyway
    pass

 ###Import list of known enhancers (for mouse)
 for line in open("knownenhancers.tab","rU"):
  sl=line.rstrip().split("\t") #no header in this file
  (name,chrom,enStart,enEnd,ref) = sl
  c.execute("INSERT INTO enhancers(chrom,enStart,enEnd,enhancerType)VALUES(?,?,?,?)",
    (int(chrom[3:]),enStart,enEnd,"knownEnhancer_"+name))
    #Keeps only chromesome number, casts as float. 
 conn.commit() #force commit into database

 ##########################################################################################
 ####Import h3k27ac marks into temporary table
 #Temporary table will be formatted similarly to the bed files
 #Then we'll scan the table to merge nearby marks
 c.execute("DROP TABLE IF EXISTS acMarks")
 c.execute("CREATE TEMPORARY TABLE acMarks(id INTEGER PRIMARY KEY, chrom INTEGER,enStart INTEGER, enEnd INTEGER,stage)")
 stages = [[2,"enhancers/GSE42413_e10.5_3_h3k27acEnrichedRegions0.00001.bed"],
    [3,"enhancers/GSE42413_e11.5_2_h3k27acEnrichedRegions0.00001.bed"]]
 for (stage,bedfile) in stages: #read the two bed files, for stage_2 and stage_3 of limb dev.
  for line in open(bedfile):
    bsl=line.rstrip().split("\t")
    try:
      bchrom=int(bsl[0][3:]);bstart=int(bsl[1]);bend=int(bsl[2])
      c.execute("INSERT INTO acMarks(chrom,enStart,enEnd,stage)VALUES(?,?,?,?)",(bchrom,bstart,bend,stage)) 
    except(ValueError):
      pass
 conn.commit()
 #Index the temporary ac mark table
 c.execute("DROP INDEX IF EXISTS iacm1");c.execute("CREATE INDEX iacm1 ON acMarks(chrom,enStart,enEnd,stage)")

 #iterate through the temporary table from the bed file
 #to merge adjacent H3k27ac marks

 halfWindowSize=25;windowSize=50
 #iterate by chromosome
 for (chrom,) in c.execute("SELECT DISTINCT chrom FROM acMarks").fetchall():
  #we make an empty numpy array, where each element represents a 25bp (half window) 
  #to do that, we get the last bp for this chromosome
  #keep stage 2 and stage 3 marks in the same array
  #so we can determine overlaps for stage 2 and stage 3
  highestBp = c.execute("SELECT max(enEnd) FROM acMarks WHERE chrom=?",(chrom,)).fetchone()[0]
  marks=numpy.zeros(math.ceil((highestBp+1)/halfWindowSize))
  for (enStart,enEnd) in c.execute("SELECT enStart,enEnd FROM acMarks WHERE chrom=? AND stage=2",(chrom,)).fetchall():
    xStart=int(math.floor(enStart/25));xEnd=int(math.ceil(enEnd/25))
      #xStart and xEnd are array-translated position indices
    marks[xStart:xEnd]=[2]*(xEnd-xStart) #write "2" in the array at indices corresponding to the start 
      #and stop position for this acetylation mark
  for (enStart,enEnd) in c.execute("SELECT enStart,enEnd FROM acMarks WHERE chrom=? AND stage=3",(chrom,)).fetchall():
    xStart=int(math.floor(enStart/25));xEnd=int(math.ceil(enEnd/25))
      #xStart and xEnd are array-translated position indices
    marks[xStart:xEnd]=marks[xStart:xEnd]+[3]*(xEnd-xStart) #add "3" to the elements that represented this segment

  #look for runs of acetylation marks
  nextRunStart=0
  lmarks = list(int(i) for i in marks)
  #iterate through the chromosome to scan start and ends of runs
  while True:
      for i in xrange(nextRunStart,len(marks)): #ac mark start scanning
        #if we find an acetylation mark, we note where it starts
        if lmarks[i]!=0: 
          runStart=i
          runValue=lmarks[i]
          break #found a mark, now it's time to look it's end
      #scan for the end of the acetylation mark in this run
      for i in xrange(runStart+1,len(marks)):
        if lmarks[i]!=runValue:
          runEnd=i-1
          break #found the end of the mark!
      convertedRunStart=25*(runStart-1);convertedRunEnd=25*(runEnd+1)
        #change the array indices back to bp positions
        #then insert into db
      c.execute("INSERT INTO enhancers(chrom,enStart,enEnd,stage,enhancerType)VALUES(?,?,?,?,?)",(chrom,convertedRunStart,convertedRunEnd,"2_3","H3k27ac_type"+str(runValue)))
      nextRunStart=runEnd+1 #start scanning for next run of ac marks after the end of this mark

 ####Now merged into enhancersByGene#########
 #Do a cross join of the mouseGeneRegions and enhancers tables
 #Find enhancers within 100kb of each mouse gene
 for (enhancersId,mouseGeneId,geneStart,geneEnd,geneNames,enStart,enEnd) in c.execute("SELECT"+
  " enhancers.id,mg.id,geneStart,geneEnd,geneNames,enStart,enEnd FROM mouseGeneRegions as mg,"+
  "enhancers WHERE mg.chrom = enhancers.chrom AND "+
  "(abs(enStart-geneStart)<100000 or abs(enEnd-geneEnd)<100000)").fetchall(): 
  if enEnd<geneStart: 
    relLoc="Upstream";absLoc="5'";dist=geneStart-enEnd      
  elif geneEnd<enStart:
    relLoc="Downstream";absLoc="3'";dist=enStart-geneEnd
  else:
    relLoc="Overlap with gene of interest";absLoc="";dist=0
  #insert the enhancer-gene pair into the db
  c.execute("INSERT INTO enhancersByGene(enhancerID,mouseGeneID,geneNames,relLoc,absLoc,dist)"+
    "VALUES(?,?,?,?,?,?)",(enhancersId,mouseGeneId,geneNames,relLoc,absLoc,dist))


 #This crazy CREATE TABLE by SELECT statement creates a table that contains only the enhancers near genes
 c.execute("DROP TABLE IF EXISTS enhancersOfInterest")
 c.execute("CREATE TABLE enhancersOfInterest AS SELECT enhancers.id as enhancerID,"+
  "chrom,enStart,enEnd FROM enhancers WHERE enhancers.id IN (SELECT enhancerID FROM enhancersByGene)")
 c.execute("ALTER TABLE enhancersOfInterest ADD phastcons") #add conservation & exonCoverage fields to the table
 c.execute("ALTER TABLE enhancersOfInterest ADD exonCoverage")
 ###############Phastcons####################################################
 #Write our enhancers of interest into a bed file, so we can use UCSC's 
 #bigWigAverageOverBed program to get phastcons averages for the enhancers of interest
 #their program is faster than anything we could write in python
 bedoutfh=open("temp.bed",'w')
 for (id,chrom,enStart,enEnd) in c.execute("SELECT DISTINCT enhancerID,chrom,enStart,enEnd FROM enhancersOfInterest").fetchall():
  bedoutfh.write("chr%i\t%i\t%i\t%i\n"%(chrom,enStart,enEnd,id))
 bedoutfh.close()

 #run the external bigWigAverageOverBed file using the phastcons track
 rv=subprocess.call(["phastcons/bigWigAverageOverBed","phastcons/mm10.60way.phastCons.bw","temp.bed","pcOut.tab"])
 if rv!=0:
  raise Exception("Phastcon run issue")

 #now read in the enhancersOfInterest table now with phastcons averages and update
 #the table with the phastcons scores
 for line in open("phastcons/pcOut.tab","rU"):
  (id,size,covered,sumVal,mean0,mean)= line.rstrip().split("\t")
  c.execute("UPDATE enhancersOfInterest SET phastcons = ? WHERE enhancerID = ?",(mean,id))
 ###################################################################################

 conn.commit()
	import Bio.SeqIO,gzip,fnmatch,itertools,math,numpy,os,re,sqlite3,subprocess,sys,tarfile,xlsxwriter
	"""
	This program reads in our list of known mouse enhancers (from publications), Gli3-binding sites,
	and H3k27ac marks. For the k27ac marks, we merge adjacent CHIP-seq binding positions
	"""

	conn = sqlite3.connect("limb-nouveau.sqlite3")
	#Connect to local sqlite3 database
	c=conn.cursor()

	PATH = "/home/zhaok/Data/20140702_limb/"
	############SET UP DATABASES AND FILES TO LOAD#############################
	#Delete existing tables so we can re-write the information

	#'mouseGeneRegions' contains locations for each mouse gene of interest
	#'Enhancers' table contains a record for each known or putative mouse enhancer (incl k27ac marks)
	#For each instance a mouse enhancer is within 100kb up- or down-stream of a gene of interest
	#there is an entry in the 'enhancersByGene' table, along with relative & absolute potions
	c.execute("DROP TABLE IF EXISTS mouseGeneRegions")
	c.execute("CREATE TABLE mouseGeneRegions(id INTEGER PRIMARY KEY, geneNames, chrom INTEGER, "+
	"geneStart INTEGER, geneEnd INTEGER)")
	c.execute("DROP TABLE IF EXISTS enhancers")
	c.execute("CREATE TABLE enhancers(id INTEGER PRIMARY KEY, chrom INTEGER, enStart INTEGER, "+
	"enEnd INTEGER, phastcons FLOAT,stage,enhancerType,exonCoverage FLOAT)")
	c.execute("DROP TABLE IF EXISTS enhancersByGene")
	c.execute("CREATE TABLE enhancersByGene(id INTEGER PRIMARY KEY, enhancerID INTEGER,"+
	"mouseGeneID INTEGER,geneNames,relLoc,absLoc,dist Integer)")

	#These three tables are used to generate homologous putative enhancer regions

	c.execute("DROP TABLE IF EXISTS homologRegions")
	c.execute("CREATE TABLE homologRegions(id INTEGER PRIMARY KEY,enhancerid,mafsid,speciesbuild,"+
	"contig,start INTEGER,end INTEGER,mouseContig,mouseStart INTEGER,mouseEnd INTEGER,score FLOAT,size INTEGER,matchDir)")
	c.execute("DROP TABLE IF EXISTS mergedHomologs")
	c.execute("CREATE TABLE mergedHomologs(id INTEGER PRIMARY KEY,enhancerid,hregionid5 INTEGER,hregionid3 INTEGER,speciesbuild,contig,start INTEGER,end INTEGER,largestDiff INTEGER,seq)")
	c.execute("DROP TABLE IF EXISTS mafs")
	c.execute("CREATE TABLE mafs(id INTEGER PRIMARY KEY, enhancerid INTEGER, maf)")
	###############################################################

	###########################################################################
	#ENHANCERS!

	###########Find putative enhancer marks within 100kb of a gene of interest######################
	###Import gene-of-interest (for mouse) list from tab-delimited file
	for line in open("genesOfInterestMerged.tab","rU"):
	(genename,chrom,geneStart,geneEnd)=line.rstrip().split("\t")
	c.execute("INSERT INTO mouseGeneRegions(geneNames, chrom, geneStart, geneEnd)VALUES(?,?,?,?)",
	(genename,chrom,geneStart,geneEnd))

	###IMPORT GLI3 marks from Vokes et al. paper
	enhancerType="Gli3"; stage=2
	firstLine = True
	for line in open("enhancers/vokesGliData"):
	if firstLine: #ignore header line
	firstLine=False;continue
	bsl=line.rstrip().split("\t")
	try:
	bchrom=int(bsl[1][3:]);bstart=int(bsl[2]);bend=int(bsl[3])
	c.execute("INSERT INTO enhancers(chrom,enStart,enEnd,stage,enhancerType)VALUES(?,?,?,?,?)",
	(chrom,bstart,bend,stage,"gli3"))
	except(ValueError): #Ignore chromosome Y and MT (value error because they're not integers)
	#end we're not interested in them anyway
	pass

	###Import list of known enhancers (for mouse)
	for line in open("knownenhancers.tab","rU"):
	sl=line.rstrip().split("\t") #no header in this file
	(name,chrom,enStart,enEnd,ref) = sl
	c.execute("INSERT INTO enhancers(chrom,enStart,enEnd,enhancerType)VALUES(?,?,?,?)",
	(int(chrom[3:]),enStart,enEnd,"knownEnhancer_"+name))
	#Keeps only chromesome number, casts as float.
	conn.commit() #force commit into database

	##########################################################################################
	####Import h3k27ac marks into temporary table
	#Temporary table will be formatted similarly to the bed files
	#Then we'll scan the table to merge nearby marks
	c.execute("DROP TABLE IF EXISTS acMarks")
	c.execute("CREATE TEMPORARY TABLE acMarks(id INTEGER PRIMARY KEY, chrom INTEGER,enStart INTEGER, enEnd INTEGER,stage)")
	stages = [[2,"enhancers/GSE42413_e10.5_3_h3k27acEnrichedRegions0.00001.bed"],
	[3,"enhancers/GSE42413_e11.5_2_h3k27acEnrichedRegions0.00001.bed"]]
	for (stage,bedfile) in stages: #read the two bed files, for stage_2 and stage_3 of limb dev.
	for line in open(bedfile):
	bsl=line.rstrip().split("\t")
	try:
	bchrom=int(bsl[0][3:]);bstart=int(bsl[1]);bend=int(bsl[2])
	c.execute("INSERT INTO acMarks(chrom,enStart,enEnd,stage)VALUES(?,?,?,?)",(bchrom,bstart,bend,stage))
	except(ValueError):
	pass
	conn.commit()
	#Index the temporary ac mark table
	c.execute("DROP INDEX IF EXISTS iacm1");c.execute("CREATE INDEX iacm1 ON acMarks(chrom,enStart,enEnd,stage)")

	#iterate through the temporary table from the bed file
	#to merge adjacent H3k27ac marks

	halfWindowSize=25;windowSize=50
	#iterate by chromosome
	for (chrom,) in c.execute("SELECT DISTINCT chrom FROM acMarks").fetchall():
	#we make an empty numpy array, where each element represents a 25bp (half window)
	#to do that, we get the last bp for this chromosome
	#keep stage 2 and stage 3 marks in the same array
	#so we can determine overlaps for stage 2 and stage 3
	highestBp = c.execute("SELECT max(enEnd) FROM acMarks WHERE chrom=?",(chrom,)).fetchone()[0]
	marks=numpy.zeros(math.ceil((highestBp+1)/halfWindowSize))
	for (enStart,enEnd) in c.execute("SELECT enStart,enEnd FROM acMarks WHERE chrom=? AND stage=2",(chrom,)).fetchall():
	xStart=int(math.floor(enStart/25));xEnd=int(math.ceil(enEnd/25))
	#xStart and xEnd are array-translated position indices
	marks[xStart:xEnd]=[2]*(xEnd-xStart) #write "2" in the array at indices corresponding to the start
	#and stop position for this acetylation mark
	for (enStart,enEnd) in c.execute("SELECT enStart,enEnd FROM acMarks WHERE chrom=? AND stage=3",(chrom,)).fetchall():
	xStart=int(math.floor(enStart/25));xEnd=int(math.ceil(enEnd/25))
	#xStart and xEnd are array-translated position indices
	marks[xStart:xEnd]=marks[xStart:xEnd]+[3]*(xEnd-xStart) #add "3" to the elements that represented this segment

	#look for runs of acetylation marks
	nextRunStart=0
	lmarks = list(int(i) for i in marks)
	#iterate through the chromosome to scan start and ends of runs
	while True:
	for i in xrange(nextRunStart,len(marks)): #ac mark start scanning
	#if we find an acetylation mark, we note where it starts
	if lmarks[i]!=0:
	runStart=i
	runValue=lmarks[i]
	break #found a mark, now it's time to look it's end
	#scan for the end of the acetylation mark in this run
	for i in xrange(runStart+1,len(marks)):
	if lmarks[i]!=runValue:
	runEnd=i-1
	break #found the end of the mark!
	convertedRunStart=25(runStart-1);convertedRunEnd=25(runEnd+1)
	#change the array indices back to bp positions
	#then insert into db
	c.execute("INSERT INTO enhancers(chrom,enStart,enEnd,stage,enhancerType)VALUES(?,?,?,?,?)",(chrom,convertedRunStart,convertedRunEnd,"2_3","H3k27ac_type"+str(runValue)))
	nextRunStart=runEnd+1 #start scanning for next run of ac marks after the end of this mark

	####Now merged into enhancersByGene#########
	#Do a cross join of the mouseGeneRegions and enhancers tables
	#Find enhancers within 100kb of each mouse gene
	for (enhancersId,mouseGeneId,geneStart,geneEnd,geneNames,enStart,enEnd) in c.execute("SELECT"+
	" enhancers.id,mg.id,geneStart,geneEnd,geneNames,enStart,enEnd FROM mouseGeneRegions as mg,"+
	"enhancers WHERE mg.chrom = enhancers.chrom AND "+
	"(abs(enStart-geneStart)<100000 or abs(enEnd-geneEnd)<100000)").fetchall():
	if enEnd<geneStart:
	relLoc="Upstream";absLoc="5'";dist=geneStart-enEnd
	elif geneEnd<enStart:
	relLoc="Downstream";absLoc="3'";dist=enStart-geneEnd
	else:
	relLoc="Overlap with gene of interest";absLoc="";dist=0
	#insert the enhancer-gene pair into the db
	c.execute("INSERT INTO enhancersByGene(enhancerID,mouseGeneID,geneNames,relLoc,absLoc,dist)"+
	"VALUES(?,?,?,?,?,?)",(enhancersId,mouseGeneId,geneNames,relLoc,absLoc,dist))


	#This crazy CREATE TABLE by SELECT statement creates a table that contains only the enhancers near genes
	c.execute("DROP TABLE IF EXISTS enhancersOfInterest")
	c.execute("CREATE TABLE enhancersOfInterest AS SELECT enhancers.id as enhancerID,"+
	"chrom,enStart,enEnd FROM enhancers WHERE enhancers.id IN (SELECT enhancerID FROM enhancersByGene)")
	c.execute("ALTER TABLE enhancersOfInterest ADD phastcons") #add conservation & exonCoverage fields to the table
	c.execute("ALTER TABLE enhancersOfInterest ADD exonCoverage")
	###############Phastcons####################################################
	#Write our enhancers of interest into a bed file, so we can use UCSC's
	#bigWigAverageOverBed program to get phastcons averages for the enhancers of interest
	#their program is faster than anything we could write in python
	bedoutfh=open("temp.bed",'w')
	for (id,chrom,enStart,enEnd) in c.execute("SELECT DISTINCT enhancerID,chrom,enStart,enEnd FROM enhancersOfInterest").fetchall():
	bedoutfh.write("chr%i\t%i\t%i\t%i\n"%(chrom,enStart,enEnd,id))
	bedoutfh.close()

	#run the external bigWigAverageOverBed file using the phastcons track
	rv=subprocess.call(["phastcons/bigWigAverageOverBed","phastcons/mm10.60way.phastCons.bw","temp.bed","pcOut.tab"])
	if rv!=0:
	raise Exception("Phastcon run issue")

	#now read in the enhancersOfInterest table now with phastcons averages and update
	#the table with the phastcons scores
	for line in open("phastcons/pcOut.tab","rU"):
	(id,size,covered,sumVal,mean0,mean)= line.rstrip().split("\t")
	c.execute("UPDATE enhancersOfInterest SET phastcons = ? WHERE enhancerID = ?",(mean,id))
	###################################################################################

	conn.commit()
No results found