Skip to content

Instantly share code, notes, and snippets.

@kaizhao86
Created July 31, 2017 11:00
Show Gist options
  • Select an option

  • Save kaizhao86/ed0bc3cc7ced5cf1bbbfe9d65d918b9a to your computer and use it in GitHub Desktop.

Select an option

Save kaizhao86/ed0bc3cc7ced5cf1bbbfe9d65d918b9a to your computer and use it in GitHub Desktop.
from PyQt4 import Qt,QtCore,QtGui
import operator,os,sys,sqlite3,time
def joinMsats(parent,conn,c,MAXGAP,MINFLANK):
c.execute("CREATE INDEX IF NOT EXISTS iML1 on msatLocations(readname)")
c.execute("CREATE INDEX IF NOT EXISTS iML2 on msatLocations(readname,start)")
parent.emit(QtCore.SIGNAL("log"),"%s Starting to join msat loci"%time.strftime('%I:%M%p'))
"""
Okay, this bit of code seems superfluous or at least overly complex, but the idea is this:
If two msat loci are proximate (distance is within MAXGAP), then we can create a complex motif of the two merged together.
However, we should also keep an independent copy, in case the joined version is too long for primer design.
Anyway, we'll try all the copies, put them through the primer design process and see how it goes
"""
c.execute("DROP TABLE IF EXISTS msatFlankPairs")
c.execute("CREATE TABLE msatFlankPairs(id INTEGER PRIMARY KEY, readname,flank5start,flank3end,repeatRegionStart,repeatRegionEnd,flankQualScore,avgFlankQualScore)")
flankSizeIgnoredCt=0
for (readname,seq,qual) in c.execute("SELECT readname,seq,qual FROM reads").fetchall():
msatElements=[] #msatElements are the temporary list of elements
for (start,end,id) in c.execute("SELECT start,end,id FROM msatLocations WHERE readname=? ORDER BY start",(readname,)).fetchall():
element={'start':start,'end':end,'id':id,'flank5start':0,'flank3end':len(seq)-1,'level':0}
#naively assign start5 and end5. This is faster than doing it later,
#but we'll replace it if there are more than one element
msatElements.append(element)
if len(msatElements)>1:
msatStack=[msatElements] #a stack of msatelements. First, it's just the independent msat loci,
#but then we'll start combining them.
level=0
while True:
msatElements=msatStack[-1]#get the last element, top thing on the list
msatElements.sort(key=operator.itemgetter('start'))
newElements=[]
combinedElementIDs=[] #let's keep track of which indices we're combining
level+=1
for i in range(1,len(msatElements)):
last=msatElements[i-1]
this=msatElements[i]
if this['start']-last['end']<MAXGAP:
#print "%s %s"%(last['start'], this['end'])
newDict={'start':last['start'],'end':this['end'],'id':"syn"+str(last['id'])+","+
str(this['id']),'level':level}
newElements.append(newDict)
combinedElementIDs.append(i);combinedElementIDs.append(i-1)
if len(newElements)>0:
for i in range(len(msatElements)):
if i not in combinedElementIDs:
newElements.append(msatElements[i])
msatStack.append(newElements) #add uncombined elements
#print "combined"
else:
#print "levels: %i"%(len(msatStack))
break #break out of the infinite loop once we're out of elements
msatElements=[] #clear this out
for levelIndex in range(len(msatStack)): #going bottom up should make sense
for elementIndex in range(len(msatStack[levelIndex])):
element = msatStack[levelIndex][elementIndex]
if element not in msatElements:
if elementIndex==0:
element['flank5start']=0
else:
element['flank5start']=msatStack[levelIndex][elementIndex-1]['end']
if elementIndex==len(msatStack[levelIndex])-1:
element['flank3end']=len(seq)-1
else:
element['flank3end']=msatStack[levelIndex][elementIndex+1]['start']
for element in msatElements: #let's go back through the list of dictionaries we made
if (element['start']-element['flank5start'])<MINFLANK or (element['flank3end']-element['end'])<MINFLANK:
flankSizeIgnoredCt+=1
continue
#ord gets ascii value for each char, so we get the numeric quality store
#we get both the lowest quality score and the average quality score for a segment
flankQualScore=min(min(ord(s) for s in qual[start-MINFLANK:start]),min(ord(s) for s in qual[end:end+MINFLANK]))
avgFlankQualScore=(sum(ord(s) for s in qual[start-MINFLANK:start])+sum(ord(s) for s in
qual[end:end+MINFLANK]))/(2*MINFLANK)
c.execute("INSERT INTO msatFlankPairs(readname,flank5start,flank3end,repeatRegionStart,"
"repeatRegionEnd,flankQualScore,avgFlankQualScore)VALUES(?,?,?,?,?,?,?)",
(readname,element['flank5start'],element['flank3end'],element['start'],
element['end'],flankQualScore,avgFlankQualScore))
#Try to find flanks: For each element, try to get at least MINFLANK
c.execute("CREATE INDEX IF NOT EXISTS mfp1 ON msatFlankPairs(readname)")
c.execute("CREATE INDEX IF NOT EXISTS mfp2 ON msatFlankPairs(flankQualScore desc, avgFlankQualScore desc)")
w
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment