Created
July 31, 2017 11:00
-
-
Save kaizhao86/ed0bc3cc7ced5cf1bbbfe9d65d918b9a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from PyQt4 import Qt,QtCore,QtGui | |
| import operator,os,sys,sqlite3,time | |
| def joinMsats(parent,conn,c,MAXGAP,MINFLANK): | |
| c.execute("CREATE INDEX IF NOT EXISTS iML1 on msatLocations(readname)") | |
| c.execute("CREATE INDEX IF NOT EXISTS iML2 on msatLocations(readname,start)") | |
| parent.emit(QtCore.SIGNAL("log"),"%s Starting to join msat loci"%time.strftime('%I:%M%p')) | |
| """ | |
| Okay, this bit of code seems superfluous or at least overly complex, but the idea is this: | |
| If two msat loci are proximate (distance is within MAXGAP), then we can create a complex motif of the two merged together. | |
| However, we should also keep an independent copy, in case the joined version is too long for primer design. | |
| Anyway, we'll try all the copies, put them through the primer design process and see how it goes | |
| """ | |
| c.execute("DROP TABLE IF EXISTS msatFlankPairs") | |
| c.execute("CREATE TABLE msatFlankPairs(id INTEGER PRIMARY KEY, readname,flank5start,flank3end,repeatRegionStart,repeatRegionEnd,flankQualScore,avgFlankQualScore)") | |
| flankSizeIgnoredCt=0 | |
| for (readname,seq,qual) in c.execute("SELECT readname,seq,qual FROM reads").fetchall(): | |
| msatElements=[] #msatElements are the temporary list of elements | |
| for (start,end,id) in c.execute("SELECT start,end,id FROM msatLocations WHERE readname=? ORDER BY start",(readname,)).fetchall(): | |
| element={'start':start,'end':end,'id':id,'flank5start':0,'flank3end':len(seq)-1,'level':0} | |
| #naively assign start5 and end5. This is faster than doing it later, | |
| #but we'll replace it if there are more than one element | |
| msatElements.append(element) | |
| if len(msatElements)>1: | |
| msatStack=[msatElements] #a stack of msatelements. First, it's just the independent msat loci, | |
| #but then we'll start combining them. | |
| level=0 | |
| while True: | |
| msatElements=msatStack[-1]#get the last element, top thing on the list | |
| msatElements.sort(key=operator.itemgetter('start')) | |
| newElements=[] | |
| combinedElementIDs=[] #let's keep track of which indices we're combining | |
| level+=1 | |
| for i in range(1,len(msatElements)): | |
| last=msatElements[i-1] | |
| this=msatElements[i] | |
| if this['start']-last['end']<MAXGAP: | |
| #print "%s %s"%(last['start'], this['end']) | |
| newDict={'start':last['start'],'end':this['end'],'id':"syn"+str(last['id'])+","+ | |
| str(this['id']),'level':level} | |
| newElements.append(newDict) | |
| combinedElementIDs.append(i);combinedElementIDs.append(i-1) | |
| if len(newElements)>0: | |
| for i in range(len(msatElements)): | |
| if i not in combinedElementIDs: | |
| newElements.append(msatElements[i]) | |
| msatStack.append(newElements) #add uncombined elements | |
| #print "combined" | |
| else: | |
| #print "levels: %i"%(len(msatStack)) | |
| break #break out of the infinite loop once we're out of elements | |
| msatElements=[] #clear this out | |
| for levelIndex in range(len(msatStack)): #going bottom up should make sense | |
| for elementIndex in range(len(msatStack[levelIndex])): | |
| element = msatStack[levelIndex][elementIndex] | |
| if element not in msatElements: | |
| if elementIndex==0: | |
| element['flank5start']=0 | |
| else: | |
| element['flank5start']=msatStack[levelIndex][elementIndex-1]['end'] | |
| if elementIndex==len(msatStack[levelIndex])-1: | |
| element['flank3end']=len(seq)-1 | |
| else: | |
| element['flank3end']=msatStack[levelIndex][elementIndex+1]['start'] | |
| for element in msatElements: #let's go back through the list of dictionaries we made | |
| if (element['start']-element['flank5start'])<MINFLANK or (element['flank3end']-element['end'])<MINFLANK: | |
| flankSizeIgnoredCt+=1 | |
| continue | |
| #ord gets ascii value for each char, so we get the numeric quality store | |
| #we get both the lowest quality score and the average quality score for a segment | |
| flankQualScore=min(min(ord(s) for s in qual[start-MINFLANK:start]),min(ord(s) for s in qual[end:end+MINFLANK])) | |
| avgFlankQualScore=(sum(ord(s) for s in qual[start-MINFLANK:start])+sum(ord(s) for s in | |
| qual[end:end+MINFLANK]))/(2*MINFLANK) | |
| c.execute("INSERT INTO msatFlankPairs(readname,flank5start,flank3end,repeatRegionStart," | |
| "repeatRegionEnd,flankQualScore,avgFlankQualScore)VALUES(?,?,?,?,?,?,?)", | |
| (readname,element['flank5start'],element['flank3end'],element['start'], | |
| element['end'],flankQualScore,avgFlankQualScore)) | |
| #Try to find flanks: For each element, try to get at least MINFLANK | |
| c.execute("CREATE INDEX IF NOT EXISTS mfp1 ON msatFlankPairs(readname)") | |
| c.execute("CREATE INDEX IF NOT EXISTS mfp2 ON msatFlankPairs(flankQualScore desc, avgFlankQualScore desc)") | |
| w |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment