Created
September 18, 2013 01:37
-
-
Save BenLangmead/6603340 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "CG_BoyerMooreApprox" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# First we make a function that splits a string p up into a set of\n", | |
"# non-overlapping, non-empty substrings.\n", | |
"\n", | |
"def partition(p, pieces=2):\n", | |
" assert len(p) >= pieces\n", | |
" base, mod = len(p) / pieces, len(p) % pieces\n", | |
" idx = 0\n", | |
" ps = []\n", | |
" modAdjust = 1\n", | |
" for i in xrange(0, pieces):\n", | |
" if i >= mod: modAdjust = 0\n", | |
" newIdx = idx + base + modAdjust\n", | |
" ps.append(p[idx:newIdx])\n", | |
" idx = newIdx\n", | |
" return ps" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def bmApproximate(p, t, k, alph=\"ACGT\"):\n", | |
" \"\"\" Use the pigeonhole principle together with Boyer-Moore to find\n", | |
" approximate matches with up to a specified number of mismatches. \"\"\"\n", | |
" assert len(p) >= k+1\n", | |
" ps = partition(p, k+1) # split p into list of k+1 non-empty, non-overlapping substrings\n", | |
" off = 0 # offset into p of current partition\n", | |
" occurrences = set() # note we might see the same occurrence >1 time\n", | |
" for pi in ps: # for each partition\n", | |
" # NOTE: I haven't given the implementation for the BMPreprocessing object.\n", | |
" # It implements the Boyer-Moore skipping rules as we discussed in class.\n", | |
" bm_prep = BMPreprocessing(pi, alph=alph) # BM preprocess the partition\n", | |
" for hit in bm_prep.match(t)[0]:\n", | |
" if hit - off < 0: continue # pattern falls off left end of T?\n", | |
" if hit + len(p) - off > len(t): continue # falls off right end?\n", | |
" # Count mismatches to left and right of the matching partition\n", | |
" nmm = 0\n", | |
" for i in range(0, off) + range(off+len(pi), len(p)):\n", | |
" if t[hit-off+i] != p[i]:\n", | |
" nmm += 1\n", | |
" if nmm > k: break # exceeded maximum # mismatches\n", | |
" if nmm <= k:\n", | |
" occurrences.add(hit-off) # approximate match\n", | |
" off += len(pi) # Update offset of current partition\n", | |
" return sorted(list(occurrences))" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bmApproximate('needle', 'needle noodle nargle', 2, alph='abcdefghijklmnopqrstuvwxyz ')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"[0, 7]" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment