Skip to content

Instantly share code, notes, and snippets.

@BenLangmead
Created September 18, 2013 01:37
Show Gist options
  • Save BenLangmead/6603340 to your computer and use it in GitHub Desktop.
Save BenLangmead/6603340 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "CG_BoyerMooreApprox"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# First we make a function that splits a string p up into a set of\n",
"# non-overlapping, non-empty substrings.\n",
"\n",
"def partition(p, pieces=2):\n",
" assert len(p) >= pieces\n",
" base, mod = len(p) / pieces, len(p) % pieces\n",
" idx = 0\n",
" ps = []\n",
" modAdjust = 1\n",
" for i in xrange(0, pieces):\n",
" if i >= mod: modAdjust = 0\n",
" newIdx = idx + base + modAdjust\n",
" ps.append(p[idx:newIdx])\n",
" idx = newIdx\n",
" return ps"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def bmApproximate(p, t, k, alph=\"ACGT\"):\n",
" \"\"\" Use the pigeonhole principle together with Boyer-Moore to find\n",
" approximate matches with up to a specified number of mismatches. \"\"\"\n",
" assert len(p) >= k+1\n",
" ps = partition(p, k+1) # split p into list of k+1 non-empty, non-overlapping substrings\n",
" off = 0 # offset into p of current partition\n",
" occurrences = set() # note we might see the same occurrence >1 time\n",
" for pi in ps: # for each partition\n",
" # NOTE: I haven't given the implementation for the BMPreprocessing object.\n",
" # It implements the Boyer-Moore skipping rules as we discussed in class.\n",
" bm_prep = BMPreprocessing(pi, alph=alph) # BM preprocess the partition\n",
" for hit in bm_prep.match(t)[0]:\n",
" if hit - off < 0: continue # pattern falls off left end of T?\n",
" if hit + len(p) - off > len(t): continue # falls off right end?\n",
" # Count mismatches to left and right of the matching partition\n",
" nmm = 0\n",
" for i in range(0, off) + range(off+len(pi), len(p)):\n",
" if t[hit-off+i] != p[i]:\n",
" nmm += 1\n",
" if nmm > k: break # exceeded maximum # mismatches\n",
" if nmm <= k:\n",
" occurrences.add(hit-off) # approximate match\n",
" off += len(pi) # Update offset of current partition\n",
" return sorted(list(occurrences))"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"bmApproximate('needle', 'needle noodle nargle', 2, alph='abcdefghijklmnopqrstuvwxyz ')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 4,
"text": [
"[0, 7]"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment