Last active
December 24, 2015 07:39
-
-
Save BenLangmead/6765182 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Not a great way to build a suffix array, but we'll use it\n", | |
"# for the small examples here\n", | |
"def naiveBuildSA(t):\n", | |
" satups = sorted([(t[i:], i) for i in xrange(0, len(t))])\n", | |
" return map(lambda x: x[1], satups)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"naiveBuildSA('abaaba$') # works on a simple example" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 2, | |
"text": [ | |
"[6, 5, 2, 3, 0, 4, 1]" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def binarySearchSA(t, sa, p):\n", | |
" assert t[-1] == '$' # t already has terminator\n", | |
" assert len(t) == len(sa) # sa is the suffix array for t\n", | |
" if len(t) == 1: return 1\n", | |
" l, r = 0, len(sa) # invariant: sa[l] < p < sa[r]\n", | |
" while True:\n", | |
" c = (l + r) // 2\n", | |
" # determine whether p < T[sa[c]:] by doing comparisons\n", | |
" # starting from left-hand sides of p and T[sa[c]:]\n", | |
" plt = True # assume p < T[sa[c]:] until proven otherwise\n", | |
" i = 0\n", | |
" while i < len(p) and sa[c]+i < len(t):\n", | |
" if p[i] < t[sa[c]+i]:\n", | |
" break # p < T[sa[c]:]\n", | |
" elif p[i] > t[sa[c]+i]:\n", | |
" plt = False\n", | |
" break # p > T[sa[c]:]\n", | |
" i += 1 # tied so far\n", | |
" if plt:\n", | |
" if c == l + 1: return c\n", | |
" r = c\n", | |
" else:\n", | |
" if c == r - 1: return r\n", | |
" l = c" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"t = 'abaaba$'\n", | |
"sa = naiveBuildSA(t)\n", | |
"binarySearchSA(t, sa, 'aba')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"3" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"binarySearchSA(t, sa, 'bb') # p is greater than all suffixes" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
"7" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"binarySearchSA(t, sa, 'aa')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 6, | |
"text": [ | |
"2" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def suffixLcp(t, toff, p):\n", | |
" i = 0\n", | |
" while i < len(p) and i + toff < len(t):\n", | |
" if p[i] != t[i + toff]:\n", | |
" return i\n", | |
" i += 1\n", | |
" return i" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"suffixLcp('abaaba$', 0, 'aba')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
"3" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"suffixLcp('abaaba$', 0, 'abab')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 9, | |
"text": [ | |
"3" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"suffixLcp('abaaba$', 0, 'abaabaaba')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 10, | |
"text": [ | |
"6" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def binarySearchSA_lcp1(t, sa, p):\n", | |
" assert t[-1] == '$' # t already has terminator\n", | |
" assert len(t) == len(sa) # sa is the suffix array for t\n", | |
" if len(t) == 1: return 1\n", | |
" l, r = 0, len(sa) # invariant: sa[l] < p < sa[r]\n", | |
" lcp_lp, lcp_rp = 0, 0\n", | |
" while True:\n", | |
" c = (l + r) // 2\n", | |
" # determine whether p < T[sa[c]:] by doing comparisons\n", | |
" # starting from left-hand sides of p and T[sa[c]:]\n", | |
" plt = True # assume p < T[sa[c]:] until proven otherwise\n", | |
" i = min(lcp_lp, lcp_rp)\n", | |
" while i < len(p) and sa[c]+i < len(t):\n", | |
" if p[i] < t[sa[c]+i]:\n", | |
" break # p < T[sa[c]:]\n", | |
" elif p[i] > t[sa[c]+i]:\n", | |
" plt = False\n", | |
" break # p > T[sa[c]:]\n", | |
" i += 1 # tied so far\n", | |
" if plt:\n", | |
" if c == l + 1: return c\n", | |
" r = c\n", | |
" lcp_rp = i\n", | |
" else:\n", | |
" if c == r - 1: return r\n", | |
" l = c\n", | |
" lcp_lp = i" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"binarySearchSA_lcp1(t, sa, 'aba')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 12, | |
"text": [ | |
"3" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"binarySearchSA_lcp1(t, sa, 'bb')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": [ | |
"7" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"binarySearchSA_lcp1(t, sa, 'aa')" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 14, | |
"text": [ | |
"2" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 14 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment