Skip to content

Instantly share code, notes, and snippets.

@BenLangmead
Last active December 24, 2015 07:39
Show Gist options
  • Save BenLangmead/6765182 to your computer and use it in GitHub Desktop.
Save BenLangmead/6765182 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": ""
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Not a great way to build a suffix array, but we'll use it\n",
"# for the small examples here\n",
"def naiveBuildSA(t):\n",
" satups = sorted([(t[i:], i) for i in xrange(0, len(t))])\n",
" return map(lambda x: x[1], satups)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"naiveBuildSA('abaaba$') # works on a simple example"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 2,
"text": [
"[6, 5, 2, 3, 0, 4, 1]"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def binarySearchSA(t, sa, p):\n",
" assert t[-1] == '$' # t already has terminator\n",
" assert len(t) == len(sa) # sa is the suffix array for t\n",
" if len(t) == 1: return 1\n",
" l, r = 0, len(sa) # invariant: sa[l] < p < sa[r]\n",
" while True:\n",
" c = (l + r) // 2\n",
" # determine whether p < T[sa[c]:] by doing comparisons\n",
" # starting from left-hand sides of p and T[sa[c]:]\n",
" plt = True # assume p < T[sa[c]:] until proven otherwise\n",
" i = 0\n",
" while i < len(p) and sa[c]+i < len(t):\n",
" if p[i] < t[sa[c]+i]:\n",
" break # p < T[sa[c]:]\n",
" elif p[i] > t[sa[c]+i]:\n",
" plt = False\n",
" break # p > T[sa[c]:]\n",
" i += 1 # tied so far\n",
" if plt:\n",
" if c == l + 1: return c\n",
" r = c\n",
" else:\n",
" if c == r - 1: return r\n",
" l = c"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"t = 'abaaba$'\n",
"sa = naiveBuildSA(t)\n",
"binarySearchSA(t, sa, 'aba')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 4,
"text": [
"3"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"binarySearchSA(t, sa, 'bb') # p is greater than all suffixes"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
"7"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"binarySearchSA(t, sa, 'aa')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 6,
"text": [
"2"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def suffixLcp(t, toff, p):\n",
" i = 0\n",
" while i < len(p) and i + toff < len(t):\n",
" if p[i] != t[i + toff]:\n",
" return i\n",
" i += 1\n",
" return i"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"suffixLcp('abaaba$', 0, 'aba')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 8,
"text": [
"3"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"suffixLcp('abaaba$', 0, 'abab')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 9,
"text": [
"3"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"suffixLcp('abaaba$', 0, 'abaabaaba')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": [
"6"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def binarySearchSA_lcp1(t, sa, p):\n",
" assert t[-1] == '$' # t already has terminator\n",
" assert len(t) == len(sa) # sa is the suffix array for t\n",
" if len(t) == 1: return 1\n",
" l, r = 0, len(sa) # invariant: sa[l] < p < sa[r]\n",
" lcp_lp, lcp_rp = 0, 0\n",
" while True:\n",
" c = (l + r) // 2\n",
" # determine whether p < T[sa[c]:] by doing comparisons\n",
" # starting from left-hand sides of p and T[sa[c]:]\n",
" plt = True # assume p < T[sa[c]:] until proven otherwise\n",
" i = min(lcp_lp, lcp_rp)\n",
" while i < len(p) and sa[c]+i < len(t):\n",
" if p[i] < t[sa[c]+i]:\n",
" break # p < T[sa[c]:]\n",
" elif p[i] > t[sa[c]+i]:\n",
" plt = False\n",
" break # p > T[sa[c]:]\n",
" i += 1 # tied so far\n",
" if plt:\n",
" if c == l + 1: return c\n",
" r = c\n",
" lcp_rp = i\n",
" else:\n",
" if c == r - 1: return r\n",
" l = c\n",
" lcp_lp = i"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"binarySearchSA_lcp1(t, sa, 'aba')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"text": [
"3"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"binarySearchSA_lcp1(t, sa, 'bb')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": [
"7"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"binarySearchSA_lcp1(t, sa, 'aa')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": [
"2"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 14
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment