Skip to content

Instantly share code, notes, and snippets.

@BenLangmead
Created September 16, 2013 16:14
Show Gist options
  • Save BenLangmead/6582836 to your computer and use it in GitHub Desktop.
Save BenLangmead/6582836 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "004_CG_InvertedIndex2"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Let's make a class that implements an inverted (substring) index where the map data\n",
"# structure is a sorted list of (substring, offset) pairs. Query w/ binary search.\n",
"\n",
"import sys\n",
"import bisect # for binary search\n",
"\n",
"class IndexSorted(object):\n",
" \n",
" def __init__(self, t, ln, ival):\n",
" \"\"\" Create index, extracting substrings of length 'ln' \"\"\"\n",
" self.t = t\n",
" self.ln = ln\n",
" self.ival = ival\n",
" self.index = []\n",
" for i in xrange(0, len(t)-ln+1):\n",
" self.index.append((t[i:i+ln], i)) # add <substr, offset> pair\n",
" self.index.sort() # sort pairs\n",
" \n",
" def query(self, p):\n",
" \"\"\" Return candidate alignments for p \"\"\"\n",
" st = bisect.bisect_left(self.index, (p[:self.ln], -1)) # binary search\n",
" en = bisect.bisect_right(self.index, (p[:self.ln], sys.maxint)) # binary search\n",
" hits = self.index[st:en] # this range of elements corresponds to the hits\n",
" return [ h[1] for h in hits ] # return just the offsets"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"index = IndexSorted('CGTGCCTACTTACTTACAT', 5, 4)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"index.query('CTTACTTA') # index: give us hints on where to look for this pattern"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 3,
"text": [
"[8, 12]"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"index.query('TTTTTTTT') # index: give us hints on where to look for this pattern"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 4,
"text": [
"[]"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Now let's make a similar class but using a Python dictionary instead of a sorted\n",
"# list. A Python dictionary is basically a hash table.\n",
"\n",
"class IndexHash(object):\n",
" \n",
" def __init__(self, t, ln, ival):\n",
" \"\"\" Create index, extracting substrings of length 'ln' \"\"\"\n",
" self.t = t\n",
" self.ln = ln\n",
" self.ival = ival\n",
" self.index = {}\n",
" for i in xrange(0, len(t)-ln+1):\n",
" substr = t[i:i+ln]\n",
" if substr in self.index:\n",
" self.index[substr].append(i) # substring already in dictionary\n",
" else:\n",
" self.index[substr] = [i] # add to dictionary\n",
" \n",
" def query(self, p):\n",
" \"\"\" Return candidate alignments for p \"\"\"\n",
" return self.index.get(p[:self.ln], [])"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"index2 = IndexHash('CGTGCCTACTTACTTACAT', 5, 4)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"index2.query('CTTACTTA') # index: give us hints on where to look for this pattern"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 11,
"text": [
"[8, 12]"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"index2.query('TTTTTTTT') # index: give us hints on where to look for this pattern "
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 12,
"text": [
"[]"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment