Created
September 16, 2013 15:46
-
-
Save BenLangmead/6582444 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "003_CG_InvertedIndex1" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# A simple example of a substring index; mirrors example from lecture notes\n", | |
"\n", | |
"# we're going to extract 4 substrings like this:\n", | |
"# t: CGTGCCTACTTACTTACAT\n", | |
"# substring 1: CGTGC\n", | |
"# substring 2: CCTAC\n", | |
"# substring 3: CTTAC\n", | |
"# substring 4: CTTAC\n", | |
"t = 'CGTGCCTACTTACTTACAT'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# From t, make list of pairs, where first pair item is substring, second is its offset\n", | |
"def substringize(t, ln, iv):\n", | |
" # ln = length of substrings to extract\n", | |
" # iv = distance between substings to extract; e.g. 1 means take *every* substring\n", | |
" pairs = []\n", | |
" # Loop below is like a Java/C loop saying: for(i = 0; i < len(t) - ln + 1; i += iv)\n", | |
" for i in xrange(0, len(t) - ln + 1, iv):\n", | |
" pairs.append((t[i:i+ln], i))\n", | |
" return pairs\n", | |
" # Could also have used list comprehension:\n", | |
" # return [ (t[i:i+ln], i) for i in xrange(0, len(t) - ln + 1, iv) ]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"substringize('CGTGCCTACTTACTTACAT', 5, 4)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
"[('CGTGC', 0), ('CCTAC', 4), ('CTTAC', 8), ('CTTAC', 12)]" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Like substringize, but uses a map data structure\n", | |
"def mapize(t, ln, iv):\n", | |
" index = {}\n", | |
" for i in xrange(0, len(t) - ln + 1, iv):\n", | |
" sub = t[i:i+ln]\n", | |
" if sub in index:\n", | |
" index[sub].append(i) # already have an entry; append to it\n", | |
" else:\n", | |
" index[sub] = [i] # don't yet have entry, make new one\n", | |
" return index" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"index = mapize('CGTGCCTACTTACTTACAT', 5, 4)\n", | |
"index" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
"{'CCTAC': [4], 'CGTGC': [0], 'CTTAC': [8, 12]}" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"p = 'CTTACTTA'" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# index: give me a hint where I should look for occurrences of p in t\n", | |
"index[p[:5]]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
"[8, 12]" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 7 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment