BenLangmead · September 16, 2013 15:46
diff --git a/003_CG_InvertedIndex1 b/003_CG_InvertedIndex1
 {
 "metadata": {
  "name": "003_CG_InvertedIndex1"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# A simple example of a substring index; mirrors example from lecture notes\n",
      "\n",
      "# we're going to extract 4 substrings like this:\n",
      "# t:           CGTGCCTACTTACTTACAT\n",
      "# substring 1: CGTGC\n",
      "# substring 2:     CCTAC\n",
      "# substring 3:         CTTAC\n",
      "# substring 4:             CTTAC\n",
      "t = 'CGTGCCTACTTACTTACAT'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# From t, make list of pairs, where first pair item is substring, second is its offset\n",
      "def substringize(t, ln, iv):\n",
      "    # ln = length of substrings to extract\n",
      "    # iv = distance between substings to extract; e.g. 1 means take *every* substring\n",
      "    pairs = []\n",
      "    # Loop below is like a Java/C loop saying: for(i = 0; i < len(t) - ln + 1; i += iv)\n",
      "    for i in xrange(0, len(t) - ln + 1, iv):\n",
      "        pairs.append((t[i:i+ln], i))\n",
      "    return pairs\n",
      "    # Could also have used list comprehension:\n",
      "    # return [ (t[i:i+ln], i) for i in xrange(0, len(t) - ln + 1, iv) ]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "substringize('CGTGCCTACTTACTTACAT', 5, 4)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 3,
       "text": [
        "[('CGTGC', 0), ('CCTAC', 4), ('CTTAC', 8), ('CTTAC', 12)]"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Like substringize, but uses a map data structure\n",
      "def mapize(t, ln, iv):\n",
      "    index = {}\n",
      "    for i in xrange(0, len(t) - ln + 1, iv):\n",
      "        sub = t[i:i+ln]\n",
      "        if sub in index:\n",
      "            index[sub].append(i) # already have an entry; append to it\n",
      "        else:\n",
      "            index[sub] = [i] # don't yet have entry, make new one\n",
      "    return index"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "index = mapize('CGTGCCTACTTACTTACAT', 5, 4)\n",
      "index"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 5,
       "text": [
        "{'CCTAC': [4], 'CGTGC': [0], 'CTTAC': [8, 12]}"
       ]
      }
     ],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "p = 'CTTACTTA'"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# index: give me a hint where I should look for occurrences of p in t\n",
      "index[p[:5]]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "pyout",
       "prompt_number": 7,
       "text": [
        "[8, 12]"
       ]
      }
     ],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    }
   ],
   "metadata": {}
  }
 ]
 }
	{
	"metadata": {
	"name": "003_CG_InvertedIndex1"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# A simple example of a substring index; mirrors example from lecture notes\n",
	"\n",
	"# we're going to extract 4 substrings like this:\n",
	"# t: CGTGCCTACTTACTTACAT\n",
	"# substring 1: CGTGC\n",
	"# substring 2: CCTAC\n",
	"# substring 3: CTTAC\n",
	"# substring 4: CTTAC\n",
	"t = 'CGTGCCTACTTACTTACAT'"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 1
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# From t, make list of pairs, where first pair item is substring, second is its offset\n",
	"def substringize(t, ln, iv):\n",
	" # ln = length of substrings to extract\n",
	" # iv = distance between substings to extract; e.g. 1 means take every substring\n",
	" pairs = []\n",
	" # Loop below is like a Java/C loop saying: for(i = 0; i < len(t) - ln + 1; i += iv)\n",
	" for i in xrange(0, len(t) - ln + 1, iv):\n",
	" pairs.append((t[i:i+ln], i))\n",
	" return pairs\n",
	" # Could also have used list comprehension:\n",
	" # return [ (t[i:i+ln], i) for i in xrange(0, len(t) - ln + 1, iv) ]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 2
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"substringize('CGTGCCTACTTACTTACAT', 5, 4)"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 3,
	"text": [
	"[('CGTGC', 0), ('CCTAC', 4), ('CTTAC', 8), ('CTTAC', 12)]"
	]
	}
	],
	"prompt_number": 3
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# Like substringize, but uses a map data structure\n",
	"def mapize(t, ln, iv):\n",
	" index = {}\n",
	" for i in xrange(0, len(t) - ln + 1, iv):\n",
	" sub = t[i:i+ln]\n",
	" if sub in index:\n",
	" index[sub].append(i) # already have an entry; append to it\n",
	" else:\n",
	" index[sub] = [i] # don't yet have entry, make new one\n",
	" return index"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 4
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"index = mapize('CGTGCCTACTTACTTACAT', 5, 4)\n",
	"index"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 5,
	"text": [
	"{'CCTAC': [4], 'CGTGC': [0], 'CTTAC': [8, 12]}"
	]
	}
	],
	"prompt_number": 5
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"p = 'CTTACTTA'"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 6
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"# index: give me a hint where I should look for occurrences of p in t\n",
	"index[p[:5]]"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "pyout",
	"prompt_number": 7,
	"text": [
	"[8, 12]"
	]
	}
	],
	"prompt_number": 7
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 7
	}
	],
	"metadata": {}
	}
	]
	}
No results found