{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "class DeBruijnGraph:\n", " ''' A de Bruijn directed multigraph built from a collection of\n", " strings. User supplies strings and k-mer length k. Nodes of\n", " the de Bruijn graph are k-1-mers and edges correspond to the\n", " k-mer that joins a left k-1-mer to a right k-1-mer. '''\n", " \n", " @staticmethod\n", " def chop(st, k):\n", " ''' Chop a string up into k mers of given length '''\n", " for i in xrange(0, len(st)-(k-1)):\n", " yield (st[i:i+k], st[i:i+k-1], st[i+1:i+k])\n", " \n", " class Node:\n", " ''' Node in a de Bruijn graph, representing a k-1 mer. We keep\n", " track of # of incoming/outgoing edges so it's easy to check\n", " for balanced, semi-balanced. '''\n", " \n", " def __init__(self, km1mer):\n", " self.km1mer = km1mer\n", " self.nin = 0\n", " self.nout = 0\n", " \n", " def isSemiBalanced(self):\n", " return abs(self.nin - self.nout) == 1\n", " \n", " def isBalanced(self):\n", " return self.nin == self.nout\n", " \n", " def __hash__(self):\n", " return hash(self.km1mer)\n", " \n", " def __str__(self):\n", " return self.km1mer\n", " \n", " def __init__(self, strIter, k, circularize=False):\n", " ''' Build de Bruijn multigraph given string iterator and k-mer\n", " length k '''\n", " self.G = {} # multimap from nodes to neighbors\n", " self.nodes = {} # maps k-1-mers to Node objects\n", " for st in strIter:\n", " if circularize:\n", " st += st[:k-1]\n", " for kmer, km1L, km1R in self.chop(st, k):\n", " nodeL, nodeR = None, None\n", " if km1L in self.nodes:\n", " nodeL = self.nodes[km1L]\n", " else:\n", " nodeL = self.nodes[km1L] = self.Node(km1L)\n", " if km1R in self.nodes:\n", " nodeR = self.nodes[km1R]\n", " else:\n", " nodeR = self.nodes[km1R] = self.Node(km1R)\n", " nodeL.nout += 1\n", " nodeR.nin += 1\n", " self.G.setdefault(nodeL, []).append(nodeR)\n", " # Iterate through nodes and tally how many are balanced,\n", " # semi-balanced, or neither\n", " self.nsemi, self.nbal, self.nneither = 0, 0, 0\n", " # Keep track of head and tail nodes in the case of a graph with\n", " # Eularian walk (not cycle)\n", " self.head, self.tail = None, None\n", " for node in self.nodes.itervalues():\n", " if node.isBalanced():\n", " self.nbal += 1\n", " elif node.isSemiBalanced():\n", " if node.nin == node.nout + 1:\n", " self.tail = node\n", " if node.nin == node.nout - 1:\n", " self.head = node\n", " self.nsemi += 1\n", " else:\n", " self.nneither += 1\n", " \n", " def nnodes(self):\n", " ''' Return # nodes '''\n", " return len(self.nodes)\n", " \n", " def nedges(self):\n", " ''' Return # edges '''\n", " return len(self.G)\n", " \n", " def hasEulerianWalk(self):\n", " ''' Return true iff graph has Eulerian walk. '''\n", " return self.nneither == 0 and self.nsemi == 2\n", " \n", " def hasEulerianCycle(self):\n", " ''' Return true iff graph has Eulerian cycle. '''\n", " return self.nneither == 0 and self.nsemi == 0\n", " \n", " def isEulerian(self):\n", " ''' Return true iff graph has Eulerian walk or cycle '''\n", " # technically, if it has an Eulerian walk\n", " return self.hasEulerianWalk() or self.hasEulerianCycle()\n", " \n", " def eulerianWalkOrCycle(self):\n", " ''' Find and return Eulerian walk or cycle (as appropriate) '''\n", " assert self.isEulerian()\n", " g = self.G\n", " if self.hasEulerianWalk():\n", " g = g.copy()\n", " assert self.head is not None\n", " assert self.tail is not None\n", " g.setdefault(self.tail, []).append(self.head)\n", " # graph g has an Eulerian cycle\n", " tour = []\n", " src = g.iterkeys().next() # pick arbitrary starting node\n", " \n", " def __visit(n):\n", " while len(g[n]) > 0:\n", " dst = g[n].pop()\n", " __visit(dst)\n", " tour.append(n)\n", " \n", " __visit(src)\n", " tour = tour[::-1][:-1] # reverse and then take all but last node\n", " \n", " if self.hasEulerianWalk():\n", " # Adjust node list so that it starts at head and ends at tail\n", " sti = tour.index(self.head)\n", " tour = tour[sti:] + tour[:sti]\n", " \n", " # Return node list\n", " return map(str, tour)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "g = DeBruijnGraph(['AAABBBA'], k=3)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "g.isEulerian(), g.hasEulerianWalk(), g.hasEulerianCycle()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ "(True, True, False)" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "# the figure we drew in class had 4 nodes\n", "g.nnodes()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ "4" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "g.eulerianWalkOrCycle()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 5, "text": [ "['AA', 'AB', 'BB', 'BB', 'BA', 'AA']" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "g = DeBruijnGraph(['AAABBBBA'], k=3) # Add 1 more B to the run of Bs" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 6 }, { "cell_type": "code", "collapsed": false, "input": [ "g.isEulerian(), g.hasEulerianWalk(), g.hasEulerianCycle()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "(True, True, False)" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "# the figure we drew in class again had 4 nodes\n", "g.nnodes()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ "4" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "g.eulerianWalkOrCycle()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ "['AA', 'AB', 'BB', 'BB', 'BB', 'BA', 'AA']" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "# circularize makes DeBruijnGraph treat string as circular,\n", "# returning to left-hand side at extreme right end\n", "g = DeBruijnGraph(['AAABBBBA'], k=3, circularize=True)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "g.isEulerian(), g.hasEulerianWalk(), g.hasEulerianCycle()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ "(True, False, True)" ] } ], "prompt_number": 11 }, { "cell_type": "code", "collapsed": false, "input": [ "g.eulerianWalkOrCycle()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 12, "text": [ "['AA', 'AA', 'AB', 'BB', 'BB', 'BB', 'BA', 'AA']" ] } ], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "DeBruijnGraph([\"a_long_long_long_time\"], 5).eulerianWalkOrCycle()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 13, "text": [ "['a_lo',\n", " '_lon',\n", " 'long',\n", " 'ong_',\n", " 'ng_l',\n", " 'g_lo',\n", " '_lon',\n", " 'long',\n", " 'ong_',\n", " 'ng_l',\n", " 'g_lo',\n", " '_lon',\n", " 'long',\n", " 'ong_',\n", " 'ng_t',\n", " 'g_ti',\n", " '_tim',\n", " 'time']" ] } ], "prompt_number": 13 }, { "cell_type": "code", "collapsed": false, "input": [ "DeBruijnGraph(['a_long_long_long_time'], 5).eulerianWalkOrCycle().count('long')" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 14, "text": [ "3" ] } ], "prompt_number": 14 }, { "cell_type": "code", "collapsed": false, "input": [ "# see if we can get correct reconstruction at k=4\n", "walk = DeBruijnGraph(['to_every_thing_turn_turn_turn_there_is_a_season'], 4).eulerianWalkOrCycle()\n", "walk[0] + ''.join(map(lambda x: x[-1], walk[1:]))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 15, "text": [ "'to_every_thing_turn_turn_turn_there_is_a_season'" ] } ], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "# that worked, but k=3 doesn't! unresolvable repeat at k=3\n", "walk = DeBruijnGraph(['to_every_thing_turn_turn_turn_there_is_a_season'], 3).eulerianWalkOrCycle()\n", "walk[0] + ''.join(map(lambda x: x[-1], walk[1:]))" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 16, "text": [ "'to_every_turn_turn_thing_turn_there_is_a_season'" ] } ], "prompt_number": 16 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 16 } ], "metadata": {} } ] }