Created
May 8, 2014 22:12
-
-
Save BenLangmead/cf7f0ad2022a3f49c1bc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:0e059d6141a29e725dcfd5b05c7fa0fe7e20abcd0698fa34124a7c847f09627d" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Examples of Radix sort\n", | |
"\n", | |
"Both least-significant-digit and most-significant-digit versions. Examples use DNA alphabet." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Least significant digit" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Start with a function that conducts a single pass." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"from collections import defaultdict\n", | |
"\n", | |
"def radix_pass(strs, ordr, depth):\n", | |
" \"\"\" Given a collection of same-length strings and depth, return a\n", | |
" permutation that stably sorts the strings according to character\n", | |
" at that depth \"\"\"\n", | |
" buckets = defaultdict(list)\n", | |
" for i in ordr:\n", | |
" buckets[strs[i][depth]].append(i)\n", | |
" return [x for sublist in [buckets[c] for c in '$ACGTn'] for x in sublist]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"strs = ['A', 'A', 'C', 'G', 'G', 'G', 'n']\n", | |
"radix_pass(strs, range(len(strs)), 0)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 2, | |
"text": [ | |
"[0, 1, 2, 3, 4, 5, 6]" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"strs = ['A', 'G', 'A', 'G', 'C', 'G', 'n']\n", | |
"radix_pass(strs, range(len(strs)), 0)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": [ | |
"[0, 2, 4, 1, 3, 5, 6]" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Chain two `radix_pass` calls together to get overall sorted order." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# First call\n", | |
"strs = ['AG', 'CG', 'AA', 'GA', 'TC', 'GT', 'Tn', 'nn', 'nC']\n", | |
"lsd1 = radix_pass(strs, range(len(strs)), 1)\n", | |
"lsd1" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"[2, 3, 4, 8, 0, 1, 5, 6, 7]" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Second call, using result from first\n", | |
"radix_pass(strs, lsd1, 0)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": [ | |
"[2, 0, 1, 3, 5, 4, 6, 8, 7]" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"To completely sort the strings, `radix_lsd` does all the passes." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def radix_lsd(strs):\n", | |
" \"\"\" Least-significant-digit (LSD) radix sort on collection of\n", | |
" same-length strings. Returns a permutation that puts the list\n", | |
" in stable-sorted order. \"\"\"\n", | |
" assert len(strs) > 0\n", | |
" ordr = range(len(strs))\n", | |
" for i in xrange(len(strs[0])-1, -1, -1):\n", | |
" ordr = radix_pass(strs, ordr, i)\n", | |
" return ordr" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"strs = ['AG', 'CG', 'AA', 'GA', 'TC', 'GT', 'Tn', 'nn', 'nC']\n", | |
"radix_lsd(strs)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": [ | |
"[2, 0, 1, 3, 5, 4, 6, 8, 7]" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Most significant digit" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"MSD radix sort with a single recursive function." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"def radix_msd_helper(strs, ordr, depth):\n", | |
" \"\"\" Most-significant-digit (MSD) radix sort on collection of\n", | |
" same-length strings. Returns a permutation that puts the list\n", | |
" in stable-sorted order. \"\"\"\n", | |
" if len(ordr) <= 1 or depth >= len(strs[0]):\n", | |
" return ordr # bases cases: 1 elt in list, or we've exhausted characters\n", | |
" buckets = defaultdict(list)\n", | |
" for i in ordr:\n", | |
" buckets[strs[i][depth]].append(i)\n", | |
" return [x for sublist in [radix_msd_helper(strs, buckets[c], depth+1) for c in '$ACGTn'] for x in sublist]\n", | |
"\n", | |
"def radix_msd(strs):\n", | |
" return radix_msd_helper(strs, range(len(strs)), 0)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"strs = ['AG', 'CG', 'AA', 'GA', 'TC', 'GT', 'Tn', 'nn', 'nC']\n", | |
"radix_msd(strs)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 9, | |
"text": [ | |
"[2, 0, 1, 3, 5, 4, 6, 8, 7]" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"strs = ['GATTACA', 'GATTAAA', 'GAATACA', 'GATAACA', 'AATTAAA']\n", | |
"assert radix_msd(strs) == radix_lsd(strs)\n", | |
"radix_msd(strs)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 10, | |
"text": [ | |
"[4, 2, 3, 1, 0]" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment