Created
April 10, 2015 17:54
-
-
Save cmrivers/660990e009bbf2f7a788 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Populating the interactive namespace from numpy and matplotlib\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import Bio\n", | |
| "import pandas as pd\n", | |
| "import seaborn as sns\n", | |
| "%pylab inline" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "ename": "SyntaxError", | |
| "evalue": "invalid syntax (<ipython-input-21-bd65af69f6d2>, line 1)", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[0;36m File \u001b[0;32m\"<ipython-input-21-bd65af69f6d2>\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m Bio.\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "Bio." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Reference genome" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": false, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "ename": "NameError", | |
| "evalue": "name 'SeqIO' is not defined", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", | |
| "\u001b[0;32m<ipython-input-23-2ec80b101d48>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mgb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSeqIO\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./sequence.fasta\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"fasta\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mseq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mgb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mgb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;31mNameError\u001b[0m: name 'SeqIO' is not defined" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "gb = SeqIO.read(\"./sequence.fasta\", \"fasta\")\n", | |
| "seq = pd.Series([i for i in gb.seq])\n", | |
| "gb" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Number of base pairs = 19k" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "len(gb)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "GC content = 41.07% GC" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "Bio.SeqUtils.GC(seq)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "seqM = pd.DataFrame({'A':np.nan, 'C':np.nan, 'G':np.nan, 'T':np.nan}, index=np.arange(len(seq)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "for i in seqM.index:\n", | |
| " letter = seq[i]\n", | |
| " seqM.xs(i)[letter] = 1" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "sns.heatmap(seqM.head(100), vmin=0, vmax=1, yticklabels=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "gc_custom = seqM\n", | |
| "gc_custom['gc'] = np.nan\n", | |
| "gc_custom['gc'][(gc_custom['G'] == 1) | (gc_custom['C'] == 1)] = 1\n", | |
| "gc_custom['gc'] = gc_custom['gc'].fillna(0)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "fig, ax = plt.subplots()\n", | |
| "ax.set_ylim(0, 1)\n", | |
| "pd.rolling_mean(gc_custom['gc'], 1000).plot(ax=ax, title='GC content of reference genome')\n", | |
| "ax.hlines(y=.5, xmin=0, xmax=19000)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## SNPs" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def type_mutation(ref, contig):\n", | |
| " if ref == 'A':\n", | |
| " print 'A'\n", | |
| " else: print ref" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "snp = pd.read_table(\"out_contigs.used_snps\", sep='\\t', names=['ref_name', 'contig_name', 'ref_position', 'ref_nt', 'contig_nt', 'contig_pos'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "snp['synon'] = np.nan" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Identify mutation type" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "snp.synon[(snp.ref_nt == 'A') & (snp.contig_nt == 'G')] = 'transition'\n", | |
| "snp.synon[(snp.ref_nt == 'G') & (snp.contig_nt == 'A')] = 'transition'\n", | |
| "snp.synon[(snp.ref_nt == 'C') & (snp.contig_nt == 'T')] = 'transition'\n", | |
| "snp.synon[(snp.ref_nt == 'T') & (snp.contig_nt == 'C')] = 'transition'\n", | |
| "\n", | |
| "snp.synon[(snp.ref_nt == 'A') & ((snp.contig_nt == 'C') | (snp.contig_nt == 'T'))] = 'transversion'\n", | |
| "snp.synon[(snp.ref_nt == 'C') & ((snp.contig_nt == 'A') | (snp.contig_nt == 'G'))] = 'transversion'\n", | |
| "snp.synon[(snp.ref_nt == 'G') & ((snp.contig_nt == 'C') | (snp.contig_nt == 'T'))] = 'transversion'\n", | |
| "snp.synon[(snp.ref_nt == 'T') & ((snp.contig_nt == 'A') | (snp.contig_nt == 'G'))] = 'transversion'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "snp.tail()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "83% are transitions (less important) and 17% are transversions (more important)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "snp.synon.value_counts(normalize=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "fig, ax = plt.subplots(figsize=(12, 2))\n", | |
| "ax.set_xlim(0, 19000)\n", | |
| "snp_binary = snp.synon\n", | |
| "snp_binary = snp_binary.replace('transition', 0)\n", | |
| "snp_binary = snp_binary.replace('transversion', 1)\n", | |
| "ax.scatter(snp.ref_position, snp_binary.values, marker='|', s=1000)\n", | |
| "ax.set_title('Location of mutations along the genome')\n", | |
| "ax.grid(False)\n", | |
| "ax.set_yticks([0, 1])\n", | |
| "ax.set_yticklabels(['transition', 'transversion'])\n", | |
| "ax.set_xlabel('Location');" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Check that nucleotide selection works as expected" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Nope, it doesn't." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def get_nucleotides(position_list, ref_genome):\n", | |
| " nucleotides = []\n", | |
| " for i in position_list:\n", | |
| " nucleotides.append(ref_genome[i+1])\n", | |
| " \n", | |
| " return nucleotides" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "snp['ref_nt_check'] = get_nucleotides(snp['ref_position'], gb.seq)\n", | |
| "snp['match'] = False\n", | |
| "snp['match'][snp.ref_nt == snp.ref_nt_check] = True\n", | |
| "snp[['ref_position', 'ref_nt', 'ref_nt_check', 'match']]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Codons" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def find_codons(genome):\n", | |
| " codons = (a[n:n+3] for n in xrange(0,len(genome),3)) # creates generator\n", | |
| "\n", | |
| " dict_codons = {}\n", | |
| "\n", | |
| " for codon in codons:\n", | |
| " if dict_codons.has_key(codon):\n", | |
| " dict_codons[codon] += 1\n", | |
| " else:\n", | |
| " dict_codons[codon] = 1\n", | |
| " \n", | |
| " return pd.Series(dict_codons, index=dict_codons.keys())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "codons = find_codons(gb)\n", | |
| "codons.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "codons.order().tail(20).plot(kind='bar', title='most common codons')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Proteins" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Translate sequence into proteins?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "complementary = gb.seq.transcribe() #should this be complement?\n", | |
| "proteins = complementary.translate()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "complementary" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Over 6k proteins - should only be like 7?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "proteins = pd.Series([i for i in proteins])\n", | |
| "len(proteins)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Leucine and serine most common\n", | |
| "\n", | |
| "http://www.bioinformatics.org/sms2/iupac.html" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "proteins.value_counts().plot(kind='bar', title='Protein frequency')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 2", | |
| "language": "python", | |
| "name": "python2" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.9" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment