cmrivers · April 10, 2015 17:54
diff --git a/genome b/genome
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    }
   ],
   "source": [
    "import Bio\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "%pylab inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "SyntaxError",
     "evalue": "invalid syntax (<ipython-input-21-bd65af69f6d2>, line 1)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;36m  File \u001b[0;32m\"<ipython-input-21-bd65af69f6d2>\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m    Bio.\u001b[0m\n\u001b[0m        ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
     ]
    }
   ],
   "source": [
    "Bio."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reference genome"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'SeqIO' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-23-2ec80b101d48>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mgb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSeqIO\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./sequence.fasta\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"fasta\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mseq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mgb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mgb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'SeqIO' is not defined"
     ]
    }
   ],
   "source": [
    "gb = SeqIO.read(\"./sequence.fasta\", \"fasta\")\n",
    "seq = pd.Series([i for i in gb.seq])\n",
    "gb"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Number of base pairs = 19k"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "len(gb)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "GC content = 41.07% GC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "Bio.SeqUtils.GC(seq)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "seqM = pd.DataFrame({'A':np.nan, 'C':np.nan, 'G':np.nan, 'T':np.nan}, index=np.arange(len(seq)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for i in seqM.index:\n",
    "    letter = seq[i]\n",
    "    seqM.xs(i)[letter] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sns.heatmap(seqM.head(100), vmin=0, vmax=1, yticklabels=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "gc_custom = seqM\n",
    "gc_custom['gc'] = np.nan\n",
    "gc_custom['gc'][(gc_custom['G'] == 1) | (gc_custom['C'] == 1)] = 1\n",
    "gc_custom['gc'] = gc_custom['gc'].fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots()\n",
    "ax.set_ylim(0, 1)\n",
    "pd.rolling_mean(gc_custom['gc'], 1000).plot(ax=ax, title='GC content of reference genome')\n",
    "ax.hlines(y=.5, xmin=0, xmax=19000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SNPs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def type_mutation(ref, contig):\n",
    "    if ref == 'A':\n",
    "        print 'A'\n",
    "    else: print ref"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "snp = pd.read_table(\"out_contigs.used_snps\", sep='\\t', names=['ref_name', 'contig_name', 'ref_position', 'ref_nt', 'contig_nt', 'contig_pos'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "snp['synon'] = np.nan"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Identify mutation type"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "snp.synon[(snp.ref_nt == 'A') & (snp.contig_nt == 'G')] = 'transition'\n",
    "snp.synon[(snp.ref_nt == 'G') & (snp.contig_nt == 'A')] = 'transition'\n",
    "snp.synon[(snp.ref_nt == 'C') & (snp.contig_nt == 'T')] = 'transition'\n",
    "snp.synon[(snp.ref_nt == 'T') & (snp.contig_nt == 'C')] = 'transition'\n",
    "\n",
    "snp.synon[(snp.ref_nt == 'A') & ((snp.contig_nt == 'C') | (snp.contig_nt == 'T'))] = 'transversion'\n",
    "snp.synon[(snp.ref_nt == 'C') & ((snp.contig_nt == 'A') | (snp.contig_nt == 'G'))] = 'transversion'\n",
    "snp.synon[(snp.ref_nt == 'G') & ((snp.contig_nt == 'C') | (snp.contig_nt == 'T'))] = 'transversion'\n",
    "snp.synon[(snp.ref_nt == 'T') & ((snp.contig_nt == 'A') | (snp.contig_nt == 'G'))] = 'transversion'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "snp.tail()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "83% are transitions (less important) and 17% are transversions (more important)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "snp.synon.value_counts(normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(12, 2))\n",
    "ax.set_xlim(0, 19000)\n",
    "snp_binary = snp.synon\n",
    "snp_binary = snp_binary.replace('transition', 0)\n",
    "snp_binary = snp_binary.replace('transversion', 1)\n",
    "ax.scatter(snp.ref_position, snp_binary.values, marker='|', s=1000)\n",
    "ax.set_title('Location of mutations along the genome')\n",
    "ax.grid(False)\n",
    "ax.set_yticks([0, 1])\n",
    "ax.set_yticklabels(['transition', 'transversion'])\n",
    "ax.set_xlabel('Location');"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check that nucleotide selection works as expected"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Nope, it doesn't."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def get_nucleotides(position_list, ref_genome):\n",
    "    nucleotides = []\n",
    "    for i in position_list:\n",
    "        nucleotides.append(ref_genome[i+1])\n",
    "    \n",
    "    return nucleotides"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "snp['ref_nt_check'] = get_nucleotides(snp['ref_position'], gb.seq)\n",
    "snp['match'] = False\n",
    "snp['match'][snp.ref_nt == snp.ref_nt_check] = True\n",
    "snp[['ref_position', 'ref_nt', 'ref_nt_check', 'match']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Codons"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def find_codons(genome):\n",
    "    codons = (a[n:n+3] for n in xrange(0,len(genome),3)) # creates generator\n",
    "\n",
    "    dict_codons = {}\n",
    "\n",
    "    for codon in codons:\n",
    "        if dict_codons.has_key(codon):\n",
    "            dict_codons[codon] += 1\n",
    "        else:\n",
    "            dict_codons[codon] = 1\n",
    "            \n",
    "    return pd.Series(dict_codons, index=dict_codons.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "codons = find_codons(gb)\n",
    "codons.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "codons.order().tail(20).plot(kind='bar', title='most common codons')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Proteins"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Translate sequence into proteins?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "complementary = gb.seq.transcribe() #should this be complement?\n",
    "proteins = complementary.translate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "complementary"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Over 6k proteins - should only be like 7?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "proteins = pd.Series([i for i in proteins])\n",
    "len(proteins)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Leucine and serine most common\n",
    "\n",
    "http://www.bioinformatics.org/sms2/iupac.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "proteins.value_counts().plot(kind='bar', title='Protein frequency')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Populating the interactive namespace from numpy and matplotlib\n"
	]
	}
	],
	"source": [
	"import Bio\n",
	"import pandas as pd\n",
	"import seaborn as sns\n",
	"%pylab inline"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"ename": "SyntaxError",
	"evalue": "invalid syntax (<ipython-input-21-bd65af69f6d2>, line 1)",
	"output_type": "error",
	"traceback": [
	"\u001b[0;36m File \u001b[0;32m\"<ipython-input-21-bd65af69f6d2>\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m Bio.\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
	]
	}
	],
	"source": [
	"Bio."
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Reference genome"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [
	{
	"ename": "NameError",
	"evalue": "name 'SeqIO' is not defined",
	"output_type": "error",
	"traceback": [
	"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
	"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
	"\u001b[0;32m<ipython-input-23-2ec80b101d48>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mgb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSeqIO\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./sequence.fasta\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"fasta\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mseq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mgb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mgb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;31mNameError\u001b[0m: name 'SeqIO' is not defined"
	]
	}
	],
	"source": [
	"gb = SeqIO.read(\"./sequence.fasta\", \"fasta\")\n",
	"seq = pd.Series([i for i in gb.seq])\n",
	"gb"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Number of base pairs = 19k"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"len(gb)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"GC content = 41.07% GC"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"Bio.SeqUtils.GC(seq)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"seqM = pd.DataFrame({'A':np.nan, 'C':np.nan, 'G':np.nan, 'T':np.nan}, index=np.arange(len(seq)))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"for i in seqM.index:\n",
	" letter = seq[i]\n",
	" seqM.xs(i)[letter] = 1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"sns.heatmap(seqM.head(100), vmin=0, vmax=1, yticklabels=False)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"gc_custom = seqM\n",
	"gc_custom['gc'] = np.nan\n",
	"gc_custom['gc'][(gc_custom['G'] == 1) \| (gc_custom['C'] == 1)] = 1\n",
	"gc_custom['gc'] = gc_custom['gc'].fillna(0)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"fig, ax = plt.subplots()\n",
	"ax.set_ylim(0, 1)\n",
	"pd.rolling_mean(gc_custom['gc'], 1000).plot(ax=ax, title='GC content of reference genome')\n",
	"ax.hlines(y=.5, xmin=0, xmax=19000)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## SNPs"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def type_mutation(ref, contig):\n",
	" if ref == 'A':\n",
	" print 'A'\n",
	" else: print ref"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"snp = pd.read_table(\"out_contigs.used_snps\", sep='\\t', names=['ref_name', 'contig_name', 'ref_position', 'ref_nt', 'contig_nt', 'contig_pos'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"snp['synon'] = np.nan"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Identify mutation type"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"snp.synon[(snp.ref_nt == 'A') & (snp.contig_nt == 'G')] = 'transition'\n",
	"snp.synon[(snp.ref_nt == 'G') & (snp.contig_nt == 'A')] = 'transition'\n",
	"snp.synon[(snp.ref_nt == 'C') & (snp.contig_nt == 'T')] = 'transition'\n",
	"snp.synon[(snp.ref_nt == 'T') & (snp.contig_nt == 'C')] = 'transition'\n",
	"\n",
	"snp.synon[(snp.ref_nt == 'A') & ((snp.contig_nt == 'C') \| (snp.contig_nt == 'T'))] = 'transversion'\n",
	"snp.synon[(snp.ref_nt == 'C') & ((snp.contig_nt == 'A') \| (snp.contig_nt == 'G'))] = 'transversion'\n",
	"snp.synon[(snp.ref_nt == 'G') & ((snp.contig_nt == 'C') \| (snp.contig_nt == 'T'))] = 'transversion'\n",
	"snp.synon[(snp.ref_nt == 'T') & ((snp.contig_nt == 'A') \| (snp.contig_nt == 'G'))] = 'transversion'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"snp.tail()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"83% are transitions (less important) and 17% are transversions (more important)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"snp.synon.value_counts(normalize=True)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"fig, ax = plt.subplots(figsize=(12, 2))\n",
	"ax.set_xlim(0, 19000)\n",
	"snp_binary = snp.synon\n",
	"snp_binary = snp_binary.replace('transition', 0)\n",
	"snp_binary = snp_binary.replace('transversion', 1)\n",
	"ax.scatter(snp.ref_position, snp_binary.values, marker='\|', s=1000)\n",
	"ax.set_title('Location of mutations along the genome')\n",
	"ax.grid(False)\n",
	"ax.set_yticks([0, 1])\n",
	"ax.set_yticklabels(['transition', 'transversion'])\n",
	"ax.set_xlabel('Location');"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Check that nucleotide selection works as expected"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Nope, it doesn't."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def get_nucleotides(position_list, ref_genome):\n",
	" nucleotides = []\n",
	" for i in position_list:\n",
	" nucleotides.append(ref_genome[i+1])\n",
	" \n",
	" return nucleotides"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"snp['ref_nt_check'] = get_nucleotides(snp['ref_position'], gb.seq)\n",
	"snp['match'] = False\n",
	"snp['match'][snp.ref_nt == snp.ref_nt_check] = True\n",
	"snp[['ref_position', 'ref_nt', 'ref_nt_check', 'match']]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Codons"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"def find_codons(genome):\n",
	" codons = (a[n:n+3] for n in xrange(0,len(genome),3)) # creates generator\n",
	"\n",
	" dict_codons = {}\n",
	"\n",
	" for codon in codons:\n",
	" if dict_codons.has_key(codon):\n",
	" dict_codons[codon] += 1\n",
	" else:\n",
	" dict_codons[codon] = 1\n",
	" \n",
	" return pd.Series(dict_codons, index=dict_codons.keys())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"codons = find_codons(gb)\n",
	"codons.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"codons.order().tail(20).plot(kind='bar', title='most common codons')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Proteins"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Translate sequence into proteins?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"complementary = gb.seq.transcribe() #should this be complement?\n",
	"proteins = complementary.translate()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"complementary"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Over 6k proteins - should only be like 7?"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"proteins = pd.Series([i for i in proteins])\n",
	"len(proteins)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Leucine and serine most common\n",
	"\n",
	"http://www.bioinformatics.org/sms2/iupac.html"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"proteins.value_counts().plot(kind='bar', title='Protein frequency')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}