Created
April 10, 2015 17:54
-
-
Save cmrivers/660990e009bbf2f7a788 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Populating the interactive namespace from numpy and matplotlib\n" | |
] | |
} | |
], | |
"source": [ | |
"import Bio\n", | |
"import pandas as pd\n", | |
"import seaborn as sns\n", | |
"%pylab inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"ename": "SyntaxError", | |
"evalue": "invalid syntax (<ipython-input-21-bd65af69f6d2>, line 1)", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;36m File \u001b[0;32m\"<ipython-input-21-bd65af69f6d2>\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m Bio.\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" | |
] | |
} | |
], | |
"source": [ | |
"Bio." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Reference genome" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"ename": "NameError", | |
"evalue": "name 'SeqIO' is not defined", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-23-2ec80b101d48>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mgb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSeqIO\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./sequence.fasta\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"fasta\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mseq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mgb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mgb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mNameError\u001b[0m: name 'SeqIO' is not defined" | |
] | |
} | |
], | |
"source": [ | |
"gb = SeqIO.read(\"./sequence.fasta\", \"fasta\")\n", | |
"seq = pd.Series([i for i in gb.seq])\n", | |
"gb" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Number of base pairs = 19k" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"len(gb)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"GC content = 41.07% GC" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"Bio.SeqUtils.GC(seq)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"seqM = pd.DataFrame({'A':np.nan, 'C':np.nan, 'G':np.nan, 'T':np.nan}, index=np.arange(len(seq)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"for i in seqM.index:\n", | |
" letter = seq[i]\n", | |
" seqM.xs(i)[letter] = 1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"sns.heatmap(seqM.head(100), vmin=0, vmax=1, yticklabels=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"gc_custom = seqM\n", | |
"gc_custom['gc'] = np.nan\n", | |
"gc_custom['gc'][(gc_custom['G'] == 1) | (gc_custom['C'] == 1)] = 1\n", | |
"gc_custom['gc'] = gc_custom['gc'].fillna(0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fig, ax = plt.subplots()\n", | |
"ax.set_ylim(0, 1)\n", | |
"pd.rolling_mean(gc_custom['gc'], 1000).plot(ax=ax, title='GC content of reference genome')\n", | |
"ax.hlines(y=.5, xmin=0, xmax=19000)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## SNPs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def type_mutation(ref, contig):\n", | |
" if ref == 'A':\n", | |
" print 'A'\n", | |
" else: print ref" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"snp = pd.read_table(\"out_contigs.used_snps\", sep='\\t', names=['ref_name', 'contig_name', 'ref_position', 'ref_nt', 'contig_nt', 'contig_pos'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"snp['synon'] = np.nan" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Identify mutation type" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"snp.synon[(snp.ref_nt == 'A') & (snp.contig_nt == 'G')] = 'transition'\n", | |
"snp.synon[(snp.ref_nt == 'G') & (snp.contig_nt == 'A')] = 'transition'\n", | |
"snp.synon[(snp.ref_nt == 'C') & (snp.contig_nt == 'T')] = 'transition'\n", | |
"snp.synon[(snp.ref_nt == 'T') & (snp.contig_nt == 'C')] = 'transition'\n", | |
"\n", | |
"snp.synon[(snp.ref_nt == 'A') & ((snp.contig_nt == 'C') | (snp.contig_nt == 'T'))] = 'transversion'\n", | |
"snp.synon[(snp.ref_nt == 'C') & ((snp.contig_nt == 'A') | (snp.contig_nt == 'G'))] = 'transversion'\n", | |
"snp.synon[(snp.ref_nt == 'G') & ((snp.contig_nt == 'C') | (snp.contig_nt == 'T'))] = 'transversion'\n", | |
"snp.synon[(snp.ref_nt == 'T') & ((snp.contig_nt == 'A') | (snp.contig_nt == 'G'))] = 'transversion'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"snp.tail()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"83% are transitions (less important) and 17% are transversions (more important)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"snp.synon.value_counts(normalize=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"fig, ax = plt.subplots(figsize=(12, 2))\n", | |
"ax.set_xlim(0, 19000)\n", | |
"snp_binary = snp.synon\n", | |
"snp_binary = snp_binary.replace('transition', 0)\n", | |
"snp_binary = snp_binary.replace('transversion', 1)\n", | |
"ax.scatter(snp.ref_position, snp_binary.values, marker='|', s=1000)\n", | |
"ax.set_title('Location of mutations along the genome')\n", | |
"ax.grid(False)\n", | |
"ax.set_yticks([0, 1])\n", | |
"ax.set_yticklabels(['transition', 'transversion'])\n", | |
"ax.set_xlabel('Location');" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Check that nucleotide selection works as expected" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Nope, it doesn't." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_nucleotides(position_list, ref_genome):\n", | |
" nucleotides = []\n", | |
" for i in position_list:\n", | |
" nucleotides.append(ref_genome[i+1])\n", | |
" \n", | |
" return nucleotides" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"snp['ref_nt_check'] = get_nucleotides(snp['ref_position'], gb.seq)\n", | |
"snp['match'] = False\n", | |
"snp['match'][snp.ref_nt == snp.ref_nt_check] = True\n", | |
"snp[['ref_position', 'ref_nt', 'ref_nt_check', 'match']]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Codons" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def find_codons(genome):\n", | |
" codons = (a[n:n+3] for n in xrange(0,len(genome),3)) # creates generator\n", | |
"\n", | |
" dict_codons = {}\n", | |
"\n", | |
" for codon in codons:\n", | |
" if dict_codons.has_key(codon):\n", | |
" dict_codons[codon] += 1\n", | |
" else:\n", | |
" dict_codons[codon] = 1\n", | |
" \n", | |
" return pd.Series(dict_codons, index=dict_codons.keys())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"codons = find_codons(gb)\n", | |
"codons.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"codons.order().tail(20).plot(kind='bar', title='most common codons')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Proteins" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Translate sequence into proteins?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"complementary = gb.seq.transcribe() #should this be complement?\n", | |
"proteins = complementary.translate()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"complementary" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Over 6k proteins - should only be like 7?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"proteins = pd.Series([i for i in proteins])\n", | |
"len(proteins)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Leucine and serine most common\n", | |
"\n", | |
"http://www.bioinformatics.org/sms2/iupac.html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"proteins.value_counts().plot(kind='bar', title='Protein frequency')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment