Skip to content

Instantly share code, notes, and snippets.

@cmrivers
Created April 10, 2015 17:54
Show Gist options
  • Save cmrivers/660990e009bbf2f7a788 to your computer and use it in GitHub Desktop.
Save cmrivers/660990e009bbf2f7a788 to your computer and use it in GitHub Desktop.
{
"cells": [
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"import Bio\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (<ipython-input-21-bd65af69f6d2>, line 1)",
"output_type": "error",
"traceback": [
"\u001b[0;36m File \u001b[0;32m\"<ipython-input-21-bd65af69f6d2>\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m Bio.\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
]
}
],
"source": [
"Bio."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Reference genome"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'SeqIO' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-23-2ec80b101d48>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mgb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSeqIO\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"./sequence.fasta\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"fasta\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mseq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mSeries\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mgb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mgb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'SeqIO' is not defined"
]
}
],
"source": [
"gb = SeqIO.read(\"./sequence.fasta\", \"fasta\")\n",
"seq = pd.Series([i for i in gb.seq])\n",
"gb"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of base pairs = 19k"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"len(gb)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"GC content = 41.07% GC"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"Bio.SeqUtils.GC(seq)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"seqM = pd.DataFrame({'A':np.nan, 'C':np.nan, 'G':np.nan, 'T':np.nan}, index=np.arange(len(seq)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"for i in seqM.index:\n",
" letter = seq[i]\n",
" seqM.xs(i)[letter] = 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"sns.heatmap(seqM.head(100), vmin=0, vmax=1, yticklabels=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"gc_custom = seqM\n",
"gc_custom['gc'] = np.nan\n",
"gc_custom['gc'][(gc_custom['G'] == 1) | (gc_custom['C'] == 1)] = 1\n",
"gc_custom['gc'] = gc_custom['gc'].fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"fig, ax = plt.subplots()\n",
"ax.set_ylim(0, 1)\n",
"pd.rolling_mean(gc_custom['gc'], 1000).plot(ax=ax, title='GC content of reference genome')\n",
"ax.hlines(y=.5, xmin=0, xmax=19000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SNPs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def type_mutation(ref, contig):\n",
" if ref == 'A':\n",
" print 'A'\n",
" else: print ref"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"snp = pd.read_table(\"out_contigs.used_snps\", sep='\\t', names=['ref_name', 'contig_name', 'ref_position', 'ref_nt', 'contig_nt', 'contig_pos'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"snp['synon'] = np.nan"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Identify mutation type"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"snp.synon[(snp.ref_nt == 'A') & (snp.contig_nt == 'G')] = 'transition'\n",
"snp.synon[(snp.ref_nt == 'G') & (snp.contig_nt == 'A')] = 'transition'\n",
"snp.synon[(snp.ref_nt == 'C') & (snp.contig_nt == 'T')] = 'transition'\n",
"snp.synon[(snp.ref_nt == 'T') & (snp.contig_nt == 'C')] = 'transition'\n",
"\n",
"snp.synon[(snp.ref_nt == 'A') & ((snp.contig_nt == 'C') | (snp.contig_nt == 'T'))] = 'transversion'\n",
"snp.synon[(snp.ref_nt == 'C') & ((snp.contig_nt == 'A') | (snp.contig_nt == 'G'))] = 'transversion'\n",
"snp.synon[(snp.ref_nt == 'G') & ((snp.contig_nt == 'C') | (snp.contig_nt == 'T'))] = 'transversion'\n",
"snp.synon[(snp.ref_nt == 'T') & ((snp.contig_nt == 'A') | (snp.contig_nt == 'G'))] = 'transversion'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"snp.tail()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"83% are transitions (less important) and 17% are transversions (more important)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"snp.synon.value_counts(normalize=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(12, 2))\n",
"ax.set_xlim(0, 19000)\n",
"snp_binary = snp.synon\n",
"snp_binary = snp_binary.replace('transition', 0)\n",
"snp_binary = snp_binary.replace('transversion', 1)\n",
"ax.scatter(snp.ref_position, snp_binary.values, marker='|', s=1000)\n",
"ax.set_title('Location of mutations along the genome')\n",
"ax.grid(False)\n",
"ax.set_yticks([0, 1])\n",
"ax.set_yticklabels(['transition', 'transversion'])\n",
"ax.set_xlabel('Location');"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Check that nucleotide selection works as expected"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nope, it doesn't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def get_nucleotides(position_list, ref_genome):\n",
" nucleotides = []\n",
" for i in position_list:\n",
" nucleotides.append(ref_genome[i+1])\n",
" \n",
" return nucleotides"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"snp['ref_nt_check'] = get_nucleotides(snp['ref_position'], gb.seq)\n",
"snp['match'] = False\n",
"snp['match'][snp.ref_nt == snp.ref_nt_check] = True\n",
"snp[['ref_position', 'ref_nt', 'ref_nt_check', 'match']]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Codons"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def find_codons(genome):\n",
" codons = (a[n:n+3] for n in xrange(0,len(genome),3)) # creates generator\n",
"\n",
" dict_codons = {}\n",
"\n",
" for codon in codons:\n",
" if dict_codons.has_key(codon):\n",
" dict_codons[codon] += 1\n",
" else:\n",
" dict_codons[codon] = 1\n",
" \n",
" return pd.Series(dict_codons, index=dict_codons.keys())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"codons = find_codons(gb)\n",
"codons.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"codons.order().tail(20).plot(kind='bar', title='most common codons')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Proteins"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Translate sequence into proteins?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"complementary = gb.seq.transcribe() #should this be complement?\n",
"proteins = complementary.translate()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"complementary"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Over 6k proteins - should only be like 7?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"proteins = pd.Series([i for i in proteins])\n",
"len(proteins)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Leucine and serine most common\n",
"\n",
"http://www.bioinformatics.org/sms2/iupac.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"proteins.value_counts().plot(kind='bar', title='Protein frequency')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment