dwinter · February 20, 2015 21:46
diff --git a/Annotation.ipynb b/Annotation.ipynb
 {
 "metadata": {
  "name": "",
  "signature": "sha256:91ecd3617657bf1deb759610599bf558a56f84c456d0c81369c5ed1889f9cef5"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#Annotation of galaxiid transciptromes. \n",
      "\n",
      "Directory structure is \n",
      "```\n",
      "\u251c\u2500\u2500 Annotation.ipynb         #this notebook\n",
      "\u251c\u2500\u2500 BLAST_dbs                #blastDBs\n",
      "\u2514\u2500\u2500 seqs                     \n",
      "    \u251c\u2500\u2500 brev_all_contigs.fasta #brevipinis 454 contigs\n",
      "    \u251c\u2500\u2500 dep_all_contigs.fasta  #depreciceps 454 contigs\n",
      "    \u2514\u2500\u2500 uniprot_sprot.fasta    #swisprot\n",
      "```"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "##Make a Blast DB"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "! makeblastdb -in seqs/uniprot_sprot.fasta -title swissprot -out BLAST_dbs/swissprot -dbtype prot\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "##BLAST each transctriptome\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "! blastx -query seqs/dep_all_contigs.fasta -db BLAST_dbs/swissprot -out dep_swissprot -outfmt 5 -evalue 1e-6 -num_alignments 10\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "! blastx -query seqs/brev_all_contigs.fasta -db BLAST_dbs/swissprot -out brev_swissprot -outfmt 5 -evalue 1e-6 -num_alignments 10"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "##Annotations\n",
      "\n",
      "Mainly, these annotations are going to be used with `BLAST2go` but we will also use them to help us find orthologs in the dNdS sections. So we will record homologues for each gene in SeqRecords, which we'll store in a `BioSQL` database\n",
      "\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from Bio.Blast import NCBIXML\n",
      "from Bio import SeqIO"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def get_sp_id(rec):\n",
      "    \"\"\"Retrieve sequence ID from CLCBio contig\"\"\"\n",
      "    return([a.hit_def.split(\"|\")[1].encode(\"ascii\") for a in rec.alignments])\n",
      "\n",
      "def annotate_transcripts(seq_file, blast_file):\n",
      "    \"\"\"Add best-hit annotation to each of a set on contigs\n",
      "    \n",
      "    seq_file  is a fasta formatted file of contigs\n",
      "    bast_file is an xml formatted file resulting from blasting seq_file against\n",
      "    an annotation database\n",
      "    \n",
      "    Returns a list of SeqRecord objects with a new annotation filed (hard-coded to be\n",
      "    called swissprot), with the id of the top-hit in the xml as a value.\n",
      "    \"\"\"\n",
      "    blast_recs = NCBIXML.parse(open(blast_file))\n",
      "    seq_dict = SeqIO.to_dict(SeqIO.parse(seq_file, \"fasta\"))\n",
      "    annotated = 0\n",
      "    for rec in blast_recs:\n",
      "        query, hits = rec.query.split(\" \")[0], get_sp_id(rec)\n",
      "        if hits:\n",
      "            seq_dict[query].annotations[\"swissprot\"] = hits[0]\n",
      "            annotated += 1\n",
      "        else:\n",
      "            seq_dict[query].annotations[\"swissprot\"] = \"NA\"\n",
      "            \n",
      "    print \"annotated {0} sequences\".format(annotated)\n",
      "    return(seq_dict.values())"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 65
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "Create lists of `SeqRecord` objects with those annotations:"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "dep_annotated = annotate_transcripts(\"seqs/dep_all_contigs.fasta\", \"BLAST_results/dep_v_swissprot.xml\")\n",
      "brev_annotated = annotate_transcripts(\"seqs/brev_all_contigs.fasta\", \"BLAST_results/brev_v_swissprot.xml\")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "annotated 3723 sequences\n",
        "annotated 7525 sequences"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 66
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "##Load annotations into a database for later use\n",
      "\n",
      "We could write those annotations to file (or even to a table), but I wanted to use a database later on . \n",
      "The lines below will only work if a sqllite database called BioSql is living in `galaxiids.sql` living in `../db/`. "
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from BioSQL import BioSeqDatabase\n",
      "server = BioSeqDatabase.open_database(driver=\"sqlite\", user=\"david\", db=\"../db/galaxiids.sqlite\")\n",
      "\n",
      "server.new_database(\"brev_sp\", description=\"Brev annotated with swissprot\")\n",
      "server.new_database(\"dep_sp\", description=\"dep annotated with swissprot\")\n",
      "server.commit()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 67
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from BioSQL import BioSeqDatabase\n",
      "\n",
      "db = server[\"brev_sp\"]\n",
      "print \"loaded {0} sequences from brev into a database\".format(db.load(brev_annotated))\n",
      "server.commit()\n",
      "\n",
      "db = server[\"dep_sp\"]\n",
      "print \"loaded {0} sequneces from dep into a database\".format(db.load(dep_annotated))\n",
      "server.commit()\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "loaded 20197 sequences from brev into a database\n",
        "loaded 11955 sequneces from dep into a database"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 68
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "#ORF annotation\n",
      "Looking for orfs. BLAST each transcriptome againt slamon, trout, cichlid and zebrafish genomes. The BLASTing takes some time....\n"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "! blastx -query seqs/brev_all_contigs.fasta -db BLAST_dbs/salmon -out brev_salmon -outfmt 5 -evalue 1e-6 -num_alignments 10"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "! blastx -query seqs/dep_all_contigs.fasta -db BLAST_dbs/salmon -out dep_salmon -outfmt 5 -evalue 1e-6 -num_alignments 10"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "(repeated for every species' protein databse, which I did via commandline)"
     ]
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Annotating orfs with `orfinder`"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "! findorf join --ref seqs/brev_all_contigs.fasta salmon:BLAST_results/brev_salmon.xml trout:BLAST_results/brev_trout.xml chichlid:BLAST_results/brev_cichlid.xml zebra:BLAST_results/brev_zebra.xml\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[join] loading all contig sequences...\t"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "done.\r\n",
        "[join] processing BLAST file 'chichlid'...\t"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "done.\r\n",
        "[join] processing BLAST file 'salmon'...\t"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "done.\r\n",
        "[join] processing BLAST file 'zebra'...\t"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "done.\r\n",
        "[join] processing BLAST file 'trout'...\t"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "done.\r\n"
       ]
      }
     ],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "! findorf predict --orf seqs/brev_orfs.fasta \n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[predict] loading contig objects..."
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\tdone.\r\n",
        "[predict] predicting and annotating ORFs.\r\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[predict] writing nucleotide sequences..."
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\tdone.\r\n"
       ]
      }
     ],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "! findorf join --ref seqs/dep_all_contigs.fasta --output dep_joined.pkl salmon:BLAST_results/dep_salmon.xml trout:BLAST_results/dep_trout.xml chichlid:BLAST_results/dep_cichlid.xml zebra:BLAST_results/dep_zebra.xml\n"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[join] loading all contig sequences...\t"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "done.\r\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[join] processing BLAST file 'chichlid'...\t"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "done.\r\n",
        "[join] processing BLAST file 'salmon'...\t"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "done.\r\n",
        "[join] processing BLAST file 'zebra'...\t"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "done.\r\n",
        "[join] processing BLAST file 'trout'...\t"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "done.\r\n"
       ]
      }
     ],
     "prompt_number": 4
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "! findorf predict --input dep_joined.pkl --orf seqs/dep_orfs.fasta "
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[predict] loading contig objects..."
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\tdone.\r\n",
        "[predict] predicting and annotating ORFs.\r\n"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[predict] writing nucleotide sequences..."
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\tdone.\r\n"
       ]
      }
     ],
     "prompt_number": 11
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "##how many did we get?"
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "b = len(list(SeqIO.parse(\"seqs/brev_orfs.fasta\", \"fasta\"))) \n",
      "d = len(list(SeqIO.parse(\"seqs/dep_orfs.fasta\", \"fasta\"))) \n",
      "\n",
      "print b, b/20197.0\n",
      "print d, d/11955.0"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "8727"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " 0.432093875328\n",
        "4435 0.370974487662\n"
       ]
      }
     ],
     "prompt_number": 13
    }
   ],
   "metadata": {}
  }
 ]
 }
	{
	"metadata": {
	"name": "",
	"signature": "sha256:91ecd3617657bf1deb759610599bf558a56f84c456d0c81369c5ed1889f9cef5"
	},
	"nbformat": 3,
	"nbformat_minor": 0,
	"worksheets": [
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#Annotation of galaxiid transciptromes. \n",
	"\n",
	"Directory structure is \n",
	"```\n",
	"\u251c\u2500\u2500 Annotation.ipynb #this notebook\n",
	"\u251c\u2500\u2500 BLAST_dbs #blastDBs\n",
	"\u2514\u2500\u2500 seqs \n",
	" \u251c\u2500\u2500 brev_all_contigs.fasta #brevipinis 454 contigs\n",
	" \u251c\u2500\u2500 dep_all_contigs.fasta #depreciceps 454 contigs\n",
	" \u2514\u2500\u2500 uniprot_sprot.fasta #swisprot\n",
	"```"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"##Make a Blast DB"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"! makeblastdb -in seqs/uniprot_sprot.fasta -title swissprot -out BLAST_dbs/swissprot -dbtype prot\n"
	],
	"language": "python",
	"metadata": {},
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"##BLAST each transctriptome\n"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"! blastx -query seqs/dep_all_contigs.fasta -db BLAST_dbs/swissprot -out dep_swissprot -outfmt 5 -evalue 1e-6 -num_alignments 10\n"
	],
	"language": "python",
	"metadata": {},
	"outputs": []
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"! blastx -query seqs/brev_all_contigs.fasta -db BLAST_dbs/swissprot -out brev_swissprot -outfmt 5 -evalue 1e-6 -num_alignments 10"
	],
	"language": "python",
	"metadata": {},
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"##Annotations\n",
	"\n",
	"Mainly, these annotations are going to be used with `BLAST2go` but we will also use them to help us find orthologs in the dNdS sections. So we will record homologues for each gene in SeqRecords, which we'll store in a `BioSQL` database\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"from Bio.Blast import NCBIXML\n",
	"from Bio import SeqIO"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 7
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"def get_sp_id(rec):\n",
	" \"\"\"Retrieve sequence ID from CLCBio contig\"\"\"\n",
	" return([a.hit_def.split(\"\|\")[1].encode(\"ascii\") for a in rec.alignments])\n",
	"\n",
	"def annotate_transcripts(seq_file, blast_file):\n",
	" \"\"\"Add best-hit annotation to each of a set on contigs\n",
	" \n",
	" seq_file is a fasta formatted file of contigs\n",
	" bast_file is an xml formatted file resulting from blasting seq_file against\n",
	" an annotation database\n",
	" \n",
	" Returns a list of SeqRecord objects with a new annotation filed (hard-coded to be\n",
	" called swissprot), with the id of the top-hit in the xml as a value.\n",
	" \"\"\"\n",
	" blast_recs = NCBIXML.parse(open(blast_file))\n",
	" seq_dict = SeqIO.to_dict(SeqIO.parse(seq_file, \"fasta\"))\n",
	" annotated = 0\n",
	" for rec in blast_recs:\n",
	" query, hits = rec.query.split(\" \")[0], get_sp_id(rec)\n",
	" if hits:\n",
	" seq_dict[query].annotations[\"swissprot\"] = hits[0]\n",
	" annotated += 1\n",
	" else:\n",
	" seq_dict[query].annotations[\"swissprot\"] = \"NA\"\n",
	" \n",
	" print \"annotated {0} sequences\".format(annotated)\n",
	" return(seq_dict.values())"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 65
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Create lists of `SeqRecord` objects with those annotations:"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"dep_annotated = annotate_transcripts(\"seqs/dep_all_contigs.fasta\", \"BLAST_results/dep_v_swissprot.xml\")\n",
	"brev_annotated = annotate_transcripts(\"seqs/brev_all_contigs.fasta\", \"BLAST_results/brev_v_swissprot.xml\")"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"annotated 3723 sequences\n",
	"annotated 7525 sequences"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n"
	]
	}
	],
	"prompt_number": 66
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"##Load annotations into a database for later use\n",
	"\n",
	"We could write those annotations to file (or even to a table), but I wanted to use a database later on . \n",
	"The lines below will only work if a sqllite database called BioSql is living in `galaxiids.sql` living in `../db/`. "
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"from BioSQL import BioSeqDatabase\n",
	"server = BioSeqDatabase.open_database(driver=\"sqlite\", user=\"david\", db=\"../db/galaxiids.sqlite\")\n",
	"\n",
	"server.new_database(\"brev_sp\", description=\"Brev annotated with swissprot\")\n",
	"server.new_database(\"dep_sp\", description=\"dep annotated with swissprot\")\n",
	"server.commit()"
	],
	"language": "python",
	"metadata": {},
	"outputs": [],
	"prompt_number": 67
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"from BioSQL import BioSeqDatabase\n",
	"\n",
	"db = server[\"brev_sp\"]\n",
	"print \"loaded {0} sequences from brev into a database\".format(db.load(brev_annotated))\n",
	"server.commit()\n",
	"\n",
	"db = server[\"dep_sp\"]\n",
	"print \"loaded {0} sequneces from dep into a database\".format(db.load(dep_annotated))\n",
	"server.commit()\n"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"loaded 20197 sequences from brev into a database\n",
	"loaded 11955 sequneces from dep into a database"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\n"
	]
	}
	],
	"prompt_number": 68
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"#ORF annotation\n",
	"Looking for orfs. BLAST each transcriptome againt slamon, trout, cichlid and zebrafish genomes. The BLASTing takes some time....\n"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"! blastx -query seqs/brev_all_contigs.fasta -db BLAST_dbs/salmon -out brev_salmon -outfmt 5 -evalue 1e-6 -num_alignments 10"
	],
	"language": "python",
	"metadata": {},
	"outputs": []
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"! blastx -query seqs/dep_all_contigs.fasta -db BLAST_dbs/salmon -out dep_salmon -outfmt 5 -evalue 1e-6 -num_alignments 10"
	],
	"language": "python",
	"metadata": {},
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"(repeated for every species' protein databse, which I did via commandline)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Annotating orfs with `orfinder`"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"! findorf join --ref seqs/brev_all_contigs.fasta salmon:BLAST_results/brev_salmon.xml trout:BLAST_results/brev_trout.xml chichlid:BLAST_results/brev_cichlid.xml zebra:BLAST_results/brev_zebra.xml\n"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[join] loading all contig sequences...\t"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"done.\r\n",
	"[join] processing BLAST file 'chichlid'...\t"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"done.\r\n",
	"[join] processing BLAST file 'salmon'...\t"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"done.\r\n",
	"[join] processing BLAST file 'zebra'...\t"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"done.\r\n",
	"[join] processing BLAST file 'trout'...\t"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"done.\r\n"
	]
	}
	],
	"prompt_number": 2
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"! findorf predict --orf seqs/brev_orfs.fasta \n"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[predict] loading contig objects..."
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\tdone.\r\n",
	"[predict] predicting and annotating ORFs.\r\n"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[predict] writing nucleotide sequences..."
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\tdone.\r\n"
	]
	}
	],
	"prompt_number": 3
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"! findorf join --ref seqs/dep_all_contigs.fasta --output dep_joined.pkl salmon:BLAST_results/dep_salmon.xml trout:BLAST_results/dep_trout.xml chichlid:BLAST_results/dep_cichlid.xml zebra:BLAST_results/dep_zebra.xml\n"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[join] loading all contig sequences...\t"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"done.\r\n"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[join] processing BLAST file 'chichlid'...\t"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"done.\r\n",
	"[join] processing BLAST file 'salmon'...\t"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"done.\r\n",
	"[join] processing BLAST file 'zebra'...\t"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"done.\r\n",
	"[join] processing BLAST file 'trout'...\t"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"done.\r\n"
	]
	}
	],
	"prompt_number": 4
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"! findorf predict --input dep_joined.pkl --orf seqs/dep_orfs.fasta "
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[predict] loading contig objects..."
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\tdone.\r\n",
	"[predict] predicting and annotating ORFs.\r\n"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"[predict] writing nucleotide sequences..."
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"\tdone.\r\n"
	]
	}
	],
	"prompt_number": 11
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"##how many did we get?"
	]
	},
	{
	"cell_type": "code",
	"collapsed": false,
	"input": [
	"b = len(list(SeqIO.parse(\"seqs/brev_orfs.fasta\", \"fasta\"))) \n",
	"d = len(list(SeqIO.parse(\"seqs/dep_orfs.fasta\", \"fasta\"))) \n",
	"\n",
	"print b, b/20197.0\n",
	"print d, d/11955.0"
	],
	"language": "python",
	"metadata": {},
	"outputs": [
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	"8727"
	]
	},
	{
	"output_type": "stream",
	"stream": "stdout",
	"text": [
	" 0.432093875328\n",
	"4435 0.370974487662\n"
	]
	}
	],
	"prompt_number": 13
	}
	],
	"metadata": {}
	}
	]
	}