Skip to content

Instantly share code, notes, and snippets.

@gregcaporaso
Created October 3, 2012 02:23
Show Gist options
  • Select an option

  • Save gregcaporaso/3824571 to your computer and use it in GitHub Desktop.

Select an option

Save gregcaporaso/3824571 to your computer and use it in GitHub Desktop.
Comparison of several features of tools for mapping DNA reads to a DNA reference database
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "mapper-comparison-notes"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"def summarize_observation_tables(biom_fp_glob):\n",
"    \"\"\"Print observation and OTU counts for every BIOM table matching a glob.\n",
"\n",
"    biom_fp_glob: glob pattern selecting the observation tables to summarize.\n",
"\n",
"    For each matching filepath, prints the filepath, then the lines of\n",
"    per_library_stats.py output containing 'observation', then the lines\n",
"    containing 'otu'. Nothing is returned.\n",
"    \"\"\"\n",
"    # imported locally so this cell does not depend on the imports cell having run\n",
"    from glob import glob\n",
"    # glob returns matches in arbitrary order; sort so the listing is deterministic\n",
"    fps = sorted(glob(biom_fp_glob))\n",
"    for fp in fps:\n",
"        # trailing comma keeps the stats output on the same line as the filepath\n",
"        print fp, \" \", \n",
"        # two passes over the stats: 'observation' lines first, then 'otu' lines\n",
"        !per_library_stats.py -i $fp | grep observation\n",
"        !per_library_stats.py -i $fp | grep otu"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"from os.path import join\n",
"from glob import glob\n",
"\n",
"working_dir = \"/Users/caporaso/outbox/mapper_comparisons/ipynb/\"\n",
"input_seqs_fp = join(working_dir,\"seqs.fna\")\n",
"reference_seqs_fp = \"/Users/caporaso/data/gg_otus_4feb2011/rep_set/gg_97_otus_4feb2011.fasta\""
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!count_seqs.py -i $input_seqs_fp"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"91000 : /Users/caporaso/outbox/mapper_comparisons/ipynb/seqs.fna (Sequence lengths (mean +/- std): 1389.3360 +/- 52.7229)\r\n",
"91000 : Total\r\n"
]
}
],
"prompt_number": 11
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"uclust-fast (closed-reference, uclust defaults)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"97% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time pick_otus.py -i $input_seqs_fp -r $reference_seqs_fp --max_accepts 1 --max_rejects 8 --stepwords 8 --word_length 8 -m uclust_ref -C -o uclust-fast_mapped-0.97_v97/ -s 0.97\n",
"!time make_otu_table.py -i uclust-fast_mapped-0.97_v97/seqs_otus.txt -o uclust-fast_mapped-0.97_v97/observation_table.biom"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"real\t2m0.192s\r\n",
"user\t1m59.011s\r\n",
"sys\t0m1.140s\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"real\t0m0.246s\r\n",
"user\t0m0.189s\r\n",
"sys\t0m0.056s\r\n"
]
}
],
"prompt_number": 12
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"94% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time pick_otus.py -i $input_seqs_fp -r $reference_seqs_fp --max_accepts 1 --max_rejects 8 --stepwords 8 --word_length 8 -m uclust_ref -C -o uclust-fast_mapped-0.94_v97/ -s 0.94\n",
"!time make_otu_table.py -i uclust-fast_mapped-0.94_v97/seqs_otus.txt -o uclust-fast_mapped-0.94_v97/observation_table.biom"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"real\t3m11.893s\r\n",
"user\t3m10.740s\r\n",
"sys\t0m1.119s\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"real\t0m0.280s\r\n",
"user\t0m0.228s\r\n",
"sys\t0m0.050s\r\n"
]
}
],
"prompt_number": 13
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"91% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time pick_otus.py -i $input_seqs_fp -r $reference_seqs_fp --max_accepts 1 --max_rejects 8 --stepwords 8 --word_length 8 -m uclust_ref -C -o uclust-fast_mapped-0.91_v97/ -s 0.91\n",
"!time make_otu_table.py -i uclust-fast_mapped-0.91_v97/seqs_otus.txt -o uclust-fast_mapped-0.91_v97/observation_table.biom"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"real\t3m27.139s\r\n",
"user\t3m25.859s\r\n",
"sys\t0m1.165s\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"real\t0m0.338s\r\n",
"user\t0m0.283s\r\n",
"sys\t0m0.052s\r\n"
]
}
],
"prompt_number": 14
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"88% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time pick_otus.py -i $input_seqs_fp -r $reference_seqs_fp --max_accepts 1 --max_rejects 8 --stepwords 8 --word_length 8 -m uclust_ref -C -o uclust-fast_mapped-0.88_v97/ -s 0.88\n",
"!time make_otu_table.py -i uclust-fast_mapped-0.88_v97/seqs_otus.txt -o uclust-fast_mapped-0.88_v97/observation_table.biom"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"real\t5m6.026s\r\n",
"user\t5m4.795s\r\n",
"sys\t0m1.217s\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\r\n",
"real\t0m0.350s\r\n",
"user\t0m0.295s\r\n",
"sys\t0m0.054s\r\n"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"summarize_observation_tables('uclust-fast_mapped-0.*_v97/observation_table.biom')\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"uclust-fast_mapped-0.88_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 77007\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 1419\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"uclust-fast_mapped-0.91_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 62639\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 1667\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"uclust-fast_mapped-0.94_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 48121\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 852\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"uclust-fast_mapped-0.97_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 33315\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 298\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 4
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"uclust-strict (QIIME defaults)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"97% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time pick_otus.py -i $input_seqs_fp -r $reference_seqs_fp -m uclust_ref -C -o uclust-strict_mapped-0.97_v97/ -s 0.97\n",
"!time make_otu_table.py -i uclust-strict_mapped-0.97_v97/seqs_otus.txt -o uclust-strict_mapped-0.97_v97/observation_table.biom"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"94% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time pick_otus.py -i $input_seqs_fp -r $reference_seqs_fp -m uclust_ref -C -o uclust-strict_mapped-0.94_v97/ -s 0.94\n",
"!time make_otu_table.py -i uclust-strict_mapped-0.94_v97/seqs_otus.txt -o uclust-strict_mapped-0.94_v97/observation_table.biom"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"91% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time pick_otus.py -i $input_seqs_fp -r $reference_seqs_fp -m uclust_ref -C -o uclust-strict_mapped-0.91_v97/ -s 0.91\n",
"!time make_otu_table.py -i uclust-strict_mapped-0.91_v97/seqs_otus.txt -o uclust-strict_mapped-0.91_v97/observation_table.biom"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"88% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time pick_otus.py -i $input_seqs_fp -r $reference_seqs_fp -m uclust_ref -C -o uclust-strict_mapped-0.88_v97/ -s 0.88\n",
"!time make_otu_table.py -i uclust-strict_mapped-0.88_v97/seqs_otus.txt -o uclust-strict_mapped-0.88_v97/observation_table.biom"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"summarize_observation_tables('uclust-strict_mapped-0.*_v97/observation_table.biom')\n"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"uclust-strict_mapped-0.88_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 77284\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 1331\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"uclust-strict_mapped-0.91_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 63879\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 982\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"uclust-strict_mapped-0.94_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 49970\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 603\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"uclust-strict_mapped-0.97_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 34771\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 293\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"usearch "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"97% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time map_reads_to_reference.py -i $input_seqs_fp -r $reference_seqs_fp -m usearch -s 0.97 -o usearch_mapped-0.97_v97"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"94% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time map_reads_to_reference.py -i $input_seqs_fp -r $reference_seqs_fp -m usearch -s 0.94 -o usearch_mapped-0.94_v97"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"91% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time map_reads_to_reference.py -i $input_seqs_fp -r $reference_seqs_fp -m usearch -s 0.91 -o usearch_mapped-0.91_v97"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"88% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time map_reads_to_reference.py -i $input_seqs_fp -r $reference_seqs_fp -m usearch -s 0.88 -o usearch_mapped-0.88_v97"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"summarize_observation_tables('usearch_mapped-0.*_v97/observation_table.biom')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"usearch_mapped-0.88_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 88253\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 1831\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"usearch_mapped-0.91_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 73436\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 2373\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"usearch_mapped-0.94_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 53637\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 1245\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"usearch_mapped-0.97_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 37392\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 390\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 6
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"BWA-short"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"BWA doesn't take a percent identity, but rather a maximum number of allowed mismatches (passed below as `--max_diff`). Here I compute what that number should be for each similarity threshold, based on the mean sequence length reported by `count_seqs.py` above."
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# hard-coding the mean length for now (count_seqs.py reported 1389.3360 above; rounded up to 1390)...\n",
"mean_length = 1390\n",
"n97 = str(mean_length - int(0.97 * mean_length))\n",
"n94 = str(mean_length - int(0.94 * mean_length))\n",
"n91 = str(mean_length - int(0.91 * mean_length))\n",
"n88 = str(mean_length - int(0.88 * mean_length))\n",
"print n97, n94, n91, n88"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"42 84 126 167\n"
]
}
],
"prompt_number": 10
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"97% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time map_reads_to_reference.py -i $input_seqs_fp -r $reference_seqs_fp -m bwa-short -o bwa-short_mapped-0.97_v97 --max_diff $n97"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"94% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time map_reads_to_reference.py -i $input_seqs_fp -r $reference_seqs_fp -m bwa-short -o bwa-short_mapped-0.94_v97 --max_diff $n94"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"91% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time map_reads_to_reference.py -i $input_seqs_fp -r $reference_seqs_fp -m bwa-short -o bwa-short_mapped-0.91_v97 --max_diff $n91"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"88% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time map_reads_to_reference.py -i $input_seqs_fp -r $reference_seqs_fp -m bwa-short -o bwa-short_mapped-0.88_v97 --max_diff $n88"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"summarize_observation_tables('bwa-short_mapped-0.*_v97/observation_table.biom')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"bwa-short_mapped-0.88_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 13944\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 110\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"bwa-short_mapped-0.91_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 10499\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 104\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"bwa-short_mapped-0.94_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 6167\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 89\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
" "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"bwa-short_mapped-0.97_v97/observation_table.biom "
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num observations (sequences): 2785\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Num otus: 68\r\n"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "heading",
"level": 1,
"metadata": {},
"source": [
"blat (nt versus nt)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"97% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time map_reads_to_reference.py -i $input_seqs_fp -r $reference_seqs_fp -m blat-nt -s 0.97 -o blat-nt_mapped-0.97_v97"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"94% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time map_reads_to_reference.py -i $input_seqs_fp -r $reference_seqs_fp -m blat-nt -s 0.94 -o blat-nt_mapped-0.94_v97"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"91% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time map_reads_to_reference.py -i $input_seqs_fp -r $reference_seqs_fp -m blat-nt -s 0.91 -o blat-nt_mapped-0.91_v97"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"88% similarity"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"!time map_reads_to_reference.py -i $input_seqs_fp -r $reference_seqs_fp -m blat-nt -s 0.88 -o blat-nt_mapped-0.88_v97"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": "*"
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"summarize_observation_tables('blat-nt_mapped-0.*_v97/observation_table.biom')"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment