Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ConstantinoSchillebeeckx/cc04cd8d9fb13a9fa01323153089513c to your computer and use it in GitHub Desktop.
Save ConstantinoSchillebeeckx/cc04cd8d9fb13a9fa01323153089513c to your computer and use it in GitHub Desktop.
bash script that uses vsearch to run the equivalent of QIIME's pick_open_reference_otus.py
#!/bin/bash
echo "Vsearch started: $(date)";
echo "";
# generate all the proper directories
mkdir -p step1_otus; mkdir -p step2_otus; mkdir -p step3_otus; mkdir -p step4_otus
# must sort because searching done greedily
# see http://drive5.com/usearch/manual/uparseotu_algo.html
vsearch --fasta_width 0 --sortbysize seqs.fna -output seqs_sorted.fna;
# dereplicate reads
vsearch --derep_fulllength seqs_sorted.fna --output seqs_sorted_derep.fna --minuniquesize 1 --fasta_width 0 --sizeout;
# STEP 1
# search against Green Genes to generate closed ref OTUs centroids
# NOTE: the --db option will need to be updated specifically for your use
# NOTE: this step assumes the sequences associated with the --db will be used as the representative sequenes - this is different than default QIIME behavior (see https://groups.google.com/d/msg/qiime-forum/GKVZbG-Lf_s/14HtQWkcBQAJ)
# remember to set SET MAX_REJECTS, etc
vsearch --fasta_width 0 --usearch_global seqs_sorted_derep.fna --threads 0 --dbmask none --qmask none --id 0.97 --top_hits_only --notmatched step2_otus/closed_ref_fail.fna --db /home/data_repo/pre_processing/otu_support_files/denovo_green_genes/97/rep_set/gg_13_5_pynast_left_2264_right_4052_rep_set.fasta --dbmatched step1_otus/closed_ref_centroids_db.fna --notrunclabels --maxaccepts 50 --maxrejects 50 --iddef 4;
if [ -s step2_otus/closed_ref_fail.fna ]; then
# STEP 2
# randomly subsample 10% of failed closed ref reads
# rename reads as New.ReferenceOTU
# this will already be sorted by abundance since the input is sorted
vsearch --fasta_width 0 --fastx_subsample step2_otus/closed_ref_fail.fna --fastaout step2_otus/closed_ref_fail_subsample.fna --sample_pct 10 --relabel New.ReferenceOTU --notrunclabels --relabel_keep;
# denovo cluster failed closed ref subsample reads
# this will serve as the reference database for new ref OTU
vsearch --fasta_width 0 --cluster_size step2_otus/closed_ref_fail_subsample.fna --clusterout_id --centroids step2_otus/new_ref_db.fna --id 0.97 --qmask none --notrunclabels --iddef 4;
# STEP 3
# search step 2 failures against new ref DB
# hits to DB are New.ReferenceOTU
# failures are considered for New.CleanupReferenceOTU
vsearch --fasta_width 0 --usearch_global step2_otus/closed_ref_fail.fna --threads 0 --dbmask none --qmask none --rowlen 0 --top_hits_only --notmatched step3_otus/new_ref_fail.fna --db step2_otus/new_ref_db.fna --id 0.97 --dbmatched step3_otus/new_ref_centroids.fna --notrunclabels --maxaccepts 50 --maxrejects 50 --iddef 4;
# STEP 4
# denovo cluster of new ref failures
# NOTE: QIIME has an option for skipping this step, see --suppress_step4 (http://qiime.org/scripts/pick_open_reference_otus.html)
if [ -s step3_otus/new_ref_fail.fna ]; then
vsearch --fasta_width 0 --cluster_size step3_otus/new_ref_fail.fna --clusterout_id --centroid step4_otus/new_ref_cleanup_centroids.fna --id 0.97 --qmask none --relabel New.CleanupReferenceOTU --notrunclabels --relabel_keep;
fi
fi
# cat all OTU centroid files together for final searching against input reads
rm rep_set.fna;
cat step1_otus/closed_ref_centroids_db.fna step3_otus/new_ref_centroids.fna step4_otus/new_ref_cleanup_centroids.fna >> rep_set.fna;
# OR cat step1_otus/closed_ref_centroids.fna step3_otus/new_ref_centroids.fna >> rep_set.fna;
# final search of all input reads to OTU centroids for use in generating OTU biom table
vsearch --fasta_width 0 --usearch_global seqs_sorted.fna --top_hits_only --threads 0 --dbmask none --qmask none --db rep_set.fna --id 0.97 --uc final.uc --maxaccepts 50 --maxrejects 50 --iddef 4;
# generate OTU table
biom from-uc -i final.uc -o final.biom;
biom summarize-table -i final.biom -o final.log;
echo "Vsearch finished: $(date)";
echo "";
# filter out those OTUs present only in a single sample
# remove these reads from the rep_set as well
filter_otus_from_otu_table.py -i final.biom -o final_ms2.biom -s 2
if [ -s final_ms2.biom ]; then
biom summarize-table -i final_ms2.biom -o final_ms2.log;
filter_fasta.py -f rep_set.fna -o rep_set_ms2.fna -b final_ms2.biom
# generate Newick tree
parallel_align_seqs_pynast.py -i rep_set_ms2.fna -o pynast_aligned_seqs -T --jobs_to_start 10 --min_length 75
filter_alignment.py -i pynast_aligned_seqs/rep_set_ms2_aligned.fasta -o pynast_aligned_seqs/
make_phylogeny.py -i pynast_aligned_seqs/rep_set_ms2_aligned_pfiltered.fasta -o rep_set.tre
fi
@ConstantinoSchillebeeckx
Copy link
Author

ConstantinoSchillebeeckx commented Aug 26, 2016

fig-1-full

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment