Last active
July 28, 2017 09:30
-
-
Save ConstantinoSchillebeeckx/cc04cd8d9fb13a9fa01323153089513c to your computer and use it in GitHub Desktop.
bash script that uses vsearch to run the equivalent of QIIME's pick_open_reference_otus.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
echo "Vsearch started: $(date)"; | |
echo ""; | |
# generate all the proper directories | |
mkdir -p step1_otus; mkdir -p step2_otus; mkdir -p step3_otus; mkdir -p step4_otus | |
# must sort because searching done greedily | |
# see http://drive5.com/usearch/manual/uparseotu_algo.html | |
vsearch --fasta_width 0 --sortbysize seqs.fna -output seqs_sorted.fna; | |
# dereplicate reads | |
vsearch --derep_fulllength seqs_sorted.fna --output seqs_sorted_derep.fna --minuniquesize 1 --fasta_width 0 --sizeout; | |
# STEP 1 | |
# search against Green Genes to generate closed ref OTUs centroids | |
# NOTE: the --db option will need to be updated specifically for your use | |
# NOTE: this step assumes the sequences associated with the --db will be used as the representative sequenes - this is different than default QIIME behavior (see https://groups.google.com/d/msg/qiime-forum/GKVZbG-Lf_s/14HtQWkcBQAJ) | |
# remember to set SET MAX_REJECTS, etc | |
vsearch --fasta_width 0 --usearch_global seqs_sorted_derep.fna --threads 0 --dbmask none --qmask none --id 0.97 --top_hits_only --notmatched step2_otus/closed_ref_fail.fna --db /home/data_repo/pre_processing/otu_support_files/denovo_green_genes/97/rep_set/gg_13_5_pynast_left_2264_right_4052_rep_set.fasta --dbmatched step1_otus/closed_ref_centroids_db.fna --notrunclabels --maxaccepts 50 --maxrejects 50 --iddef 4; | |
if [ -s step2_otus/closed_ref_fail.fna ]; then | |
# STEP 2 | |
# randomly subsample 10% of failed closed ref reads | |
# rename reads as New.ReferenceOTU | |
# this will already be sorted by abundance since the input is sorted | |
vsearch --fasta_width 0 --fastx_subsample step2_otus/closed_ref_fail.fna --fastaout step2_otus/closed_ref_fail_subsample.fna --sample_pct 10 --relabel New.ReferenceOTU --notrunclabels --relabel_keep; | |
# denovo cluster failed closed ref subsample reads | |
# this will serve as the reference database for new ref OTU | |
vsearch --fasta_width 0 --cluster_size step2_otus/closed_ref_fail_subsample.fna --clusterout_id --centroids step2_otus/new_ref_db.fna --id 0.97 --qmask none --notrunclabels --iddef 4; | |
# STEP 3 | |
# search step 2 failures against new ref DB | |
# hits to DB are New.ReferenceOTU | |
# failures are considered for New.CleanupReferenceOTU | |
vsearch --fasta_width 0 --usearch_global step2_otus/closed_ref_fail.fna --threads 0 --dbmask none --qmask none --rowlen 0 --top_hits_only --notmatched step3_otus/new_ref_fail.fna --db step2_otus/new_ref_db.fna --id 0.97 --dbmatched step3_otus/new_ref_centroids.fna --notrunclabels --maxaccepts 50 --maxrejects 50 --iddef 4; | |
# STEP 4 | |
# denovo cluster of new ref failures | |
# NOTE: QIIME has an option for skipping this step, see --suppress_step4 (http://qiime.org/scripts/pick_open_reference_otus.html) | |
if [ -s step3_otus/new_ref_fail.fna ]; then | |
vsearch --fasta_width 0 --cluster_size step3_otus/new_ref_fail.fna --clusterout_id --centroid step4_otus/new_ref_cleanup_centroids.fna --id 0.97 --qmask none --relabel New.CleanupReferenceOTU --notrunclabels --relabel_keep; | |
fi | |
fi | |
# cat all OTU centroid files together for final searching against input reads | |
rm rep_set.fna; | |
cat step1_otus/closed_ref_centroids_db.fna step3_otus/new_ref_centroids.fna step4_otus/new_ref_cleanup_centroids.fna >> rep_set.fna; | |
# OR cat step1_otus/closed_ref_centroids.fna step3_otus/new_ref_centroids.fna >> rep_set.fna; | |
# final search of all input reads to OTU centroids for use in generating OTU biom table | |
vsearch --fasta_width 0 --usearch_global seqs_sorted.fna --top_hits_only --threads 0 --dbmask none --qmask none --db rep_set.fna --id 0.97 --uc final.uc --maxaccepts 50 --maxrejects 50 --iddef 4; | |
# generate OTU table | |
biom from-uc -i final.uc -o final.biom; | |
biom summarize-table -i final.biom -o final.log; | |
echo "Vsearch finished: $(date)"; | |
echo ""; | |
# filter out those OTUs present only in a single sample | |
# remove these reads from the rep_set as well | |
filter_otus_from_otu_table.py -i final.biom -o final_ms2.biom -s 2 | |
if [ -s final_ms2.biom ]; then | |
biom summarize-table -i final_ms2.biom -o final_ms2.log; | |
filter_fasta.py -f rep_set.fna -o rep_set_ms2.fna -b final_ms2.biom | |
# generate Newick tree | |
parallel_align_seqs_pynast.py -i rep_set_ms2.fna -o pynast_aligned_seqs -T --jobs_to_start 10 --min_length 75 | |
filter_alignment.py -i pynast_aligned_seqs/rep_set_ms2_aligned.fasta -o pynast_aligned_seqs/ | |
make_phylogeny.py -i pynast_aligned_seqs/rep_set_ms2_aligned_pfiltered.fasta -o rep_set.tre | |
fi |
Author
ConstantinoSchillebeeckx
commented
Aug 26, 2016
•
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment