ConstantinoSchillebeeckx · July 28, 2017 09:30 · ConstantinoSchillebeeckx · Aug 26, 2016
diff --git a/vsearch_pick_open_reference_otus.sh b/vsearch_pick_open_reference_otus.sh
 #!/bin/bash
 #
 # Open-reference OTU picking driven by vsearch.
 # This first section prints a start banner, creates the per-step output
 # directories, and then sorts and dereplicates the reads found in seqs.fna
 # (expected in the current working directory).

 echo "Vsearch started: $(date)"
 echo ""

 # generate all the proper directories (-p: no error if they already exist)
 mkdir -p step1_otus step2_otus step3_otus step4_otus

 # must sort because searching is done greedily
 # see http://drive5.com/usearch/manual/uparseotu_algo.html
 vsearch --fasta_width 0 --sortbysize seqs.fna --output seqs_sorted.fna

 # dereplicate reads
 vsearch --derep_fulllength seqs_sorted.fna --output seqs_sorted_derep.fna --minuniquesize 1 --fasta_width 0 --sizeout

 # STEP 1
 # Closed-reference pass: search the dereplicated reads against Green Genes.
 # Reads without a hit are written to step2_otus/closed_ref_fail.fna; the
 # database sequences that were hit become the closed-reference centroids.
 # NOTE: point gg_rep_set at the reference database for your own environment.
 # NOTE: the matched *database* sequences are used as the representative
 #       sequences - this is different than default QIIME behavior
 #       (see https://groups.google.com/d/msg/qiime-forum/GKVZbG-Lf_s/14HtQWkcBQAJ)
 # Remember to tune --maxaccepts / --maxrejects etc. for your data.
 gg_rep_set=/home/data_repo/pre_processing/otu_support_files/denovo_green_genes/97/rep_set/gg_13_5_pynast_left_2264_right_4052_rep_set.fasta
 vsearch --fasta_width 0 \
     --usearch_global seqs_sorted_derep.fna \
     --threads 0 \
     --dbmask none --qmask none \
     --id 0.97 \
     --top_hits_only \
     --notmatched step2_otus/closed_ref_fail.fna \
     --db "$gg_rep_set" \
     --dbmatched step1_otus/closed_ref_centroids_db.fna \
     --notrunclabels \
     --maxaccepts 50 --maxrejects 50 \
     --iddef 4


 # Steps 2-4 only run when Step 1 left unmatched reads, i.e. when
 # step2_otus/closed_ref_fail.fna exists and is non-empty.
 if [ -s step2_otus/closed_ref_fail.fna ]; then
     # STEP 2
     # randomly subsample 10% of failed closed ref reads
     # rename reads as New.ReferenceOTU
     # this will already be sorted by abundance since the input is sorted
     vsearch --fasta_width 0 \
         --fastx_subsample step2_otus/closed_ref_fail.fna \
         --fastaout step2_otus/closed_ref_fail_subsample.fna \
         --sample_pct 10 \
         --relabel New.ReferenceOTU --notrunclabels --relabel_keep

     # denovo cluster failed closed ref subsample reads
     # this will serve as the reference database for new ref OTU
     vsearch --fasta_width 0 \
         --cluster_size step2_otus/closed_ref_fail_subsample.fna \
         --clusterout_id \
         --centroids step2_otus/new_ref_db.fna \
         --id 0.97 --qmask none --notrunclabels --iddef 4

     # STEP 3
     # search step 2 failures against new ref DB
     # hits to DB are New.ReferenceOTU
     # failures are considered for New.CleanupReferenceOTU
     vsearch --fasta_width 0 \
         --usearch_global step2_otus/closed_ref_fail.fna \
         --threads 0 --dbmask none --qmask none --rowlen 0 \
         --top_hits_only \
         --notmatched step3_otus/new_ref_fail.fna \
         --db step2_otus/new_ref_db.fna \
         --id 0.97 \
         --dbmatched step3_otus/new_ref_centroids.fna \
         --notrunclabels --maxaccepts 50 --maxrejects 50 --iddef 4

     # STEP 4
     # denovo cluster of new ref failures (skipped when Step 3 matched everything)
     # NOTE: QIIME has an option for skipping this step, see --suppress_step4 (http://qiime.org/scripts/pick_open_reference_otus.html)
     # NOTE(review): unlike the other search/cluster calls in this script, this
     # one passes no --iddef; confirm whether --iddef 4 was intended here.
     if [ -s step3_otus/new_ref_fail.fna ]; then
         vsearch --fasta_width 0 \
             --cluster_size step3_otus/new_ref_fail.fna \
             --clusterout_id \
             --centroids step4_otus/new_ref_cleanup_centroids.fna \
             --id 0.97 --qmask none \
             --relabel New.CleanupReferenceOTU --notrunclabels --relabel_keep
     fi

 fi

 # cat all OTU centroid files together for final searching against input reads
 #
 # build_rep_set OUT FILE...
 #   Recreates OUT from scratch and appends every FILE that exists and is
 #   non-empty, in the order given. Missing inputs are skipped silently: the
 #   step3/step4 centroid files are never written when Steps 2-4 are skipped.
 build_rep_set() {
     local out=$1
     shift
     local src
     # -f: a missing output from a previous run is not an error
     rm -f -- "$out"
     : > "$out"
     for src in "$@"; do
         if [ -s "$src" ]; then
             cat -- "$src" >> "$out"
         fi
     done
 }
 build_rep_set rep_set.fna step1_otus/closed_ref_centroids_db.fna step3_otus/new_ref_centroids.fna step4_otus/new_ref_cleanup_centroids.fna
 # OR cat step1_otus/closed_ref_centroids.fna step3_otus/new_ref_centroids.fna >> rep_set.fna;

 # Map every input read (the sorted, non-dereplicated set) onto the combined
 # OTU centroids; the resulting .uc file is what the OTU biom table is built from.
 vsearch --fasta_width 0 \
     --usearch_global seqs_sorted.fna \
     --top_hits_only \
     --threads 0 \
     --dbmask none --qmask none \
     --db rep_set.fna \
     --id 0.97 \
     --uc final.uc \
     --maxaccepts 50 --maxrejects 50 \
     --iddef 4

 # Turn the .uc mapping into an OTU table and write its summary to final.log.
 biom from-uc -i final.uc -o final.biom
 biom summarize-table -i final.biom -o final.log

 echo "Vsearch finished: $(date)"
 echo ""


 # Drop OTUs that are present in only a single sample (-s 2), then remove the
 # corresponding reads from the representative set as well.
 filter_otus_from_otu_table.py -i final.biom -o final_ms2.biom -s 2

 # Everything below needs a non-empty filtered table.
 if test -s final_ms2.biom; then
     biom summarize-table -i final_ms2.biom -o final_ms2.log
     filter_fasta.py -f rep_set.fna -o rep_set_ms2.fna -b final_ms2.biom

     # Newick tree: align the filtered representative set, filter the
     # alignment, then build the phylogeny.
     parallel_align_seqs_pynast.py \
         -i rep_set_ms2.fna \
         -o pynast_aligned_seqs \
         -T --jobs_to_start 10 --min_length 75
     filter_alignment.py \
         -i pynast_aligned_seqs/rep_set_ms2_aligned.fasta \
         -o pynast_aligned_seqs/
     make_phylogeny.py \
         -i pynast_aligned_seqs/rep_set_ms2_aligned_pfiltered.fasta \
         -o rep_set.tre
 fi
	#!/bin/bash
	#
	# Open-reference OTU picking driven by vsearch.
	# This first section prints a start banner, creates the per-step output
	# directories, and then sorts and dereplicates the reads found in seqs.fna
	# (expected in the current working directory).

	echo "Vsearch started: $(date)"
	echo ""

	# generate all the proper directories (-p: no error if they already exist)
	mkdir -p step1_otus step2_otus step3_otus step4_otus

	# must sort because searching is done greedily
	# see http://drive5.com/usearch/manual/uparseotu_algo.html
	vsearch --fasta_width 0 --sortbysize seqs.fna --output seqs_sorted.fna

	# dereplicate reads
	vsearch --derep_fulllength seqs_sorted.fna --output seqs_sorted_derep.fna --minuniquesize 1 --fasta_width 0 --sizeout

	# STEP 1
	# Closed-reference pass: search the dereplicated reads against Green Genes.
	# Reads without a hit are written to step2_otus/closed_ref_fail.fna; the
	# database sequences that were hit become the closed-reference centroids.
	# NOTE: point gg_rep_set at the reference database for your own environment.
	# NOTE: the matched *database* sequences are used as the representative
	#       sequences - this is different than default QIIME behavior
	#       (see https://groups.google.com/d/msg/qiime-forum/GKVZbG-Lf_s/14HtQWkcBQAJ)
	# Remember to tune --maxaccepts / --maxrejects etc. for your data.
	gg_rep_set=/home/data_repo/pre_processing/otu_support_files/denovo_green_genes/97/rep_set/gg_13_5_pynast_left_2264_right_4052_rep_set.fasta
	vsearch --fasta_width 0 \
		--usearch_global seqs_sorted_derep.fna \
		--threads 0 \
		--dbmask none --qmask none \
		--id 0.97 \
		--top_hits_only \
		--notmatched step2_otus/closed_ref_fail.fna \
		--db "$gg_rep_set" \
		--dbmatched step1_otus/closed_ref_centroids_db.fna \
		--notrunclabels \
		--maxaccepts 50 --maxrejects 50 \
		--iddef 4


	# Steps 2-4 only run when Step 1 left unmatched reads, i.e. when
	# step2_otus/closed_ref_fail.fna exists and is non-empty.
	if [ -s step2_otus/closed_ref_fail.fna ]; then
		# STEP 2
		# randomly subsample 10% of failed closed ref reads
		# rename reads as New.ReferenceOTU
		# this will already be sorted by abundance since the input is sorted
		vsearch --fasta_width 0 \
			--fastx_subsample step2_otus/closed_ref_fail.fna \
			--fastaout step2_otus/closed_ref_fail_subsample.fna \
			--sample_pct 10 \
			--relabel New.ReferenceOTU --notrunclabels --relabel_keep

		# denovo cluster failed closed ref subsample reads
		# this will serve as the reference database for new ref OTU
		vsearch --fasta_width 0 \
			--cluster_size step2_otus/closed_ref_fail_subsample.fna \
			--clusterout_id \
			--centroids step2_otus/new_ref_db.fna \
			--id 0.97 --qmask none --notrunclabels --iddef 4

		# STEP 3
		# search step 2 failures against new ref DB
		# hits to DB are New.ReferenceOTU
		# failures are considered for New.CleanupReferenceOTU
		vsearch --fasta_width 0 \
			--usearch_global step2_otus/closed_ref_fail.fna \
			--threads 0 --dbmask none --qmask none --rowlen 0 \
			--top_hits_only \
			--notmatched step3_otus/new_ref_fail.fna \
			--db step2_otus/new_ref_db.fna \
			--id 0.97 \
			--dbmatched step3_otus/new_ref_centroids.fna \
			--notrunclabels --maxaccepts 50 --maxrejects 50 --iddef 4

		# STEP 4
		# denovo cluster of new ref failures (skipped when Step 3 matched everything)
		# NOTE: QIIME has an option for skipping this step, see --suppress_step4 (http://qiime.org/scripts/pick_open_reference_otus.html)
		# NOTE(review): unlike the other search/cluster calls in this script, this
		# one passes no --iddef; confirm whether --iddef 4 was intended here.
		if [ -s step3_otus/new_ref_fail.fna ]; then
			vsearch --fasta_width 0 \
				--cluster_size step3_otus/new_ref_fail.fna \
				--clusterout_id \
				--centroids step4_otus/new_ref_cleanup_centroids.fna \
				--id 0.97 --qmask none \
				--relabel New.CleanupReferenceOTU --notrunclabels --relabel_keep
		fi

	fi

	# cat all OTU centroid files together for final searching against input reads
	#
	# build_rep_set OUT FILE...
	#   Recreates OUT from scratch and appends every FILE that exists and is
	#   non-empty, in the order given. Missing inputs are skipped silently: the
	#   step3/step4 centroid files are never written when Steps 2-4 are skipped.
	build_rep_set() {
		local out=$1
		shift
		local src
		# -f: a missing output from a previous run is not an error
		rm -f -- "$out"
		: > "$out"
		for src in "$@"; do
			if [ -s "$src" ]; then
				cat -- "$src" >> "$out"
			fi
		done
	}
	build_rep_set rep_set.fna step1_otus/closed_ref_centroids_db.fna step3_otus/new_ref_centroids.fna step4_otus/new_ref_cleanup_centroids.fna
	# OR cat step1_otus/closed_ref_centroids.fna step3_otus/new_ref_centroids.fna >> rep_set.fna;

	# Map every input read (the sorted, non-dereplicated set) onto the combined
	# OTU centroids; the resulting .uc file is what the OTU biom table is built from.
	vsearch --fasta_width 0 \
		--usearch_global seqs_sorted.fna \
		--top_hits_only \
		--threads 0 \
		--dbmask none --qmask none \
		--db rep_set.fna \
		--id 0.97 \
		--uc final.uc \
		--maxaccepts 50 --maxrejects 50 \
		--iddef 4

	# Turn the .uc mapping into an OTU table and write its summary to final.log.
	biom from-uc -i final.uc -o final.biom
	biom summarize-table -i final.biom -o final.log

	echo "Vsearch finished: $(date)"
	echo ""


	# Drop OTUs that are present in only a single sample (-s 2), then remove the
	# corresponding reads from the representative set as well.
	filter_otus_from_otu_table.py -i final.biom -o final_ms2.biom -s 2

	# Everything below needs a non-empty filtered table.
	if test -s final_ms2.biom; then
		biom summarize-table -i final_ms2.biom -o final_ms2.log
		filter_fasta.py -f rep_set.fna -o rep_set_ms2.fna -b final_ms2.biom

		# Newick tree: align the filtered representative set, filter the
		# alignment, then build the phylogeny.
		parallel_align_seqs_pynast.py \
			-i rep_set_ms2.fna \
			-o pynast_aligned_seqs \
			-T --jobs_to_start 10 --min_length 75
		filter_alignment.py \
			-i pynast_aligned_seqs/rep_set_ms2_aligned.fasta \
			-o pynast_aligned_seqs/
		make_phylogeny.py \
			-i pynast_aligned_seqs/rep_set_ms2_aligned_pfiltered.fasta \
			-o rep_set.tre
	fi