chapmanb · August 15, 2017 14:51
diff --git a/iconic_ucl_pipeline_changes.diff b/iconic_ucl_pipeline_changes.diff
 --- command_line_HIV_pipeline.sh.orig	2016-12-02 15:10:02.000000000 -0500
 +++ command_line_HIV_pipeline.sh	2017-02-01 22:01:49.000000000 -0500
 @@ -8,40 +8,65 @@
 #$ -wd /path/to/working_directory
 #$ -t 1-400
 
 +set -eu -o pipefail
 +
 +# Find original directory of bash script, resovling symlinks
 +# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in/246128#246128
 +SOURCE="${BASH_SOURCE[0]}"
 +while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
 +    DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
 +    SOURCE="$(readlink "$SOURCE")"
 +    [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
 +done
 +DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
 +pipelineScriptDir=$DIR
 
 # For a full run you only need to specify the results folder and in a tab-delimited file
 # a list of sample IDs and the path to their forward and reverse reads 
 # (these can be gzipped if necessary).
 
 -ResultsFolder=/path/to/results_folder ; 
 -paramfile=/path/to/working_directory/tab_delimited_table_of_sampleIDs_and_readpaths
 +number=${1:-1}
 +cores=8
 +workDir=`pwd`/work
 +ResultsFolder=`pwd`/results
 +paramfile=samples.tsv
 +adaptersFile=$DIR/../Data/adapters.fa
 +decoyFile=$DIR/../Data/hg38_hiv
 +refSeqScaffoldFasta=$DIR/../Data/cons_C.lastz.CWG.fasta
 +refSeqBlastDb=$DIR/../Data/cons_C.fasta
 +#refSeqScaffoldFasta=$DIR/../Data/hiv-1.lastz.CWG.fasta 
 +#refSeqBlastDb=$DIR/../Data/hiv-1.fasta
 +
 # e.g. of paramfile: tab-delimited text file
 # sample1	/path/to/sample1_fwd_reads.fq	/path/to/sample1_rev_reads.fq
 # sample2	/path/to/sample2_fwd_reads.fq	/path/to/sample2_rev_reads.fq
 # etc.
 
 +
 sample=`sed -n ${number}p $paramfile | awk '{print $1}'`
 Reads1=`sed -n ${number}p $paramfile | awk '{print $2}'`
 Reads2=`sed -n ${number}p $paramfile | awk '{print $3}'`
 
 +workDir=$workDir/$sample
 +mkdir -p $workDir
 
 # Set the working directory to either $TMPDIR on the cluster or $wd on desktop
 
 ##################################
 ## For cluster                  ##
 ## i.e. commment out on desktop ##
 -   cd $TMPDIR ; 
 +#cd $TMPDIR ; 
 # # # OR # # # # # # # # #  # # ##
 ## For desktop                  ##
 ## i.e. comment out on Legion   ##
 -#  cd /path/to/working_directory #                
 +cd $workDir
 ##################################
 
 
 
 mkdir -p output/temp;
 #NB IVA fails if /path/to/working_directory/output/assembly exists
 -mkdir logs;
 +mkdir -p logs;
 
 mkdir -p $ResultsFolder/$sample/output/temp ; 
 mkdir -p $ResultsFolder/$sample/output/assembly ;
 @@ -49,13 +74,14 @@
 cd output;
 
 ###TRIM READS###
 -trimmomatic-0.33.jar PE -threads 8 $Reads1 $Reads2 "temp/"$sample"_hq_1.fastq" "temp/"$sample"_unpaired_1.fastq" "temp/"$sample"_hq_2.fastq" "temp/"$sample"_unpaired_2.fastq" ILLUMINACLIP:/path/to/adapters.fasta:2:10:7 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:30 MINLEN:50 ;
 +trimmomatic PE -threads $cores $Reads1 $Reads2 "temp/"$sample"_hq_1.fastq" "temp/"$sample"_unpaired_1.fastq" "temp/"$sample"_hq_2.fastq" "temp/"$sample"_unpaired_2.fastq" ILLUMINACLIP:$adaptersFile:2:10:7 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:30 MINLEN:50 ;
 rm temp/*unpaired_*.fastq ; 
 -cp "temp/"*hq*.fastq $ResultsFolder/$sample"/output/temp/". ; 
 -rm temp/*hq_*.fastq ; 
 +cp "temp/"*hq*.fastq $ResultsFolder/$sample"/output/temp/". ;
 +# Not sure why this gets removed, since referenced in smalt step next
 +#rm temp/*hq_*.fastq ; 
 
 ###DECOY STEP START###
 -smalt map -x -y 0.5 -i 500 -n 8 /path/to/decoy/hg38_hiv_k15_s3 "temp/"$sample"_hq_1.fastq" "temp/"$sample"_hq_2.fastq" | awk '{if ($3 !~ /^chr|\*/ && $7 !~ /^chr|\*/) print $0}' > "temp/"$sample"_pairs.sam" ;
 +smalt map -x -y 0.5 -i 500 -n $cores $decoyFile "temp/"$sample"_hq_1.fastq" "temp/"$sample"_hq_2.fastq" | awk '{if ($3 !~ /^chr|\*/ && $7 !~ /^chr|\*/) print $0}' > "temp/"$sample"_pairs.sam" ;
 samtools view -bhf 64 "temp/"$sample"_pairs.sam" | samtools bam2fq -  > $sample"_filtered_1.fastq" ;
 samtools view -bhf 128 "temp/"$sample"_pairs.sam" | samtools bam2fq -  > $sample"_filtered_2.fastq" ;
 rm temp/*.sam;
 @@ -67,10 +93,10 @@
 
 ###Starting IVA###
 #For now, leave out trimmomatic from IVA - time is of the essence. Can always repeat and see if it makes a difference...
 -iva --max_contigs 50 -t 8 -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly   ;
 +iva --max_contigs 50 -t $cores -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly   ;
 
 # OR if you have primers: I didn't for this example
 -# iva --trimmomatic /path/to/trimmomatic-0.33.jar -pcr_primers /path/to/primers.fasta --adapters /path/to/adapters.fasta --max_contigs 50 -t 8 -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly   ;
 +# iva --trimmomatic /path/to/trimmomatic-0.33.jar -pcr_primers /path/to/primers.fasta --adapters /path/to/adapters.fasta --max_contigs 50 -t $cores -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly   ;
 # # # # 
 
 
 @@ -79,45 +105,45 @@
 ## Copy contigs across to results folder in case job has taken ages - otherwise lose all data when wall clock time is up...###
 cp "assembly/"$sample".contigs.fasta" $ResultsFolder/$sample"/output/assembly"/. ;
 ### IVA done - now finding best reference###
 -lastz "assembly/"$sample".contigs.fasta"[multiple] /path/to/reference/sequences/hiv-1.lastz.CDS.fasta --ambiguous=IUPAC --format=GENERAL > "assembly/"$sample".contigs.lastz" ; 
 -perl -s /path/to/pipeline/scripts/lastz_bestref.pl -contig_lastz="assembly/"$sample.contigs.lastz  -blastdb=/path/to/reference/sequences/BLAST/database/BLAST_viral_DB -best_ref_fasta=$sample"-ref.fasta" -lastz_best_hit_log="../logs/"$sample"_lastz_besthit.log" ; 
 +lastz "assembly/"$sample".contigs.fasta"[multiple] $refSeqScaffoldFasta --ambiguous=iupac --format=general > "assembly/"$sample".contigs.lastz" ; 
 +perl -s $pipelineScriptDir/lastz_bestref.pl -contig_lastz="assembly/"$sample.contigs.lastz  -blastdb=$refSeqBlastDb -best_ref_fasta=$sample"-ref.fasta" -lastz_best_hit_log="../logs/"$sample"_lastz_besthit.log" ; 
 
 ### Have best reference, now assemble draft genome ###
 -lastz $sample"-ref.fasta" "assembly/"$sample".contigs.fasta" --ambiguous=IUPAC > "assembly/"$sample".contigs-vs-bestref.lav" ; 
 -perl -w -s /path/to/pipeline/scripts/lastz_analyser.WITH_REVCOMP.pl -reference_fasta_file=$sample"-ref.fasta" -sample_fasta_file=assembly/$sample".contigs.fasta" -lastz_results_file="assembly/"$sample".contigs-vs-bestref.lav" -cutoff=50000 -with_revcomp=yes -output=temp/$sample".lastz_analysed_file" -log_file="../logs/"$sample"_lastz_analyser.log" ;
 +lastz $sample"-ref.fasta" "assembly/"$sample".contigs.fasta" --ambiguous=iupac > "assembly/"$sample".contigs-vs-bestref.lav" ; 
 +perl -w -s $pipelineScriptDir/lastz_analyser.WITH_REVCOMP.pl -reference_fasta_file=$sample"-ref.fasta" -sample_fasta_file=assembly/$sample".contigs.fasta" -lastz_results_file="assembly/"$sample".contigs-vs-bestref.lav" -cutoff=50000 -with_revcomp=yes -output=temp/$sample".lastz_analysed_file" -log_file="../logs/"$sample"_lastz_analyser.log" ;
 ## map reads to contigs to work out which sequences to choose when contigs overlap
 smalt index -k 15 -s 3 "assembly/"$sample".contigs.k15_s3" "assembly/"$sample".contigs.fasta" ; 
 -smalt map -x -y 0.5 -i 500 -n 8 -f bam -o "temp/"$sample".contigs.bam" "assembly/"$sample".contigs.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ; 
 -samtools sort -@ 8 -T temp_sort -o "temp/"$sample".contigs.sorted.bam"  "temp/"$sample".contigs.bam"
 +smalt map -x -y 0.5 -i 500 -n $cores -f bam -o "temp/"$sample".contigs.bam" "assembly/"$sample".contigs.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ; 
 +samtools sort -@ $cores -T temp_sort -o "temp/"$sample".contigs.sorted.bam"  "temp/"$sample".contigs.bam"
 samtools index "temp/"$sample".contigs.sorted.bam"
 samtools mpileup -f "assembly/"$sample".contigs.fasta" -d 1000000 -o "temp/"$sample".contigs.mpileup" "temp/"$sample".contigs.sorted.bam";
 awk {'print $1"\t"$2"\t"$3"\t"$4'} "temp/"$sample".contigs.mpileup" > $sample".contigs.fake_mpileup" ; 
 -perl -w -s /path/to/pipeline/scripts/genome_maker2b.pl -sample_pileup_file="temp/"$sample".contigs.mpileup" -contigs="assembly/"$sample".contigs.fasta" -reference_mapped_consensus=$sample"-ref.fasta" -lastz_analysed_file="temp/"$sample".lastz_analysed_file" -ref_correct_start=0 -ref_correct_stop=20000 -output="temp/"$sample"-genome.fasta" -logfile="../logs/"$sample"_genome_maker.log" ;
 +perl -w -s $pipelineScriptDir/genome_maker2b.pl -sample_pileup_file="temp/"$sample".contigs.mpileup" -contigs="assembly/"$sample".contigs.fasta" -reference_mapped_consensus=$sample"-ref.fasta" -lastz_analysed_file="temp/"$sample".lastz_analysed_file" -ref_correct_start=0 -ref_correct_stop=20000 -output="temp/"$sample"-genome.fasta" -logfile="../logs/"$sample"_genome_maker.log" ;
 rm temp/*.bam ;
 rm temp/*.mpileup ;
 ##now we've made the draft genome, time to map all the reads onto it and make the first consensus sequence...	
 smalt index -k 15 -s 3 "temp/"$sample"-genome.k15_s3" "temp/"$sample"-genome.fasta" ; 
 -smalt map -x -y 0.5 -i 500 -n 8 -f bam -o "temp/"$sample"-genome.bam" "temp/"$sample"-genome.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ; 
 -samtools sort -@ 8 -T temp_sort -o "temp/"$sample"-genome.sorted.bam"  "temp/"$sample"-genome.bam"
 +smalt map -x -y 0.5 -i 500 -n $cores -f bam -o "temp/"$sample"-genome.bam" "temp/"$sample"-genome.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ; 
 +samtools sort -@ $cores -T temp_sort -o "temp/"$sample"-genome.sorted.bam"  "temp/"$sample"-genome.bam"
 samtools index "temp/"$sample"-genome.sorted.bam"
 samtools mpileup -f "temp/"$sample"-genome.fasta" -d 1000000 -o "temp/"$sample"-genome.mpileup" "temp/"$sample"-genome.sorted.bam";
 awk {'print $1"\t"$2"\t"$3"\t"$4'} "temp/"$sample"-genome.mpileup" > "temp/"$sample"-genome.fake_mpileup" ;
 rm temp/*.bam;
 -perl -w -s /path/to/pipeline/scripts/cons_mv.pl -mpileup="temp/"$sample"-genome.mpileup" -reference_fasta="temp/"$sample"-genome.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus1.preNcut.fasta" -mv_out="temp/"$sample"-genome.fasta.mv" -base_freq_out="temp/"$sample"-genome.fasta.basefreqs.tsv" ;
 -perl -w -s /path/to/pipeline/scripts/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus1.preNcut.fasta" > "temp/"$sample".consensus1.fasta";
 +perl -w -s $pipelineScriptDir/cons_mv.pl -mpileup="temp/"$sample"-genome.mpileup" -reference_fasta="temp/"$sample"-genome.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus1.preNcut.fasta" -mv_out="temp/"$sample"-genome.fasta.mv" -base_freq_out="temp/"$sample"-genome.fasta.basefreqs.tsv" ;
 +perl -w -s $pipelineScriptDir/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus1.preNcut.fasta" > "temp/"$sample".consensus1.fasta";
 rm temp/*.mpileup;
 ##experience shows that one round of iteration is rarely enough so do one more...
 smalt index -k 15 -s 3 "temp/"$sample".consensus1.k15_s3" "temp/"$sample".consensus1.fasta" ; 
 -smalt map -x -y 0.5 -i 500 -n 8 -f bam -o "temp/"$sample".consensus1.bam" "temp/"$sample".consensus1.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
 -samtools sort -@ 8 -T temp_sort -o "temp/"$sample".consensus1.sorted.bam"  "temp/"$sample".consensus1.bam" ; 
 +smalt map -x -y 0.5 -i 500 -n $cores -f bam -o "temp/"$sample".consensus1.bam" "temp/"$sample".consensus1.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
 +samtools sort -@ $cores -T temp_sort -o "temp/"$sample".consensus1.sorted.bam"  "temp/"$sample".consensus1.bam" ; 
 samtools index "temp/"$sample".consensus1.sorted.bam" ; 
 samtools mpileup -f "temp/"$sample".consensus1.fasta" -d 1000000 -o "temp/"$sample".consensus1.mpileup" "temp/"$sample".consensus1.sorted.bam";
 awk {'print $1"\t"$2"\t"$3"\t"$4'} "temp/"$sample".consensus1.mpileup" > "temp/"$sample".consensus1.fake_mpileup" ;
 rm temp/*.bam;
 -perl -w -s /path/to/pipeline/scripts/cons_mv.pl -mpileup="temp/"$sample".consensus1.mpileup" -reference_fasta="temp/"$sample".consensus1.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus2.preNcut.fasta" -mv_out="temp/"$sample".consensus1.fasta.mv" -base_freq_out="temp/"$sample".consensus1.fasta.basefreqs.tsv" ;
 -perl -w -s /path/to/pipeline/scripts/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus2.preNcut.fasta" > "temp/"$sample".consensus2.fasta";
 +perl -w -s $pipelineScriptDir/cons_mv.pl -mpileup="temp/"$sample".consensus1.mpileup" -reference_fasta="temp/"$sample".consensus1.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus2.preNcut.fasta" -mv_out="temp/"$sample".consensus1.fasta.mv" -base_freq_out="temp/"$sample".consensus1.fasta.basefreqs.tsv" ;
 +perl -w -s $pipelineScriptDir/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus2.preNcut.fasta" > "temp/"$sample".consensus2.fasta";
 rm temp/*.mpileup;		
 -perl -w -s /path/to/pipeline/scripts/majvarcheck2.pl -mvpath="temp/"$sample".consensus1.fasta.mv" -basefreq="temp/"$sample".consensus1.fasta.basefreqs.tsv" -fwdreads=$sample"_filtered_1.fastq" -revreads=$sample"_filtered_2.fastq";	
 +perl -w -s $pipelineScriptDir/majvarcheck2.pl -mvpath="temp/"$sample".consensus1.fasta.mv" -basefreq="temp/"$sample".consensus1.fasta.basefreqs.tsv" -fwdreads=$sample"_filtered_1.fastq" -revreads=$sample"_filtered_2.fastq";	
 
 ### All done - now copy results from working directory to results directory
 ### This is essential when running a job on the cluster, otherwise can do at your
 --- majvarcheck2.pl.orig	2016-12-02 15:10:03.000000000 -0500
 +++ majvarcheck2.pl	2017-06-22 13:18:44.720066194 -0400
 @@ -75,7 +75,7 @@
       system("smalt index -k 15 -s 3 $smalt_index $consensus_fasta");
       system("smalt map -x -y 0.5 -i 500 -n 8 -o $sam $smalt_index $fwdreads $revreads");
       system("samtools view -@ 8 -bh -S $sam > $bam");
 -      system("samtools sort -@ 8  $bam $sorted");
 +      system("samtools sort -@ 8 -T $bam.tmpsort -o $sorted_bam $bam");
       system("samtools faidx $consensus_fasta");
       system("samtools mpileup -f $consensus_fasta $sorted_bam -d 1000000 > $pileup");
 	$fake_mpileup = $pileup;
	--- command_line_HIV_pipeline.sh.orig 2016-12-02 15:10:02.000000000 -0500
	+++ command_line_HIV_pipeline.sh 2017-02-01 22:01:49.000000000 -0500
	@@ -8,40 +8,65 @@
	#$ -wd /path/to/working_directory
	#$ -t 1-400

	+set -eu -o pipefail
	+
	+# Find original directory of bash script, resovling symlinks
	+# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in/246128#246128
	+SOURCE="${BASH_SOURCE[0]}"
	+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
	+ DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
	+ SOURCE="$(readlink "$SOURCE")"
	+ [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
	+done
	+DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
	+pipelineScriptDir=$DIR

	# For a full run you only need to specify the results folder and in a tab-delimited file
	# a list of sample IDs and the path to their forward and reverse reads
	# (these can be gzipped if necessary).

	-ResultsFolder=/path/to/results_folder ;
	-paramfile=/path/to/working_directory/tab_delimited_table_of_sampleIDs_and_readpaths
	+number=${1:-1}
	+cores=8
	+workDir=`pwd`/work
	+ResultsFolder=`pwd`/results
	+paramfile=samples.tsv
	+adaptersFile=$DIR/../Data/adapters.fa
	+decoyFile=$DIR/../Data/hg38_hiv
	+refSeqScaffoldFasta=$DIR/../Data/cons_C.lastz.CWG.fasta
	+refSeqBlastDb=$DIR/../Data/cons_C.fasta
	+#refSeqScaffoldFasta=$DIR/../Data/hiv-1.lastz.CWG.fasta
	+#refSeqBlastDb=$DIR/../Data/hiv-1.fasta
	+
	# e.g. of paramfile: tab-delimited text file
	# sample1 /path/to/sample1_fwd_reads.fq /path/to/sample1_rev_reads.fq
	# sample2 /path/to/sample2_fwd_reads.fq /path/to/sample2_rev_reads.fq
	# etc.

	+
	sample=`sed -n ${number}p $paramfile \| awk '{print $1}'`
	Reads1=`sed -n ${number}p $paramfile \| awk '{print $2}'`
	Reads2=`sed -n ${number}p $paramfile \| awk '{print $3}'`

	+workDir=$workDir/$sample
	+mkdir -p $workDir

	# Set the working directory to either $TMPDIR on the cluster or $wd on desktop

	##################################
	## For cluster ##
	## i.e. commment out on desktop ##
	- cd $TMPDIR ;
	+#cd $TMPDIR ;
	# # # OR # # # # # # # # # # # ##
	## For desktop ##
	## i.e. comment out on Legion ##
	-# cd /path/to/working_directory #
	+cd $workDir
	##################################



	mkdir -p output/temp;
	#NB IVA fails if /path/to/working_directory/output/assembly exists
	-mkdir logs;
	+mkdir -p logs;

	mkdir -p $ResultsFolder/$sample/output/temp ;
	mkdir -p $ResultsFolder/$sample/output/assembly ;
	@@ -49,13 +74,14 @@
	cd output;

	###TRIM READS###
	-trimmomatic-0.33.jar PE -threads 8 $Reads1 $Reads2 "temp/"$sample"_hq_1.fastq" "temp/"$sample"_unpaired_1.fastq" "temp/"$sample"_hq_2.fastq" "temp/"$sample"_unpaired_2.fastq" ILLUMINACLIP:/path/to/adapters.fasta:2:10:7 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:30 MINLEN:50 ;
	+trimmomatic PE -threads $cores $Reads1 $Reads2 "temp/"$sample"_hq_1.fastq" "temp/"$sample"_unpaired_1.fastq" "temp/"$sample"_hq_2.fastq" "temp/"$sample"_unpaired_2.fastq" ILLUMINACLIP:$adaptersFile:2:10:7 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:30 MINLEN:50 ;
	rm temp/unpaired_.fastq ;
	-cp "temp/"hq.fastq $ResultsFolder/$sample"/output/temp/". ;
	-rm temp/hq_.fastq ;
	+cp "temp/"hq.fastq $ResultsFolder/$sample"/output/temp/". ;
	+# Not sure why this gets removed, since referenced in smalt step next
	+#rm temp/hq_.fastq ;

	###DECOY STEP START###
	-smalt map -x -y 0.5 -i 500 -n 8 /path/to/decoy/hg38_hiv_k15_s3 "temp/"$sample"_hq_1.fastq" "temp/"$sample"_hq_2.fastq" \| awk '{if ($3 !~ /^chr\|\/ && $7 !~ /^chr\|\/) print $0}' > "temp/"$sample"_pairs.sam" ;
	+smalt map -x -y 0.5 -i 500 -n $cores $decoyFile "temp/"$sample"_hq_1.fastq" "temp/"$sample"_hq_2.fastq" \| awk '{if ($3 !~ /^chr\|\/ && $7 !~ /^chr\|\/) print $0}' > "temp/"$sample"_pairs.sam" ;
	samtools view -bhf 64 "temp/"$sample"_pairs.sam" \| samtools bam2fq - > $sample"_filtered_1.fastq" ;
	samtools view -bhf 128 "temp/"$sample"_pairs.sam" \| samtools bam2fq - > $sample"_filtered_2.fastq" ;
	rm temp/*.sam;
	@@ -67,10 +93,10 @@

	###Starting IVA###
	#For now, leave out trimmomatic from IVA - time is of the essence. Can always repeat and see if it makes a difference...
	-iva --max_contigs 50 -t 8 -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly ;
	+iva --max_contigs 50 -t $cores -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly ;

	# OR if you have primers: I didn't for this example
	-# iva --trimmomatic /path/to/trimmomatic-0.33.jar -pcr_primers /path/to/primers.fasta --adapters /path/to/adapters.fasta --max_contigs 50 -t 8 -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly ;
	+# iva --trimmomatic /path/to/trimmomatic-0.33.jar -pcr_primers /path/to/primers.fasta --adapters /path/to/adapters.fasta --max_contigs 50 -t $cores -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly ;
	# # # #


	@@ -79,45 +105,45 @@
	## Copy contigs across to results folder in case job has taken ages - otherwise lose all data when wall clock time is up...###
	cp "assembly/"$sample".contigs.fasta" $ResultsFolder/$sample"/output/assembly"/. ;
	### IVA done - now finding best reference###
	-lastz "assembly/"$sample".contigs.fasta"[multiple] /path/to/reference/sequences/hiv-1.lastz.CDS.fasta --ambiguous=IUPAC --format=GENERAL > "assembly/"$sample".contigs.lastz" ;
	-perl -s /path/to/pipeline/scripts/lastz_bestref.pl -contig_lastz="assembly/"$sample.contigs.lastz -blastdb=/path/to/reference/sequences/BLAST/database/BLAST_viral_DB -best_ref_fasta=$sample"-ref.fasta" -lastz_best_hit_log="../logs/"$sample"_lastz_besthit.log" ;
	+lastz "assembly/"$sample".contigs.fasta"[multiple] $refSeqScaffoldFasta --ambiguous=iupac --format=general > "assembly/"$sample".contigs.lastz" ;
	+perl -s $pipelineScriptDir/lastz_bestref.pl -contig_lastz="assembly/"$sample.contigs.lastz -blastdb=$refSeqBlastDb -best_ref_fasta=$sample"-ref.fasta" -lastz_best_hit_log="../logs/"$sample"_lastz_besthit.log" ;

	### Have best reference, now assemble draft genome ###
	-lastz $sample"-ref.fasta" "assembly/"$sample".contigs.fasta" --ambiguous=IUPAC > "assembly/"$sample".contigs-vs-bestref.lav" ;
	-perl -w -s /path/to/pipeline/scripts/lastz_analyser.WITH_REVCOMP.pl -reference_fasta_file=$sample"-ref.fasta" -sample_fasta_file=assembly/$sample".contigs.fasta" -lastz_results_file="assembly/"$sample".contigs-vs-bestref.lav" -cutoff=50000 -with_revcomp=yes -output=temp/$sample".lastz_analysed_file" -log_file="../logs/"$sample"_lastz_analyser.log" ;
	+lastz $sample"-ref.fasta" "assembly/"$sample".contigs.fasta" --ambiguous=iupac > "assembly/"$sample".contigs-vs-bestref.lav" ;
	+perl -w -s $pipelineScriptDir/lastz_analyser.WITH_REVCOMP.pl -reference_fasta_file=$sample"-ref.fasta" -sample_fasta_file=assembly/$sample".contigs.fasta" -lastz_results_file="assembly/"$sample".contigs-vs-bestref.lav" -cutoff=50000 -with_revcomp=yes -output=temp/$sample".lastz_analysed_file" -log_file="../logs/"$sample"_lastz_analyser.log" ;
	## map reads to contigs to work out which sequences to choose when contigs overlap
	smalt index -k 15 -s 3 "assembly/"$sample".contigs.k15_s3" "assembly/"$sample".contigs.fasta" ;
	-smalt map -x -y 0.5 -i 500 -n 8 -f bam -o "temp/"$sample".contigs.bam" "assembly/"$sample".contigs.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
	-samtools sort -@ 8 -T temp_sort -o "temp/"$sample".contigs.sorted.bam" "temp/"$sample".contigs.bam"
	+smalt map -x -y 0.5 -i 500 -n $cores -f bam -o "temp/"$sample".contigs.bam" "assembly/"$sample".contigs.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
	+samtools sort -@ $cores -T temp_sort -o "temp/"$sample".contigs.sorted.bam" "temp/"$sample".contigs.bam"
	samtools index "temp/"$sample".contigs.sorted.bam"
	samtools mpileup -f "assembly/"$sample".contigs.fasta" -d 1000000 -o "temp/"$sample".contigs.mpileup" "temp/"$sample".contigs.sorted.bam";
	awk {'print $1"\t"$2"\t"$3"\t"$4'} "temp/"$sample".contigs.mpileup" > $sample".contigs.fake_mpileup" ;
	-perl -w -s /path/to/pipeline/scripts/genome_maker2b.pl -sample_pileup_file="temp/"$sample".contigs.mpileup" -contigs="assembly/"$sample".contigs.fasta" -reference_mapped_consensus=$sample"-ref.fasta" -lastz_analysed_file="temp/"$sample".lastz_analysed_file" -ref_correct_start=0 -ref_correct_stop=20000 -output="temp/"$sample"-genome.fasta" -logfile="../logs/"$sample"_genome_maker.log" ;
	+perl -w -s $pipelineScriptDir/genome_maker2b.pl -sample_pileup_file="temp/"$sample".contigs.mpileup" -contigs="assembly/"$sample".contigs.fasta" -reference_mapped_consensus=$sample"-ref.fasta" -lastz_analysed_file="temp/"$sample".lastz_analysed_file" -ref_correct_start=0 -ref_correct_stop=20000 -output="temp/"$sample"-genome.fasta" -logfile="../logs/"$sample"_genome_maker.log" ;
	rm temp/*.bam ;
	rm temp/*.mpileup ;
	##now we've made the draft genome, time to map all the reads onto it and make the first consensus sequence...
	smalt index -k 15 -s 3 "temp/"$sample"-genome.k15_s3" "temp/"$sample"-genome.fasta" ;
	-smalt map -x -y 0.5 -i 500 -n 8 -f bam -o "temp/"$sample"-genome.bam" "temp/"$sample"-genome.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
	-samtools sort -@ 8 -T temp_sort -o "temp/"$sample"-genome.sorted.bam" "temp/"$sample"-genome.bam"
	+smalt map -x -y 0.5 -i 500 -n $cores -f bam -o "temp/"$sample"-genome.bam" "temp/"$sample"-genome.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
	+samtools sort -@ $cores -T temp_sort -o "temp/"$sample"-genome.sorted.bam" "temp/"$sample"-genome.bam"
	samtools index "temp/"$sample"-genome.sorted.bam"
	samtools mpileup -f "temp/"$sample"-genome.fasta" -d 1000000 -o "temp/"$sample"-genome.mpileup" "temp/"$sample"-genome.sorted.bam";
	awk {'print $1"\t"$2"\t"$3"\t"$4'} "temp/"$sample"-genome.mpileup" > "temp/"$sample"-genome.fake_mpileup" ;
	rm temp/*.bam;
	-perl -w -s /path/to/pipeline/scripts/cons_mv.pl -mpileup="temp/"$sample"-genome.mpileup" -reference_fasta="temp/"$sample"-genome.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus1.preNcut.fasta" -mv_out="temp/"$sample"-genome.fasta.mv" -base_freq_out="temp/"$sample"-genome.fasta.basefreqs.tsv" ;
	-perl -w -s /path/to/pipeline/scripts/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus1.preNcut.fasta" > "temp/"$sample".consensus1.fasta";
	+perl -w -s $pipelineScriptDir/cons_mv.pl -mpileup="temp/"$sample"-genome.mpileup" -reference_fasta="temp/"$sample"-genome.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus1.preNcut.fasta" -mv_out="temp/"$sample"-genome.fasta.mv" -base_freq_out="temp/"$sample"-genome.fasta.basefreqs.tsv" ;
	+perl -w -s $pipelineScriptDir/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus1.preNcut.fasta" > "temp/"$sample".consensus1.fasta";
	rm temp/*.mpileup;
	##experience shows that one round of iteration is rarely enough so do one more...
	smalt index -k 15 -s 3 "temp/"$sample".consensus1.k15_s3" "temp/"$sample".consensus1.fasta" ;
	-smalt map -x -y 0.5 -i 500 -n 8 -f bam -o "temp/"$sample".consensus1.bam" "temp/"$sample".consensus1.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
	-samtools sort -@ 8 -T temp_sort -o "temp/"$sample".consensus1.sorted.bam" "temp/"$sample".consensus1.bam" ;
	+smalt map -x -y 0.5 -i 500 -n $cores -f bam -o "temp/"$sample".consensus1.bam" "temp/"$sample".consensus1.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
	+samtools sort -@ $cores -T temp_sort -o "temp/"$sample".consensus1.sorted.bam" "temp/"$sample".consensus1.bam" ;
	samtools index "temp/"$sample".consensus1.sorted.bam" ;
	samtools mpileup -f "temp/"$sample".consensus1.fasta" -d 1000000 -o "temp/"$sample".consensus1.mpileup" "temp/"$sample".consensus1.sorted.bam";
	awk {'print $1"\t"$2"\t"$3"\t"$4'} "temp/"$sample".consensus1.mpileup" > "temp/"$sample".consensus1.fake_mpileup" ;
	rm temp/*.bam;
	-perl -w -s /path/to/pipeline/scripts/cons_mv.pl -mpileup="temp/"$sample".consensus1.mpileup" -reference_fasta="temp/"$sample".consensus1.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus2.preNcut.fasta" -mv_out="temp/"$sample".consensus1.fasta.mv" -base_freq_out="temp/"$sample".consensus1.fasta.basefreqs.tsv" ;
	-perl -w -s /path/to/pipeline/scripts/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus2.preNcut.fasta" > "temp/"$sample".consensus2.fasta";
	+perl -w -s $pipelineScriptDir/cons_mv.pl -mpileup="temp/"$sample".consensus1.mpileup" -reference_fasta="temp/"$sample".consensus1.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus2.preNcut.fasta" -mv_out="temp/"$sample".consensus1.fasta.mv" -base_freq_out="temp/"$sample".consensus1.fasta.basefreqs.tsv" ;
	+perl -w -s $pipelineScriptDir/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus2.preNcut.fasta" > "temp/"$sample".consensus2.fasta";
	rm temp/*.mpileup;
	-perl -w -s /path/to/pipeline/scripts/majvarcheck2.pl -mvpath="temp/"$sample".consensus1.fasta.mv" -basefreq="temp/"$sample".consensus1.fasta.basefreqs.tsv" -fwdreads=$sample"_filtered_1.fastq" -revreads=$sample"_filtered_2.fastq";
	+perl -w -s $pipelineScriptDir/majvarcheck2.pl -mvpath="temp/"$sample".consensus1.fasta.mv" -basefreq="temp/"$sample".consensus1.fasta.basefreqs.tsv" -fwdreads=$sample"_filtered_1.fastq" -revreads=$sample"_filtered_2.fastq";

	### All done - now copy results from working directory to results directory
	### This is essential when running a job on the cluster, otherwise can do at your
	--- majvarcheck2.pl.orig 2016-12-02 15:10:03.000000000 -0500
	+++ majvarcheck2.pl 2017-06-22 13:18:44.720066194 -0400
	@@ -75,7 +75,7 @@
	system("smalt index -k 15 -s 3 $smalt_index $consensus_fasta");
	system("smalt map -x -y 0.5 -i 500 -n 8 -o $sam $smalt_index $fwdreads $revreads");
	system("samtools view -@ 8 -bh -S $sam > $bam");
	- system("samtools sort -@ 8 $bam $sorted");
	+ system("samtools sort -@ 8 -T $bam.tmpsort -o $sorted_bam $bam");
	system("samtools faidx $consensus_fasta");
	system("samtools mpileup -f $consensus_fasta $sorted_bam -d 1000000 > $pileup");
	$fake_mpileup = $pileup;