Skip to content

Instantly share code, notes, and snippets.

@chapmanb
Created August 15, 2017 14:51
Show Gist options
  • Save chapmanb/e42e3e3a793f320ddd7c58260aea6d95 to your computer and use it in GitHub Desktop.
Save chapmanb/e42e3e3a793f320ddd7c58260aea6d95 to your computer and use it in GitHub Desktop.
iconic_ucl_pipeline_changes.diff
--- command_line_HIV_pipeline.sh.orig 2016-12-02 15:10:02.000000000 -0500
+++ command_line_HIV_pipeline.sh 2017-02-01 22:01:49.000000000 -0500
@@ -8,40 +8,65 @@
#$ -wd /path/to/working_directory
#$ -t 1-400
+set -eu -o pipefail
+
+# Find original directory of bash script, resovling symlinks
+# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in/246128#246128
+SOURCE="${BASH_SOURCE[0]}"
+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
+ DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+ SOURCE="$(readlink "$SOURCE")"
+ [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located
+done
+DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
+pipelineScriptDir=$DIR
# For a full run you only need to specify the results folder and in a tab-delimited file
# a list of sample IDs and the path to their forward and reverse reads
# (these can be gzipped if necessary).
-ResultsFolder=/path/to/results_folder ;
-paramfile=/path/to/working_directory/tab_delimited_table_of_sampleIDs_and_readpaths
+number=${1:-1}
+cores=8
+workDir=`pwd`/work
+ResultsFolder=`pwd`/results
+paramfile=samples.tsv
+adaptersFile=$DIR/../Data/adapters.fa
+decoyFile=$DIR/../Data/hg38_hiv
+refSeqScaffoldFasta=$DIR/../Data/cons_C.lastz.CWG.fasta
+refSeqBlastDb=$DIR/../Data/cons_C.fasta
+#refSeqScaffoldFasta=$DIR/../Data/hiv-1.lastz.CWG.fasta
+#refSeqBlastDb=$DIR/../Data/hiv-1.fasta
+
# e.g. of paramfile: tab-delimited text file
# sample1 /path/to/sample1_fwd_reads.fq /path/to/sample1_rev_reads.fq
# sample2 /path/to/sample2_fwd_reads.fq /path/to/sample2_rev_reads.fq
# etc.
+
sample=`sed -n ${number}p $paramfile | awk '{print $1}'`
Reads1=`sed -n ${number}p $paramfile | awk '{print $2}'`
Reads2=`sed -n ${number}p $paramfile | awk '{print $3}'`
+workDir=$workDir/$sample
+mkdir -p $workDir
# Set the working directory to either $TMPDIR on the cluster or $wd on desktop
##################################
## For cluster ##
## i.e. commment out on desktop ##
- cd $TMPDIR ;
+#cd $TMPDIR ;
# # # OR # # # # # # # # # # # ##
## For desktop ##
## i.e. comment out on Legion ##
-# cd /path/to/working_directory #
+cd $workDir
##################################
mkdir -p output/temp;
#NB IVA fails if /path/to/working_directory/output/assembly exists
-mkdir logs;
+mkdir -p logs;
mkdir -p $ResultsFolder/$sample/output/temp ;
mkdir -p $ResultsFolder/$sample/output/assembly ;
@@ -49,13 +74,14 @@
cd output;
###TRIM READS###
-trimmomatic-0.33.jar PE -threads 8 $Reads1 $Reads2 "temp/"$sample"_hq_1.fastq" "temp/"$sample"_unpaired_1.fastq" "temp/"$sample"_hq_2.fastq" "temp/"$sample"_unpaired_2.fastq" ILLUMINACLIP:/path/to/adapters.fasta:2:10:7 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:30 MINLEN:50 ;
+trimmomatic PE -threads $cores $Reads1 $Reads2 "temp/"$sample"_hq_1.fastq" "temp/"$sample"_unpaired_1.fastq" "temp/"$sample"_hq_2.fastq" "temp/"$sample"_unpaired_2.fastq" ILLUMINACLIP:$adaptersFile:2:10:7 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:30 MINLEN:50 ;
rm temp/*unpaired_*.fastq ;
-cp "temp/"*hq*.fastq $ResultsFolder/$sample"/output/temp/". ;
-rm temp/*hq_*.fastq ;
+cp "temp/"*hq*.fastq $ResultsFolder/$sample"/output/temp/". ;
+# Not sure why this gets removed, since referenced in smalt step next
+#rm temp/*hq_*.fastq ;
###DECOY STEP START###
-smalt map -x -y 0.5 -i 500 -n 8 /path/to/decoy/hg38_hiv_k15_s3 "temp/"$sample"_hq_1.fastq" "temp/"$sample"_hq_2.fastq" | awk '{if ($3 !~ /^chr|\*/ && $7 !~ /^chr|\*/) print $0}' > "temp/"$sample"_pairs.sam" ;
+smalt map -x -y 0.5 -i 500 -n $cores $decoyFile "temp/"$sample"_hq_1.fastq" "temp/"$sample"_hq_2.fastq" | awk '{if ($3 !~ /^chr|\*/ && $7 !~ /^chr|\*/) print $0}' > "temp/"$sample"_pairs.sam" ;
samtools view -bhf 64 "temp/"$sample"_pairs.sam" | samtools bam2fq - > $sample"_filtered_1.fastq" ;
samtools view -bhf 128 "temp/"$sample"_pairs.sam" | samtools bam2fq - > $sample"_filtered_2.fastq" ;
rm temp/*.sam;
@@ -67,10 +93,10 @@
###Starting IVA###
#For now, leave out trimmomatic from IVA - time is of the essence. Can always repeat and see if it makes a difference...
-iva --max_contigs 50 -t 8 -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly ;
+iva --max_contigs 50 -t $cores -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly ;
# OR if you have primers: I didn't for this example
-# iva --trimmomatic /path/to/trimmomatic-0.33.jar -pcr_primers /path/to/primers.fasta --adapters /path/to/adapters.fasta --max_contigs 50 -t 8 -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly ;
+# iva --trimmomatic /path/to/trimmomatic-0.33.jar -pcr_primers /path/to/primers.fasta --adapters /path/to/adapters.fasta --max_contigs 50 -t $cores -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly ;
# # # #
@@ -79,45 +105,45 @@
## Copy contigs across to results folder in case job has taken ages - otherwise lose all data when wall clock time is up...###
cp "assembly/"$sample".contigs.fasta" $ResultsFolder/$sample"/output/assembly"/. ;
### IVA done - now finding best reference###
-lastz "assembly/"$sample".contigs.fasta"[multiple] /path/to/reference/sequences/hiv-1.lastz.CDS.fasta --ambiguous=IUPAC --format=GENERAL > "assembly/"$sample".contigs.lastz" ;
-perl -s /path/to/pipeline/scripts/lastz_bestref.pl -contig_lastz="assembly/"$sample.contigs.lastz -blastdb=/path/to/reference/sequences/BLAST/database/BLAST_viral_DB -best_ref_fasta=$sample"-ref.fasta" -lastz_best_hit_log="../logs/"$sample"_lastz_besthit.log" ;
+lastz "assembly/"$sample".contigs.fasta"[multiple] $refSeqScaffoldFasta --ambiguous=iupac --format=general > "assembly/"$sample".contigs.lastz" ;
+perl -s $pipelineScriptDir/lastz_bestref.pl -contig_lastz="assembly/"$sample.contigs.lastz -blastdb=$refSeqBlastDb -best_ref_fasta=$sample"-ref.fasta" -lastz_best_hit_log="../logs/"$sample"_lastz_besthit.log" ;
### Have best reference, now assemble draft genome ###
-lastz $sample"-ref.fasta" "assembly/"$sample".contigs.fasta" --ambiguous=IUPAC > "assembly/"$sample".contigs-vs-bestref.lav" ;
-perl -w -s /path/to/pipeline/scripts/lastz_analyser.WITH_REVCOMP.pl -reference_fasta_file=$sample"-ref.fasta" -sample_fasta_file=assembly/$sample".contigs.fasta" -lastz_results_file="assembly/"$sample".contigs-vs-bestref.lav" -cutoff=50000 -with_revcomp=yes -output=temp/$sample".lastz_analysed_file" -log_file="../logs/"$sample"_lastz_analyser.log" ;
+lastz $sample"-ref.fasta" "assembly/"$sample".contigs.fasta" --ambiguous=iupac > "assembly/"$sample".contigs-vs-bestref.lav" ;
+perl -w -s $pipelineScriptDir/lastz_analyser.WITH_REVCOMP.pl -reference_fasta_file=$sample"-ref.fasta" -sample_fasta_file=assembly/$sample".contigs.fasta" -lastz_results_file="assembly/"$sample".contigs-vs-bestref.lav" -cutoff=50000 -with_revcomp=yes -output=temp/$sample".lastz_analysed_file" -log_file="../logs/"$sample"_lastz_analyser.log" ;
## map reads to contigs to work out which sequences to choose when contigs overlap
smalt index -k 15 -s 3 "assembly/"$sample".contigs.k15_s3" "assembly/"$sample".contigs.fasta" ;
-smalt map -x -y 0.5 -i 500 -n 8 -f bam -o "temp/"$sample".contigs.bam" "assembly/"$sample".contigs.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
-samtools sort -@ 8 -T temp_sort -o "temp/"$sample".contigs.sorted.bam" "temp/"$sample".contigs.bam"
+smalt map -x -y 0.5 -i 500 -n $cores -f bam -o "temp/"$sample".contigs.bam" "assembly/"$sample".contigs.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
+samtools sort -@ $cores -T temp_sort -o "temp/"$sample".contigs.sorted.bam" "temp/"$sample".contigs.bam"
samtools index "temp/"$sample".contigs.sorted.bam"
samtools mpileup -f "assembly/"$sample".contigs.fasta" -d 1000000 -o "temp/"$sample".contigs.mpileup" "temp/"$sample".contigs.sorted.bam";
awk {'print $1"\t"$2"\t"$3"\t"$4'} "temp/"$sample".contigs.mpileup" > $sample".contigs.fake_mpileup" ;
-perl -w -s /path/to/pipeline/scripts/genome_maker2b.pl -sample_pileup_file="temp/"$sample".contigs.mpileup" -contigs="assembly/"$sample".contigs.fasta" -reference_mapped_consensus=$sample"-ref.fasta" -lastz_analysed_file="temp/"$sample".lastz_analysed_file" -ref_correct_start=0 -ref_correct_stop=20000 -output="temp/"$sample"-genome.fasta" -logfile="../logs/"$sample"_genome_maker.log" ;
+perl -w -s $pipelineScriptDir/genome_maker2b.pl -sample_pileup_file="temp/"$sample".contigs.mpileup" -contigs="assembly/"$sample".contigs.fasta" -reference_mapped_consensus=$sample"-ref.fasta" -lastz_analysed_file="temp/"$sample".lastz_analysed_file" -ref_correct_start=0 -ref_correct_stop=20000 -output="temp/"$sample"-genome.fasta" -logfile="../logs/"$sample"_genome_maker.log" ;
rm temp/*.bam ;
rm temp/*.mpileup ;
##now we've made the draft genome, time to map all the reads onto it and make the first consensus sequence...
smalt index -k 15 -s 3 "temp/"$sample"-genome.k15_s3" "temp/"$sample"-genome.fasta" ;
-smalt map -x -y 0.5 -i 500 -n 8 -f bam -o "temp/"$sample"-genome.bam" "temp/"$sample"-genome.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
-samtools sort -@ 8 -T temp_sort -o "temp/"$sample"-genome.sorted.bam" "temp/"$sample"-genome.bam"
+smalt map -x -y 0.5 -i 500 -n $cores -f bam -o "temp/"$sample"-genome.bam" "temp/"$sample"-genome.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
+samtools sort -@ $cores -T temp_sort -o "temp/"$sample"-genome.sorted.bam" "temp/"$sample"-genome.bam"
samtools index "temp/"$sample"-genome.sorted.bam"
samtools mpileup -f "temp/"$sample"-genome.fasta" -d 1000000 -o "temp/"$sample"-genome.mpileup" "temp/"$sample"-genome.sorted.bam";
awk {'print $1"\t"$2"\t"$3"\t"$4'} "temp/"$sample"-genome.mpileup" > "temp/"$sample"-genome.fake_mpileup" ;
rm temp/*.bam;
-perl -w -s /path/to/pipeline/scripts/cons_mv.pl -mpileup="temp/"$sample"-genome.mpileup" -reference_fasta="temp/"$sample"-genome.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus1.preNcut.fasta" -mv_out="temp/"$sample"-genome.fasta.mv" -base_freq_out="temp/"$sample"-genome.fasta.basefreqs.tsv" ;
-perl -w -s /path/to/pipeline/scripts/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus1.preNcut.fasta" > "temp/"$sample".consensus1.fasta";
+perl -w -s $pipelineScriptDir/cons_mv.pl -mpileup="temp/"$sample"-genome.mpileup" -reference_fasta="temp/"$sample"-genome.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus1.preNcut.fasta" -mv_out="temp/"$sample"-genome.fasta.mv" -base_freq_out="temp/"$sample"-genome.fasta.basefreqs.tsv" ;
+perl -w -s $pipelineScriptDir/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus1.preNcut.fasta" > "temp/"$sample".consensus1.fasta";
rm temp/*.mpileup;
##experience shows that one round of iteration is rarely enough so do one more...
smalt index -k 15 -s 3 "temp/"$sample".consensus1.k15_s3" "temp/"$sample".consensus1.fasta" ;
-smalt map -x -y 0.5 -i 500 -n 8 -f bam -o "temp/"$sample".consensus1.bam" "temp/"$sample".consensus1.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
-samtools sort -@ 8 -T temp_sort -o "temp/"$sample".consensus1.sorted.bam" "temp/"$sample".consensus1.bam" ;
+smalt map -x -y 0.5 -i 500 -n $cores -f bam -o "temp/"$sample".consensus1.bam" "temp/"$sample".consensus1.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ;
+samtools sort -@ $cores -T temp_sort -o "temp/"$sample".consensus1.sorted.bam" "temp/"$sample".consensus1.bam" ;
samtools index "temp/"$sample".consensus1.sorted.bam" ;
samtools mpileup -f "temp/"$sample".consensus1.fasta" -d 1000000 -o "temp/"$sample".consensus1.mpileup" "temp/"$sample".consensus1.sorted.bam";
awk {'print $1"\t"$2"\t"$3"\t"$4'} "temp/"$sample".consensus1.mpileup" > "temp/"$sample".consensus1.fake_mpileup" ;
rm temp/*.bam;
-perl -w -s /path/to/pipeline/scripts/cons_mv.pl -mpileup="temp/"$sample".consensus1.mpileup" -reference_fasta="temp/"$sample".consensus1.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus2.preNcut.fasta" -mv_out="temp/"$sample".consensus1.fasta.mv" -base_freq_out="temp/"$sample".consensus1.fasta.basefreqs.tsv" ;
-perl -w -s /path/to/pipeline/scripts/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus2.preNcut.fasta" > "temp/"$sample".consensus2.fasta";
+perl -w -s $pipelineScriptDir/cons_mv.pl -mpileup="temp/"$sample".consensus1.mpileup" -reference_fasta="temp/"$sample".consensus1.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus2.preNcut.fasta" -mv_out="temp/"$sample".consensus1.fasta.mv" -base_freq_out="temp/"$sample".consensus1.fasta.basefreqs.tsv" ;
+perl -w -s $pipelineScriptDir/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus2.preNcut.fasta" > "temp/"$sample".consensus2.fasta";
rm temp/*.mpileup;
-perl -w -s /path/to/pipeline/scripts/majvarcheck2.pl -mvpath="temp/"$sample".consensus1.fasta.mv" -basefreq="temp/"$sample".consensus1.fasta.basefreqs.tsv" -fwdreads=$sample"_filtered_1.fastq" -revreads=$sample"_filtered_2.fastq";
+perl -w -s $pipelineScriptDir/majvarcheck2.pl -mvpath="temp/"$sample".consensus1.fasta.mv" -basefreq="temp/"$sample".consensus1.fasta.basefreqs.tsv" -fwdreads=$sample"_filtered_1.fastq" -revreads=$sample"_filtered_2.fastq";
### All done - now copy results from working directory to results directory
### This is essential when running a job on the cluster, otherwise can do at your
--- majvarcheck2.pl.orig 2016-12-02 15:10:03.000000000 -0500
+++ majvarcheck2.pl 2017-06-22 13:18:44.720066194 -0400
@@ -75,7 +75,7 @@
system("smalt index -k 15 -s 3 $smalt_index $consensus_fasta");
system("smalt map -x -y 0.5 -i 500 -n 8 -o $sam $smalt_index $fwdreads $revreads");
system("samtools view -@ 8 -bh -S $sam > $bam");
- system("samtools sort -@ 8 $bam $sorted");
+ system("samtools sort -@ 8 -T $bam.tmpsort -o $sorted_bam $bam");
system("samtools faidx $consensus_fasta");
system("samtools mpileup -f $consensus_fasta $sorted_bam -d 1000000 > $pileup");
$fake_mpileup = $pileup;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment