Created
August 15, 2017 14:51
-
-
Save chapmanb/e42e3e3a793f320ddd7c58260aea6d95 to your computer and use it in GitHub Desktop.
iconic_ucl_pipeline_changes.diff
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- command_line_HIV_pipeline.sh.orig 2016-12-02 15:10:02.000000000 -0500 | |
+++ command_line_HIV_pipeline.sh 2017-02-01 22:01:49.000000000 -0500 | |
@@ -8,40 +8,65 @@ | |
#$ -wd /path/to/working_directory | |
#$ -t 1-400 | |
+set -eu -o pipefail | |
+ | |
+# Find original directory of bash script, resovling symlinks | |
+# http://stackoverflow.com/questions/59895/can-a-bash-script-tell-what-directory-its-stored-in/246128#246128 | |
+SOURCE="${BASH_SOURCE[0]}" | |
+while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink | |
+ DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" | |
+ SOURCE="$(readlink "$SOURCE")" | |
+ [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located | |
+done | |
+DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )" | |
+pipelineScriptDir=$DIR | |
# For a full run you only need to specify the results folder and in a tab-delimited file | |
# a list of sample IDs and the path to their forward and reverse reads | |
# (these can be gzipped if necessary). | |
-ResultsFolder=/path/to/results_folder ; | |
-paramfile=/path/to/working_directory/tab_delimited_table_of_sampleIDs_and_readpaths | |
+number=${1:-1} | |
+cores=8 | |
+workDir=`pwd`/work | |
+ResultsFolder=`pwd`/results | |
+paramfile=samples.tsv | |
+adaptersFile=$DIR/../Data/adapters.fa | |
+decoyFile=$DIR/../Data/hg38_hiv | |
+refSeqScaffoldFasta=$DIR/../Data/cons_C.lastz.CWG.fasta | |
+refSeqBlastDb=$DIR/../Data/cons_C.fasta | |
+#refSeqScaffoldFasta=$DIR/../Data/hiv-1.lastz.CWG.fasta | |
+#refSeqBlastDb=$DIR/../Data/hiv-1.fasta | |
+ | |
# e.g. of paramfile: tab-delimited text file | |
# sample1 /path/to/sample1_fwd_reads.fq /path/to/sample1_rev_reads.fq | |
# sample2 /path/to/sample2_fwd_reads.fq /path/to/sample2_rev_reads.fq | |
# etc. | |
+ | |
sample=`sed -n ${number}p $paramfile | awk '{print $1}'` | |
Reads1=`sed -n ${number}p $paramfile | awk '{print $2}'` | |
Reads2=`sed -n ${number}p $paramfile | awk '{print $3}'` | |
+workDir=$workDir/$sample | |
+mkdir -p $workDir | |
# Set the working directory to either $TMPDIR on the cluster or $wd on desktop | |
################################## | |
## For cluster ## | |
## i.e. commment out on desktop ## | |
- cd $TMPDIR ; | |
+#cd $TMPDIR ; | |
# # # OR # # # # # # # # # # # ## | |
## For desktop ## | |
## i.e. comment out on Legion ## | |
-# cd /path/to/working_directory # | |
+cd $workDir | |
################################## | |
mkdir -p output/temp; | |
#NB IVA fails if /path/to/working_directory/output/assembly exists | |
-mkdir logs; | |
+mkdir -p logs; | |
mkdir -p $ResultsFolder/$sample/output/temp ; | |
mkdir -p $ResultsFolder/$sample/output/assembly ; | |
@@ -49,13 +74,14 @@ | |
cd output; | |
###TRIM READS### | |
-trimmomatic-0.33.jar PE -threads 8 $Reads1 $Reads2 "temp/"$sample"_hq_1.fastq" "temp/"$sample"_unpaired_1.fastq" "temp/"$sample"_hq_2.fastq" "temp/"$sample"_unpaired_2.fastq" ILLUMINACLIP:/path/to/adapters.fasta:2:10:7 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:30 MINLEN:50 ; | |
+trimmomatic PE -threads $cores $Reads1 $Reads2 "temp/"$sample"_hq_1.fastq" "temp/"$sample"_unpaired_1.fastq" "temp/"$sample"_hq_2.fastq" "temp/"$sample"_unpaired_2.fastq" ILLUMINACLIP:$adaptersFile:2:10:7 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:30 MINLEN:50 ; | |
rm temp/*unpaired_*.fastq ; | |
-cp "temp/"*hq*.fastq $ResultsFolder/$sample"/output/temp/". ; | |
-rm temp/*hq_*.fastq ; | |
+cp "temp/"*hq*.fastq $ResultsFolder/$sample"/output/temp/". ; | |
+# Not sure why this gets removed, since referenced in smalt step next | |
+#rm temp/*hq_*.fastq ; | |
###DECOY STEP START### | |
-smalt map -x -y 0.5 -i 500 -n 8 /path/to/decoy/hg38_hiv_k15_s3 "temp/"$sample"_hq_1.fastq" "temp/"$sample"_hq_2.fastq" | awk '{if ($3 !~ /^chr|\*/ && $7 !~ /^chr|\*/) print $0}' > "temp/"$sample"_pairs.sam" ; | |
+smalt map -x -y 0.5 -i 500 -n $cores $decoyFile "temp/"$sample"_hq_1.fastq" "temp/"$sample"_hq_2.fastq" | awk '{if ($3 !~ /^chr|\*/ && $7 !~ /^chr|\*/) print $0}' > "temp/"$sample"_pairs.sam" ; | |
samtools view -bhf 64 "temp/"$sample"_pairs.sam" | samtools bam2fq - > $sample"_filtered_1.fastq" ; | |
samtools view -bhf 128 "temp/"$sample"_pairs.sam" | samtools bam2fq - > $sample"_filtered_2.fastq" ; | |
rm temp/*.sam; | |
@@ -67,10 +93,10 @@ | |
###Starting IVA### | |
#For now, leave out trimmomatic from IVA - time is of the essence. Can always repeat and see if it makes a difference... | |
-iva --max_contigs 50 -t 8 -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly ; | |
+iva --max_contigs 50 -t $cores -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly ; | |
# OR if you have primers: I didn't for this example | |
-# iva --trimmomatic /path/to/trimmomatic-0.33.jar -pcr_primers /path/to/primers.fasta --adapters /path/to/adapters.fasta --max_contigs 50 -t 8 -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly ; | |
+# iva --trimmomatic /path/to/trimmomatic-0.33.jar -pcr_primers /path/to/primers.fasta --adapters /path/to/adapters.fasta --max_contigs 50 -t $cores -f $sample"_filtered_1.fastq" -r $sample"_filtered_2.fastq" assembly ; | |
# # # # | |
@@ -79,45 +105,45 @@ | |
## Copy contigs across to results folder in case job has taken ages - otherwise lose all data when wall clock time is up...### | |
cp "assembly/"$sample".contigs.fasta" $ResultsFolder/$sample"/output/assembly"/. ; | |
### IVA done - now finding best reference### | |
-lastz "assembly/"$sample".contigs.fasta"[multiple] /path/to/reference/sequences/hiv-1.lastz.CDS.fasta --ambiguous=IUPAC --format=GENERAL > "assembly/"$sample".contigs.lastz" ; | |
-perl -s /path/to/pipeline/scripts/lastz_bestref.pl -contig_lastz="assembly/"$sample.contigs.lastz -blastdb=/path/to/reference/sequences/BLAST/database/BLAST_viral_DB -best_ref_fasta=$sample"-ref.fasta" -lastz_best_hit_log="../logs/"$sample"_lastz_besthit.log" ; | |
+lastz "assembly/"$sample".contigs.fasta"[multiple] $refSeqScaffoldFasta --ambiguous=iupac --format=general > "assembly/"$sample".contigs.lastz" ; | |
+perl -s $pipelineScriptDir/lastz_bestref.pl -contig_lastz="assembly/"$sample.contigs.lastz -blastdb=$refSeqBlastDb -best_ref_fasta=$sample"-ref.fasta" -lastz_best_hit_log="../logs/"$sample"_lastz_besthit.log" ; | |
### Have best reference, now assemble draft genome ### | |
-lastz $sample"-ref.fasta" "assembly/"$sample".contigs.fasta" --ambiguous=IUPAC > "assembly/"$sample".contigs-vs-bestref.lav" ; | |
-perl -w -s /path/to/pipeline/scripts/lastz_analyser.WITH_REVCOMP.pl -reference_fasta_file=$sample"-ref.fasta" -sample_fasta_file=assembly/$sample".contigs.fasta" -lastz_results_file="assembly/"$sample".contigs-vs-bestref.lav" -cutoff=50000 -with_revcomp=yes -output=temp/$sample".lastz_analysed_file" -log_file="../logs/"$sample"_lastz_analyser.log" ; | |
+lastz $sample"-ref.fasta" "assembly/"$sample".contigs.fasta" --ambiguous=iupac > "assembly/"$sample".contigs-vs-bestref.lav" ; | |
+perl -w -s $pipelineScriptDir/lastz_analyser.WITH_REVCOMP.pl -reference_fasta_file=$sample"-ref.fasta" -sample_fasta_file=assembly/$sample".contigs.fasta" -lastz_results_file="assembly/"$sample".contigs-vs-bestref.lav" -cutoff=50000 -with_revcomp=yes -output=temp/$sample".lastz_analysed_file" -log_file="../logs/"$sample"_lastz_analyser.log" ; | |
## map reads to contigs to work out which sequences to choose when contigs overlap | |
smalt index -k 15 -s 3 "assembly/"$sample".contigs.k15_s3" "assembly/"$sample".contigs.fasta" ; | |
-smalt map -x -y 0.5 -i 500 -n 8 -f bam -o "temp/"$sample".contigs.bam" "assembly/"$sample".contigs.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ; | |
-samtools sort -@ 8 -T temp_sort -o "temp/"$sample".contigs.sorted.bam" "temp/"$sample".contigs.bam" | |
+smalt map -x -y 0.5 -i 500 -n $cores -f bam -o "temp/"$sample".contigs.bam" "assembly/"$sample".contigs.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ; | |
+samtools sort -@ $cores -T temp_sort -o "temp/"$sample".contigs.sorted.bam" "temp/"$sample".contigs.bam" | |
samtools index "temp/"$sample".contigs.sorted.bam" | |
samtools mpileup -f "assembly/"$sample".contigs.fasta" -d 1000000 -o "temp/"$sample".contigs.mpileup" "temp/"$sample".contigs.sorted.bam"; | |
awk {'print $1"\t"$2"\t"$3"\t"$4'} "temp/"$sample".contigs.mpileup" > $sample".contigs.fake_mpileup" ; | |
-perl -w -s /path/to/pipeline/scripts/genome_maker2b.pl -sample_pileup_file="temp/"$sample".contigs.mpileup" -contigs="assembly/"$sample".contigs.fasta" -reference_mapped_consensus=$sample"-ref.fasta" -lastz_analysed_file="temp/"$sample".lastz_analysed_file" -ref_correct_start=0 -ref_correct_stop=20000 -output="temp/"$sample"-genome.fasta" -logfile="../logs/"$sample"_genome_maker.log" ; | |
+perl -w -s $pipelineScriptDir/genome_maker2b.pl -sample_pileup_file="temp/"$sample".contigs.mpileup" -contigs="assembly/"$sample".contigs.fasta" -reference_mapped_consensus=$sample"-ref.fasta" -lastz_analysed_file="temp/"$sample".lastz_analysed_file" -ref_correct_start=0 -ref_correct_stop=20000 -output="temp/"$sample"-genome.fasta" -logfile="../logs/"$sample"_genome_maker.log" ; | |
rm temp/*.bam ; | |
rm temp/*.mpileup ; | |
##now we've made the draft genome, time to map all the reads onto it and make the first consensus sequence... | |
smalt index -k 15 -s 3 "temp/"$sample"-genome.k15_s3" "temp/"$sample"-genome.fasta" ; | |
-smalt map -x -y 0.5 -i 500 -n 8 -f bam -o "temp/"$sample"-genome.bam" "temp/"$sample"-genome.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ; | |
-samtools sort -@ 8 -T temp_sort -o "temp/"$sample"-genome.sorted.bam" "temp/"$sample"-genome.bam" | |
+smalt map -x -y 0.5 -i 500 -n $cores -f bam -o "temp/"$sample"-genome.bam" "temp/"$sample"-genome.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ; | |
+samtools sort -@ $cores -T temp_sort -o "temp/"$sample"-genome.sorted.bam" "temp/"$sample"-genome.bam" | |
samtools index "temp/"$sample"-genome.sorted.bam" | |
samtools mpileup -f "temp/"$sample"-genome.fasta" -d 1000000 -o "temp/"$sample"-genome.mpileup" "temp/"$sample"-genome.sorted.bam"; | |
awk {'print $1"\t"$2"\t"$3"\t"$4'} "temp/"$sample"-genome.mpileup" > "temp/"$sample"-genome.fake_mpileup" ; | |
rm temp/*.bam; | |
-perl -w -s /path/to/pipeline/scripts/cons_mv.pl -mpileup="temp/"$sample"-genome.mpileup" -reference_fasta="temp/"$sample"-genome.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus1.preNcut.fasta" -mv_out="temp/"$sample"-genome.fasta.mv" -base_freq_out="temp/"$sample"-genome.fasta.basefreqs.tsv" ; | |
-perl -w -s /path/to/pipeline/scripts/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus1.preNcut.fasta" > "temp/"$sample".consensus1.fasta"; | |
+perl -w -s $pipelineScriptDir/cons_mv.pl -mpileup="temp/"$sample"-genome.mpileup" -reference_fasta="temp/"$sample"-genome.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus1.preNcut.fasta" -mv_out="temp/"$sample"-genome.fasta.mv" -base_freq_out="temp/"$sample"-genome.fasta.basefreqs.tsv" ; | |
+perl -w -s $pipelineScriptDir/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus1.preNcut.fasta" > "temp/"$sample".consensus1.fasta"; | |
rm temp/*.mpileup; | |
##experience shows that one round of iteration is rarely enough so do one more... | |
smalt index -k 15 -s 3 "temp/"$sample".consensus1.k15_s3" "temp/"$sample".consensus1.fasta" ; | |
-smalt map -x -y 0.5 -i 500 -n 8 -f bam -o "temp/"$sample".consensus1.bam" "temp/"$sample".consensus1.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ; | |
-samtools sort -@ 8 -T temp_sort -o "temp/"$sample".consensus1.sorted.bam" "temp/"$sample".consensus1.bam" ; | |
+smalt map -x -y 0.5 -i 500 -n $cores -f bam -o "temp/"$sample".consensus1.bam" "temp/"$sample".consensus1.k15_s3" $sample"_filtered_1.fastq" $sample"_filtered_2.fastq" ; | |
+samtools sort -@ $cores -T temp_sort -o "temp/"$sample".consensus1.sorted.bam" "temp/"$sample".consensus1.bam" ; | |
samtools index "temp/"$sample".consensus1.sorted.bam" ; | |
samtools mpileup -f "temp/"$sample".consensus1.fasta" -d 1000000 -o "temp/"$sample".consensus1.mpileup" "temp/"$sample".consensus1.sorted.bam"; | |
awk {'print $1"\t"$2"\t"$3"\t"$4'} "temp/"$sample".consensus1.mpileup" > "temp/"$sample".consensus1.fake_mpileup" ; | |
rm temp/*.bam; | |
-perl -w -s /path/to/pipeline/scripts/cons_mv.pl -mpileup="temp/"$sample".consensus1.mpileup" -reference_fasta="temp/"$sample".consensus1.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus2.preNcut.fasta" -mv_out="temp/"$sample".consensus1.fasta.mv" -base_freq_out="temp/"$sample".consensus1.fasta.basefreqs.tsv" ; | |
-perl -w -s /path/to/pipeline/scripts/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus2.preNcut.fasta" > "temp/"$sample".consensus2.fasta"; | |
+perl -w -s $pipelineScriptDir/cons_mv.pl -mpileup="temp/"$sample".consensus1.mpileup" -reference_fasta="temp/"$sample".consensus1.fasta" -mv_freq_cutoff=0.01 -mv_overall_depth_cutoff=100 -mv_variant_depth_cutoff=20 -cons_depth_cutoff=80 -sliding_window_size=300 -consensus_out="temp/"$sample".consensus2.preNcut.fasta" -mv_out="temp/"$sample".consensus1.fasta.mv" -base_freq_out="temp/"$sample".consensus1.fasta.basefreqs.tsv" ; | |
+perl -w -s $pipelineScriptDir/N_remover_from_consensus.pl -cutoff=46 "temp/"$sample".consensus2.preNcut.fasta" > "temp/"$sample".consensus2.fasta"; | |
rm temp/*.mpileup; | |
-perl -w -s /path/to/pipeline/scripts/majvarcheck2.pl -mvpath="temp/"$sample".consensus1.fasta.mv" -basefreq="temp/"$sample".consensus1.fasta.basefreqs.tsv" -fwdreads=$sample"_filtered_1.fastq" -revreads=$sample"_filtered_2.fastq"; | |
+perl -w -s $pipelineScriptDir/majvarcheck2.pl -mvpath="temp/"$sample".consensus1.fasta.mv" -basefreq="temp/"$sample".consensus1.fasta.basefreqs.tsv" -fwdreads=$sample"_filtered_1.fastq" -revreads=$sample"_filtered_2.fastq"; | |
### All done - now copy results from working directory to results directory | |
### This is essential when running a job on the cluster, otherwise can do at your | |
--- majvarcheck2.pl.orig 2016-12-02 15:10:03.000000000 -0500 | |
+++ majvarcheck2.pl 2017-06-22 13:18:44.720066194 -0400 | |
@@ -75,7 +75,7 @@ | |
system("smalt index -k 15 -s 3 $smalt_index $consensus_fasta"); | |
system("smalt map -x -y 0.5 -i 500 -n 8 -o $sam $smalt_index $fwdreads $revreads"); | |
system("samtools view -@ 8 -bh -S $sam > $bam"); | |
- system("samtools sort -@ 8 $bam $sorted"); | |
+ system("samtools sort -@ 8 -T $bam.tmpsort -o $sorted_bam $bam"); | |
system("samtools faidx $consensus_fasta"); | |
system("samtools mpileup -f $consensus_fasta $sorted_bam -d 1000000 > $pileup"); | |
$fake_mpileup = $pileup; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment