Skip to content

Instantly share code, notes, and snippets.

@PoisonAlien
Created February 11, 2020 16:21
Show Gist options
  • Save PoisonAlien/786c3f92eeb64a0bfd1fefae85978d92 to your computer and use it in GitHub Desktop.
Save PoisonAlien/786c3f92eeb64a0bfd1fefae85978d92 to your computer and use it in GitHub Desktop.
a command line toolkit for WGS/WXS data processing
#!/bin/bash
#----------------------------------------- Required files and binaries------------------------------------------------------------------------------
#for fq2bam
GATK="/home/csipk/NGS/gatk-protected-local.jar"
REGIONS="/usr/share/ref_genomes/hg19/agilent_v5_v3_merged_baits.bed"
REFFILE="/usr/share/ref_genomes/hg19/hg19.fa"
# one can obtain these from gatk ftp (knonw as gatk resource bundle)
millsIndels="/usr/share/ref_genomes/hg19/gatk_resource_bundle/Mills_and_1000G_gold_standard.indels.hg19.vcf"
KGIndels="/usr/share/ref_genomes/hg19/gatk_resource_bundle/1000G_phase1.indels.hg19.vcf"
dbSNP138="/usr/share/ref_genomes/hg19/gatk_resource_bundle/dbSNP_138.vcf"
#for vScan
VARSCAN="/home/csipk/NGS/VarScan/VarScan.v2.4.2.jar"
FPFILTER="/home/csipk/NGS/VarScan/fpfilter.pl" #Get it from here https://github.com/ckandoth/variant-filter
INDELPARSER="/home/csipk/NGS/VarScan/varscan_accessories/vs_indel_to_annovar.py" #Get it from here https://github.com/PoisonAlien/varscan_accessories
#for cnvScan
REFDICT="/usr/share/ref_genomes/hg19/hg19.dict"
#*************************************************************************************************************************************
function usage() {
echo "$(tput setaf 3)
---------------------------------------------------------------------------------------------------------------------------------------------------
usage : somatictoolkit <command> [options]
somatictoolkit - A set of optmized protocols for WXS data processing from cancer studies.
command: fq2bam aligns paired-end fastq files from WXS using bwa mem and processes them according to GATK best practices.
vScan wrapper around VarScan somatic command. Parameteres are hard coded to optmized settings which we found to work the best.
makePon Creates a Panel of Normals (PoNs) for Copy Number Analysis.
cnvScan Wraper around GATK4 copy-number protocol using PoNs to denoise.
author : anand mayakonda <[email protected]>
version: 1.0.0
---------------------------------------------------------------------------------------------------------------------------------------------------
"
tput sgr0
}
if [ $# -lt 1 ];then
usage
exit
fi
#*************************************************************************************************************************************
function fq2bam_usage() {
echo "$(tput setaf 3)
---------------------------------------------------------------------------------------------------------------------------------------------------
usage: somatictoolkit fq2bam [-R] [-t] [-D] output_fn Reference fwd.fq rev.fq
aligns paired-end fastq files from WXS using bwa mem and processes them according to GATK best practices.
bwa-mem -> samblaster (on the fly duplicate marking) -> BQSR -> analysis_ready_bam
Indel re-alignment has been deprecated.
positional arguments:
output_fn Basename for output file. Ususally sample name.
bwaidx Reference fasta file indexed with bwa.
fwd.fq Forward reads
rev.fq Reverse reads
optional arguments:
-D Output directory to store results. Optional. Default ./bams
-R Read Group of format '@RG\tID:foo\tSM:bar'. Default generates from output_fn.
-t threads to use. Default 4.
Example: fq2bam foo hg19.fa foo_1.fq.gz foo_2.fq.gz
---------------------------------------------------------------------------------------------------------------------------------------------------
"
tput sgr0
}
#*************************************************************************************************************************************
function vScan_usage() {
echo "$(tput setaf 3)
---------------------------------------------------------------------------------------------------------------------------------------------------
usage: somatictoolkit vScan [-o] [-D] tumor normal
wrapper around VarScan somatic command. Parameteres are hard coded to values which we found to work the best.
Protocol: Varscan somatic -> Varscan processSomatic -> bam-readcount -> fpFilter -> analysis_ready_variants
Output : maflite (MAF with necessary fields required for annotation), which can be directly passed to maf2maf for variant annotation via VEP.
positional arguments:
tumor Tumor BAM file. Required.
normal Normal BAM file. Required.
optional arguments:
-o basename for output file. Optional. Default parses from tumor BAM file.
-D Output directory to store results. Optional. Default ./vScan_op
Example: vScan.sh tumor.bam normal.bam
---------------------------------------------------------------------------------------------------------------------------------------------------
"
tput sgr0
}
#*************************************************************************************************************************************
function makePon_usage (){
echo "$(tput setaf 3)
---------------------------------------------------------------------------------------------------------------------------------------------------
usage: somatictoolkit makePon [output_fn] [output_dir] normal_bam_list
positional arguments:
normal_bam_list file containing paths to bam files (one bam file per line)
optional arguments:
-o basename for output file. Optional. Default PoN
-D Output directory to store results. Optional. Default ./cnvScan/PoN/
-R Exome bait file. 3 column BED file.
Example: somatictoolkit makePon normal_bams_list.tsv
# normal_bams_list.tsv must contain file paths to bam files (one bam file per line) in the following format:
# normal1.bam
# normal2.bam
# ...
---------------------------------------------------------------------------------------------------------------------------------------------------
"
tput sgr0
}
#*************************************************************************************************************************************
function cnvScan_usage (){
echo "$(tput setaf 3)
---------------------------------------------------------------------------------------------------------------------------------------------------
Program: cnvScan - Wraper around GATK4 tumor-normal pair Allelic Copy Number (ACNV) protocol. Requires GATK4
positional arguments:
pon Panel of Normal file; .pon file created from makePon. Required
tumor Tumor BAM file. Required.
optional arguments:
-o basename for output files. Optional. Default parses from tumor BAM file.
-D Output directory to store results. Optional. Default ./cnvScan/
Example: cnvScan.sh [-o] [-D] <PoN.pon> <tumor.bam>
---------------------------------------------------------------------------------------------------------------------------------------------------
"
tput sgr0
}
#*************************************************************************************************************************************
function setPaths() {
echo "$(tput setaf 3)
Set paths to GATK jar file, resource bundle and exome bait regions from line 7 to 13 of this script.
GATK jar file: https://software.broadinstitute.org/gatk/download/
Download GATK resource bundle for your reference build from below ftp with your favourite ftp client (e.g; filezilla):
Host : ftp.broadinstitute.org
Username: gsapubftp-anonymous
"
tput sgr0
}
#*************************************************************************************************************************************
function fq2bam_check() {
samb=`which samblaster || true`
if [ ! -f "$samb" ];then
echo "$(tput setaf 1)samblaster not found. Get it from here: https://github.com/GregoryFaust/samblaster"
tput sgr0
exit
fi
samt=`which samtools || true`
if [ ! -f "$samt" ];then
echo "$(tput setaf 1)samtools not found. Get it from here: http://www.htslib.org/download/"
tput sgr0
exit
fi
if [ ! -f "$GATK" ];then
echo "GATK $GATK file does not exists!"
setPaths
exit
fi
if [ ! -f "$millsIndels" ];then
setPaths
echo "$millsIndels file does not exists!"
exit
fi
if [ ! -f "$KGIndels" ];then
setPaths
echo "$(tput setaf 3)$KGIndels does not exists!"
tput sgr0
exit
fi
if [ ! -f "$dbSNP138" ];then
setPaths
echo "$dbSNP138 does not exists!"
exit
fi
if [ ! -f "$REGIONS" ];then
echo "$(tput setaf 3)$REGIONS does not exists!"
tput sgr0
exit
fi
}
#*************************************************************************************************************************************
function vscan_check() {
#Check if samtools is installed
samt=`which samtools || true`
if [ ! -f "$samt" ];then
echo "$(tput setaf 1)samtools not found. Try: sudo apt-get install samtools"
tput sgr0
exit
fi
#Check if bam-readcount is installed
bamrc=`which bam-readcount || true`
if [ ! -f "$bamrc" ];then
echo "$(tput setaf 1)bam-readcount not found. Make sure its installed and under path. Get it from here: https://github.com/genome/bam-readcount"
tput sgr0
exit
fi
if [ ! -f "$REFFILE" ];then
echo "$(tput setaf 3)$REFFILE does not exists! Set correct path to reference fasta at line 9 of this script."
tput sgr0
exit
fi
if [ ! -f "$VARSCAN" ];then
echo "$(tput setaf 3)$VARSCAN does not exists! Set correct path to VarScan jar file at line 16-18 of this script."
tput sgr0
exit
fi
if [ ! -f "$FPFILTER" ];then
echo "$(tput setaf 3)$FPFILTER does not exists! Get it from here https://github.com/ckandoth/variant-filter and set correct path at line 16-18 of this script."
tput sgr0
exit
fi
if [ ! -f "$INDELPARSER" ];then
echo "$(tput setaf 3)$INDELPARSER does not exists! Get it from here https://github.com/PoisonAlien/varscan_accessories and set correct path at line 16-18 of this script."
tput sgr0
exit
fi
}
#*************************************************************************************************************************************
function createPon_check() {
if [ ! -f "$REFFILE" ];then
echo "$(tput setaf 3)$REFFILE file does not exists! Set correct path to reference fasta at line 46 of this script."
tput sgr0
exit
fi
if [ ! -f "$GATK" ];then
echo "$(tput setaf 3)$GATK does not exists! Set correct path to GATK4 jar file at line 47 of this script."
tput sgr0
exit
fi
}
#*************************************************************************************************************************************
function fq2bam() {
if test -z "$2"
then
fq2bam_usage
echo "$2"
exit 1
fi
fq2bam_check
output_dir="./bams/"
threads=4
dt=`date --iso-8601`
rdgrp=""
while getopts ":hw:D:R:t:" OPTION
do
case "${OPTION}" in
h)
fq2bam_usage
exit 1
;;
D)
output_dir="$OPTARG"
;;
R)
rdgrp="$OPTARG"
;;
t)
threads="$OPTARG"
;;
?)
unkOpt="$OPTARG"
fq2bam_usage
echo -e "Unknown option: ${unkOpt}"
exit
;;
esac
done
output_fn="${@:${OPTIND}:1}"
bwaidx="${@:$((${OPTIND}+1)):1}"
fq1="${@:$((${OPTIND}+2)):1}"
fq2="${@:$((${OPTIND}+3)):1}"
if [[ -z "$rdgrp" ]]
then
rdgrp="\"@RG\tID:${output_fn}\tSM:${output_fn}\tLB:${output_fn}\tPL:illumina\tPU:Unknown\tDT:${dt}\""
fi
if [[ -z "$output_fn" ]]
then
fq2bam_usage
echo -e "Please provide a basename for output file. Usally sample name. \n"
exit 0
elif [[ -z "$fq1" ]] || [[ ! -f "$fq1" ]]
then
fq2bam_usage
echo -e "forward fastq $fq1 does not exists!"
exit 0
elif [[ -z "$fq2" ]] || [[ ! -f "$fq2" ]]
then
fq2bam_usage
echo -e "reverse fastq $fq2 does not exists!"
exit 0
elif [[ -z "$bwaidx" ]] || [[ ! -f "$bwaidx" ]]
then
fq2bam_usage
echo -e "$bwaidx index does not exists"
exit 0
fi
mkdir -p ${output_dir}
echo -e "$(tput setaf 3)$(date): Aligning and Removing duplicates.. \n"
algnCmd="bwa mem -M -v 1 -T 0 -R ${rdgrp} -t ${threads} ${bwaidx} ${fq1} ${fq2} 2> ${output_dir}/${output_fn}.bwa.log | samblaster -M 2> ${output_dir}/${output_fn}.samblaster.log | samtools view -@ ${threads} -bS - | samtools sort -@ ${threads} -o ${output_dir}/${output_fn}.raw.bam -"
eval $algnCmd
cat ${output_dir}/${output_fn}.samblaster.log
echo -e "\n$(tput setaf 3)$(date): Indexing raw BAM.. \n"
samtools index -@ ${threads} ${output_dir}/${output_fn}.raw.bam
cat ${REGIONS} | grep -v "^#" | awk '{print $1":"$2"-"$3}' > ${output_dir}/${output_fn}.exome.intervals
echo -e "$(tput setaf 3)$(date): Performing base quality recalibration.. \n"
echo -e "----------------------------------------------------------------"
bqsrCmd="java -d64 -Xmx4g -jar ${GATK} BaseRecalibrator --interval_padding 250 --intervals ${output_dir}/${output_fn}.exome.intervals --secondsBetweenProgressUpdates 30.0 --reference ${REFFILE} \
--output ${output_dir}/${output_fn}.recal.table --knownSites ${dbSNP138} --knownSites ${KGIndels} --knownSites ${millsIndels} \
--input ${output_dir}/${output_fn}.raw.bam > ${output_dir}/${output_fn}.bqsr.log"
eval ${bqsrCmd}
echo -e "----------------------------------------------------------------"
echo -e "\n$(tput setaf 3)$(date): Printing final BAM file.. \n"
echo -e "---------------------------------------------------"
applyCmd="java -d64 -Xmx4g -jar ${GATK} ApplyBQSR --input ${output_dir}/${output_fn}.raw.bam --output ${output_dir}/${output_fn}.bam
--bqsr_recal_file ${output_dir}/${output_fn}.recal.table --interval_padding 250 --intervals ${output_dir}/${output_fn}.exome.intervals > ${output_dir}/${output_fn}.ApplyBQSR.log"
eval ${applyCmd}
echo -e "---------------------------------------------------"
echo -e "\n$(tput setaf 3)$(date): Removing intermediate files and cleaning.. \n"
rm ${output_dir}/${output_fn}.raw.bam
rm ${output_dir}/${output_fn}.raw.bam.bai
rm ${output_dir}/${output_fn}.exome.intervals
echo -e "$(tput setaf 3)$(date): DONE \n"
}
#*************************************************************************************************************************************
function vScan() {
if test -z "$2"
then
vScan_usage
exit 1
fi
#Default values for arguments
output_fn=""
output_dir=""
while getopts ":hw:D:o:" OPTION
do
case "${OPTION}" in
h)
vScan_usage
exit 1
;;
D)
output_dir="$OPTARG"
;;
o)
output_fn="$OPTARG"
;;
?)
unkOpt="$OPTARG"
vScan_usage
echo -e "Unknown option: ${unkOpt}"
exit
;;
esac
done
vscan_check
#positional arguments
tumor="${@:${OPTIND}:1}"
normal="${@:$((${OPTIND}+1)):1}"
if [[ -z "$tumor" ]]
then
vScan_usage
echo -e "Missing positional arguments. \n"
exit 0
elif [[ -z "$tumor" ]] || [[ ! -f "$tumor" ]]
then
vScan_usage
echo -e "$tumor BAM does not exists!"
exit 0
elif [[ -z "$normal" ]] || [[ ! -f "$normal" ]]
then
vScan_usage
echo -e "$normal BAM does not exists!"
exit 0
fi
if [[ -z "$output_fn" ]]
then
output_fn=`basename $tumor | sed 's/.bam$//I'`
fi
if [[ -z "$output_dir" ]]
then
output_dir="./varscan_op/$output_fn/"
fi
#------------------Run VarScan commands -----------------------------------
mkdir -p $output_dir"/raw_somatic_op/"
echo -e "#Main command:" > $output_dir"/$output_fn.commands.log"
echo -e "vScan.sh $tumor $normal $output_fn $output_dir\n" >> $output_dir"/$output_fn.commands.log"
echo -e "$(tput setaf 3)$(date): Running Varscan Somatic.. \n"
tput sgr0
vsCmd="samtools mpileup -B -f $REFFILE -q 10 -L 10000 -d 10000 $normal $tumor |
java -Xmx4g -d64 -jar $VARSCAN somatic -mpileup $output_dir"/raw_somatic_op/$output_fn" --min-coverage-normal 10 --min-coverage-tumor 14 --min-var-freq 0.02 --strand-filter 1"
echo -e "#Internal sub commands at each step:" >> $output_dir"/$output_fn.commands.log"
echo -e "#VarScan somatic command" >> $output_dir"/$output_fn.commands.log"
echo -e $vsCmd >> $output_dir"/$output_fn.commands.log"
eval $vsCmd
echo -e "$(tput setaf 3)\n$(date): Running Varscan processSomatic for SNPs.. \n"
tput sgr0
vsPsSNPcmd="java -jar $VARSCAN processSomatic $output_dir"/raw_somatic_op/$output_fn.snp" --min-tumor-freq 0.03 --max-normal-freq 0.01 --p-value 0.05"
echo -e "\n#VarScan processSomatic command" >> $output_dir"/$output_fn.commands.log"
echo -e $vsPsSNPcmd >> $output_dir"/$output_fn.commands.log"
eval $vsPsSNPcmd
echo -e "$(tput setaf 3)\n$(date): Running Varscan processSomatic for INDELS.. \n"
tput sgr0
vsPsINDELcmd="java -jar $VARSCAN processSomatic $output_dir"/raw_somatic_op/$output_fn.indel" --min-tumor-freq 0.03 --max-normal-freq 0.01 --p-value 0.01"
echo -e $vsPsINDELcmd >> $output_dir"/$output_fn.commands.log"
eval $vsPsINDELcmd
#------------------ Move files into clean directories.----------------------------------------------------
mkdir -p $output_dir"/procesSomatic_op/SNP/"
mkdir -p $output_dir"/procesSomatic_op/SNP/LOH/"
mkdir -p $output_dir"/procesSomatic_op/SNP/Germline/"
mkdir -p $output_dir"/procesSomatic_op/SNP/Somatic/"
mv $output_dir"/raw_somatic_op/$output_fn.snp.LOH"* $output_dir"/procesSomatic_op/SNP/LOH/"
mv $output_dir"/raw_somatic_op/$output_fn.snp.Germline"* $output_dir"/procesSomatic_op/SNP/Germline/"
mv $output_dir"/raw_somatic_op/$output_fn.snp.Somatic"* $output_dir"/procesSomatic_op/SNP/Somatic/"
mkdir -p $output_dir"/procesSomatic_op/INDEL/"
mkdir -p $output_dir"/procesSomatic_op/INDEL/LOH/"
mkdir -p $output_dir"/procesSomatic_op/INDEL/Germline/"
mkdir -p $output_dir"/procesSomatic_op/INDEL/Somatic/"
mv $output_dir"/raw_somatic_op/$output_fn.indel.LOH"* $output_dir"/procesSomatic_op/INDEL/LOH/"
mv $output_dir"/raw_somatic_op/$output_fn.indel.Germline"* $output_dir"/procesSomatic_op/INDEL/Germline/"
mv $output_dir"/raw_somatic_op/$output_fn.indel.Somatic"* $output_dir"/procesSomatic_op/INDEL/Somatic/"
#------------------ Process INDELS--------------------------------------------------------------------------
echo -e "$(tput setaf 3)\n$(date): Preparing INDEL file for VEP annotation.. \n"
tput sgr0
echo -e "Chromosome\tStart_Position\tReference_Allele\tTumor_Seq_Allele2\tt_depth\tt_vaf\tTumor_Sample_Barcode\tCenter\tVerification_Status\tValidation_Status\tMutation_Status\tValidation_Method\tBAM_File\tSequencer" > $output_dir"/procesSomatic_op/INDEL/Somatic/$output_fn.indel.Somatic.hc.mafLite"
indelCmd="python $INDELPARSER --min-var-count 4 --min-depth 14 --min-var-frac 0.03 --min-depth-normal 10 $output_dir"/procesSomatic_op/INDEL/Somatic/$output_fn.indel.Somatic.hc""
echo -e "\n#Preparing INDEL file for VEP annotation" >> $output_dir"/$output_fn.commands.log"
echo -e $indelCmd >> $output_dir"/$output_fn.commands.log"
eval $indelCmd | awk -v tsb="$output_fn" -v bam="$tumor" '{OFS="\t"; print $1,$2+1,$4,$5,$9+$10,$11/100,tsb,"CSI-NUS","Unknown","Untested","Somatic","No",bam,"Illumina HiSeq"}' >> $output_dir"/procesSomatic_op/INDEL/Somatic/$output_fn.indel.Somatic.hc.mafLite" #+1 to loci to void issues with zero based cordinates from Indels
#------------------ Process SNPS--------------------------------------------------------------------------
echo -e "$(tput setaf 3)$(date): Filtering SNPs for false-positives.. \n"
tput sgr0
mkdir -p $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/"
sed 1d $output_dir"/procesSomatic_op/SNP/Somatic/$output_fn.snp.Somatic.hc" | awk '{OFS="\t"; print $1,$2,$2,$3,$4}' > $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.var"
sed 1d $output_dir"/procesSomatic_op/SNP/Somatic/$output_fn.snp.Somatic.hc" | awk '{OFS="\t"; print $1,$2,$3,$4}' > $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.varFile"
echo -e "$(tput setaf 3)$(date): Extracting read-counts for high confident Somatic SNPs.. \n"
tput sgr0
bamrcCmd="bam-readcount -q10 -w1 -b10 -l $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.var" -f $REFFILE $tumor > $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.readcounts""
echo -e "\n#Bam-readcount command" >> $output_dir"/$output_fn.commands.log"
echo -e $bamrcCmd >> $output_dir"/$output_fn.commands.log"
eval $bamrcCmd
echo -e "$(tput setaf 3)\n$(date): Running fpfilter.. \n"
tput sgr0
fpFilterCmd="perl $FPFILTER --var-file $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.varFile" --readcount-file $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.readcounts" --min-depth 14 --min-read-pos 0.1 --min-strandedness 0.01 --max-mmqs-diff 90 --min-var-count 4 --min-var-frac 0.03 --min-var-dist-3 0.1 --max-var-mmqs 100 --output-file $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.fpfilter""
echo -e "\n#FpFilter command" >> $output_dir"/$output_fn.commands.log"
echo -e $fpFilterCmd >> $output_dir"/$output_fn.commands.log"
eval $fpFilterCmd
echo -e "Chromosome\tStart_Position\tReference_Allele\tTumor_Seq_Allele2\tt_depth\tt_vaf\tTumor_Sample_Barcode\tCenter\tVerification_Status\tValidation_Status\tMutation_Status\tValidation_Method\tBAM_File\tSequencer" > $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.fpfilter.mafLite"
grep 'PASS' $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.fpfilter" | awk -v tsb="$output_fn" -v bam="$tumor" '{OFS="\t" ; print $1,$2,$3,$4,$5,$7,tsb,"CSI-NUS","Unknown","Untested","Somatic","No",bam,"Illumina HiSeq"}' >> $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.fpfilter.mafLite"
echo -e "\n#Only PASS variants from fpFilter output are used as final results for SNPs" >> $output_dir"/$output_fn.commands.log"
#mkdir -p $output_dir"final_vars"
echo -e "Chromosome\tStart_Position\tReference_Allele\tTumor_Seq_Allele2\tt_depth\tt_vaf\tTumor_Sample_Barcode\tCenter\tVerification_Status\tValidation_Status\tMutation_Status\tValidation_Method\tBAM_File\tSequencer" > $output_dir"/$output_fn.somatic.mafLite"
#------------------ Write final Somatic MAF--------------------------------------------------------------------------
cat <(sed 1d "$output_dir/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.fpfilter.mafLite") <(sed 1d "$output_dir/procesSomatic_op/INDEL/Somatic/$output_fn.indel.Somatic.hc.mafLite") | sed 's/^chr//' >> $output_dir"/$output_fn.somatic.mafLite"
vars=`sed 1d $output_dir"/$output_fn.somatic.mafLite" | wc -l`
echo -e "$(tput setaf 3)\n$(date): Number of high confidence, false positive filtered variants (INDELS are tricky): $vars [$output_dir"/$output_fn.somatic.mafLite"]"
echo -e "$(tput setaf 3)\n$(date): Done!\n"
tput sgr0
}
#*************************************************************************************************************************************
function makePon() {
if test -z "$1"
then
makePon_usage
exit 1
fi
output_fn=""
output_dir=""
while getopts ":hw:D:R:o:" OPTION
do
case "${OPTION}" in
h)
makePon_usage
exit 1
;;
D)
output_dir="$OPTARG"
;;
o)
output_fn="$OPTARG"
;;
R)
REGIONS="$OPTARG"
;;
?)
unkOpt="$OPTARG"
makePon_usage
echo -e "Unknown option: ${unkOpt}"
exit
;;
esac
done
#Default values for arguments
if [[ -z "$output_fn" ]]
then
output_fn="PoN"
fi
if [[ -z "$output_dir" ]]
then
output_dir="./cnvScan/$output_fn/"
fi
createPon_check
if [ ! -f "$REGIONS" ];then
echo "$(tput setaf 3)Exome baits $REGIONS does not exists! Set correct path at line 8 of this script or provide one with argument -R."
tput sgr0
exit
fi
if [ ! -f "$REFDICT" ];then
echo "$(tput setaf 3)Referece dictionary $REFDICT does not exists! Set correct path to Reference dict file at line 21 of this script. Cerating one for now.."
tput sgr0
$REFDICT=$REFFILE.dict
java -jar $GATK CreateSequenceDictionary -R $REFFILE --output $REFDICT --QUIET true
fi
#positional arguments
normal="${*: -1:1}"
#echo -e "$output_fn $output_dir $REGIONS $REFDICT $normal"
#exit
mkdir -p $output_dir
#Make a tragets file from bait BED file
echo -e "contig\tstart\tstop\tname" > $output_dir/$output_fn".targets"
grep -v "^#" $REGIONS | sort -V -k1,2 | cut -f 1-3 | awk '{OFS="\t"; print $0,"target_"NR}' >> $output_dir/$output_fn".targets"
echo -e "$(tput setaf 3)\n$(date): Collecting target coverage for all normal samples.. \n"
touch $output_dir/$output_fn.colCov.temp.txt
cat $normal | grep -v "^#" | while read line
do
if [ ! -f "$line" ]
then
echo "$line does not exists! Skipping.."
else
fbn=`basename $line | sed 's/.bam//I'`
echo -e "java -Xmx4g -d64 -jar ${GATK} CalculateTargetCoverage --input $line --reference $REFFILE --targets $output_dir/$output_fn.targets --groupBy SAMPLE --transform PCOV --targetInformationColumns FULL --interval_set_rule UNION --interval_padding 250 --secondsBetweenProgressUpdates 30.0 --disableToolDefaultReadFilters false --disableSequenceDictionaryValidation true --disableReadFilter NotDuplicateReadFilter --output $output_dir/$fbn.coverage.tsv" >> $output_dir/$output_fn.colCov.temp.txt
fi
done
tput sgr0
cat $output_dir/$output_fn.colCov.temp.txt | parallel -j 20 --progress > $output_dir/$output_fn.colCov.log
rm $output_dir/$output_fn.colCov.temp.txt
ls $output_dir | grep .coverage.tsv$ | awk -v var="$output_dir/" '{print var$0}' > $output_dir/$output_fn.coverageFileList.tsv
echo -e "$(tput setaf 3)\n$(date): Merging per-sample coverage info into single file.. \n"
java -d64 -Xmx4g -jar $GATK CombineReadCounts \
--inputList $output_dir/$output_fn.coverageFileList.tsv \
--maxOpenFiles 100 \
--output $output_dir/$output_fn.tsv > $output_dir/$output_fn.CombineReadCounts.log
echo -e "$(tput setaf 3)\n$(date): Creating targets with GC annotations.. \n"
java -Xmx4g -jar $GATK AnnotateTargets \
--targets $output_dir/$output_fn".targets" \
--reference $REFFILE \
--output $output_dir/$output_fn".annotated.tsv" > $output_dir/$output_fn.AnnotateTargets.log
echo -e "$(tput setaf 3)\n$(date): Correcting coverage profile for sample-specific GC bias.. \n"
java -Xmx4g -jar $GATK CorrectGCBias \
--input $output_dir/$output_fn.tsv \
--targets $output_dir/$output_fn".annotated.tsv" \
--output $output_dir/$output_fn.gc_corrected.tsv > $output_dir/$output_fn.CorrectGCBias.log
echo -e "$(tput setaf 3)\n$(date): Create panel of normals from combined, GC-corrected coverage profiles.. \n"
java -Xmx4g -jar $GATK CreatePanelOfNormals \
--input $output_dir/$output_fn.gc_corrected.tsv \
--extremeColumnMedianCountPercentileThreshold 2.5 \
--truncatePercentileThreshold 0.1 \
--noQC false \
--output $output_dir/$output_fn.pon > $output_dir/$output_fn.CreatePanelOfNormals.log
echo -e "$(tput setaf 3)\n$(date): DONE!"
tput sgr0
}
#*************************************************************************************************************************************
function cnvScan() {
if test -z "$1"
then
cnvScan_usage
exit 1
fi
output_fn=""
output_dir=""
while getopts ":hw:D:o:" OPTION
do
case "${OPTION}" in
h)
cnvScan_usage
exit 1
;;
D)
output_dir="$OPTARG"
;;
o)
output_fn="$OPTARG"
;;
?)
unkOpt="$OPTARG"
makePon_usage
echo -e "Unknown option: ${unkOpt}"
exit
;;
esac
done
#positional arguments
pon="${@:${OPTIND}:1}"
tumor="${@:$((${OPTIND}+1)):1}"
if [[ -z "$pon" ]] || [[ ! -f "$pon" ]]
then
cnvScan_usage
echo -e "Panel of Normal file $pon does not exists"
exit
fi
if [[ -z "$tumor" ]] || [[ ! -f "$tumor" ]]
then
cnvScan_usage
echo -e "Tumor $tumor does not exists"
exit
fi
#Default values for arguments
if [[ -z "$output_fn" ]]
then
output_fn=`basename $tumor | sed 's/.bam//I'`
fi
if [[ -z "$output_dir" ]]
then
output_dir="./cnvScan/$output_fn"
fi
if [ ! -f "$REFDICT" ];then
echo "$(tput setaf 3)Referece dictionary $REFDICT does not exists! Set correct path to Reference dict file at line 21 of this script. Cerating one for now.."
tput sgr0
$REFDICT=$REFFILE.dict
java -jar $GATK CreateSequenceDictionary -R $REFFILE --output $REFDICT --QUIET true
fi
mkdir -p $output_dir
mkdir -p $output_dir/$output_fn"_log"
#Make a tragets file from bait BED file
echo -e "contig\tstart\tstop\tname" > $output_dir/$output_fn".targets"
grep -v "^#" $REGIONS | sort -V -k1,2 | cut -f 1-3 | awk '{OFS="\t"; print $0,"target_"NR}' >> $output_dir/$output_fn".targets"
#make picard interval list for heterozygous SNPs from normal
#echo "Creating Interval List for heterozygous SNPs"
#cat $snp | sed 1d | sed 's/%//g' | grep -v LOH | awk '{if($7 > 10 && $7 < 75) print $0}' | awk '{if($14 < 1E-2) print $0}' | awk '{OFS="\t" ; if($5+$6 > 25) print $1,$2,$2,$13":"$1":"$2}' > $output_dir/$output_fn".common_sites.bed"
#java -Xmx4G -d64 -jar $GATK BedToIntervalList --input $output_dir/$output_fn".common_sites.bed" --output $output_dir/$output_fn".common_sites.IntervalList" --SEQUENCE_DICTIONARY $REFDICT --QUIET true
echo -e "$(tput setaf 3)\n$(date): Padding 250bp around exome bait regions.. \n"
java -d64 -Xmx1g -jar $GATK PadTargets \
--targets $output_dir/$output_fn".targets" \
--padding 250 \
--output $output_dir/$output_fn".targets.padded.tsv" > $output_dir/$output_fn"_log"/PadTargets.log
echo -e "$(tput setaf 3)\n$(date): Collecting target coverage for $tumor.. \n"
java -d64 -Xmx4g -jar $GATK CalculateTargetCoverage \
--input $tumor \
--reference $REFFILE \
--targets $output_dir/$output_fn".targets.padded.tsv" \
--groupBy SAMPLE \
--transform PCOV \
--targetInformationColumns FULL \
--interval_set_rule UNION \
--interval_padding 0 \
--secondsBetweenProgressUpdates 10.0 \
--disableToolDefaultReadFilters false \
--disableSequenceDictionaryValidation true \
--disableReadFilter NotDuplicateReadFilter \
--output $output_dir/$output_fn.tumor.coverage.tsv > $output_dir/$output_fn"_log"/CalculateTargetCoverage.log
echo -e "$(tput setaf 3)\n$(date): Annotating targets.. \n"
java -Xmx4g -jar $GATK AnnotateTargets \
--targets $output_dir/$output_fn".targets.padded.tsv" \
--reference $REFFILE \
--output $output_dir/$output_fn.annotated.tsv > $output_dir/$output_fn"_log"/AnnotateTargets.log
echo -e "$(tput setaf 3)\n$(date): Correcting for GCBias.. \n"
java -Xmx4g -jar $GATK CorrectGCBias \
--input $output_dir/$output_fn.tumor.coverage.tsv \
--targets $output_dir/$output_fn.annotated.tsv \
--output $output_dir/$output_fn.gc_corrected.tsv > $output_dir/$output_fn"_log"/CorrectGCBias.log
echo -e "$(tput setaf 3)\n$(date): Normalizing Readcounts.. \n"
java -Xmx4g -jar $GATK NormalizeSomaticReadCounts \
--input $output_dir/$output_fn.gc_corrected.tsv \
--targets $output_dir/$output_fn".targets.padded.tsv" \
--panelOfNormals $pon \
--tangentNormalized $output_dir/$output_fn.tn.tsv \
--factorNormalizedOutput $output_dir/$output_fn.fnt.tsv \
--preTangentNormalized $output_dir/$output_fn.preTN.tsv \
--betaHatsOutput $output_dir/$output_fn.betaHats.tsv > $output_dir/$output_fn"_log"/NormalizeSomaticReadCounts.log
echo -e "$(tput setaf 3)\n$(date): Performing Segmentation.. \n"
java -Xmx4g -jar $GATK PerformSegmentation \
--tangentNormalized $output_dir/$output_fn.tn.tsv \
--log2Input true \
--alpha 0.01 \
--eta 0.05 \
--kmax 25 \
--minWidth 2 \
--nmin 200 \
--nperm 10000 \
--pmethod HYBRID \
--trim 0.025 \
--undoPrune 0.05 \
--undoSD 2 \
--undoSplits NONE \
--output $output_dir/$output_fn.seg > $output_dir/$output_fn"_log"/PerformSegmentation.log
echo -e "$(tput setaf 3)\n$(date): Annotating Segments.. \n"
java -Xmx4g -jar $GATK CallSegments \
--tangentNormalized $output_dir/$output_fn.tn.tsv \
--segments $output_dir/$output_fn.seg \
--legacy false \
--output $output_dir/$output_fn.seg.called > $output_dir/$output_fn"_log"/CallSegments.log
echo -e "$(tput setaf 3)\n$(date): Plotting Segments.. \n"
java -Xmx4g -jar $GATK PlotSegmentedCopyRatio \
--tangentNormalized $output_dir/$output_fn.tn.tsv \
--preTangentNormalized $output_dir/$output_fn.preTN.tsv \
--segments $output_dir/$output_fn.seg.called \
-SD $REFDICT \
--output $output_dir \
--outputPrefix $output_fn > $output_dir/$output_fn"_log"/PlotSegmentedCopyRatio.log
echo -e "$(tput setaf 3)\n$(date): DONE! \n"
tput sgr0
}
#*************************************************************************************************************************************
case "$1" in
'fq2bam')
fq2bam "${@:2}"
;;
'vScan')
vScan "${@:2}"
;;
'somatic')
somatic "${@:2}"
;;
'makePon')
makePon "${@:2}"
;;
'cnvScan')
cnvScan "${@:2}"
;;
*)
usage
echo -e "Error: command \"$1\" not recognized\n"
exit 1
esac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment