Created
February 11, 2020 16:21
-
-
Save PoisonAlien/786c3f92eeb64a0bfd1fefae85978d92 to your computer and use it in GitHub Desktop.
a command line toolkit for WGS/WXS data processing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
#----------------------------------------- Required files and binaries------------------------------------------------------------------------------ | |
#for fq2bam | |
GATK="/home/csipk/NGS/gatk-protected-local.jar" | |
REGIONS="/usr/share/ref_genomes/hg19/agilent_v5_v3_merged_baits.bed" | |
REFFILE="/usr/share/ref_genomes/hg19/hg19.fa" | |
# one can obtain these from gatk ftp (knonw as gatk resource bundle) | |
millsIndels="/usr/share/ref_genomes/hg19/gatk_resource_bundle/Mills_and_1000G_gold_standard.indels.hg19.vcf" | |
KGIndels="/usr/share/ref_genomes/hg19/gatk_resource_bundle/1000G_phase1.indels.hg19.vcf" | |
dbSNP138="/usr/share/ref_genomes/hg19/gatk_resource_bundle/dbSNP_138.vcf" | |
#for vScan | |
VARSCAN="/home/csipk/NGS/VarScan/VarScan.v2.4.2.jar" | |
FPFILTER="/home/csipk/NGS/VarScan/fpfilter.pl" #Get it from here https://github.com/ckandoth/variant-filter | |
INDELPARSER="/home/csipk/NGS/VarScan/varscan_accessories/vs_indel_to_annovar.py" #Get it from here https://github.com/PoisonAlien/varscan_accessories | |
#for cnvScan | |
REFDICT="/usr/share/ref_genomes/hg19/hg19.dict" | |
#************************************************************************************************************************************* | |
function usage() { | |
echo "$(tput setaf 3) | |
--------------------------------------------------------------------------------------------------------------------------------------------------- | |
usage : somatictoolkit <command> [options] | |
somatictoolkit - A set of optmized protocols for WXS data processing from cancer studies. | |
command: fq2bam aligns paired-end fastq files from WXS using bwa mem and processes them according to GATK best practices. | |
vScan wrapper around VarScan somatic command. Parameteres are hard coded to optmized settings which we found to work the best. | |
makePon Creates a Panel of Normals (PoNs) for Copy Number Analysis. | |
cnvScan Wraper around GATK4 copy-number protocol using PoNs to denoise. | |
author : anand mayakonda <[email protected]> | |
version: 1.0.0 | |
--------------------------------------------------------------------------------------------------------------------------------------------------- | |
" | |
tput sgr0 | |
} | |
if [ $# -lt 1 ];then | |
usage | |
exit | |
fi | |
#************************************************************************************************************************************* | |
function fq2bam_usage() { | |
echo "$(tput setaf 3) | |
--------------------------------------------------------------------------------------------------------------------------------------------------- | |
usage: somatictoolkit fq2bam [-R] [-t] [-D] output_fn Reference fwd.fq rev.fq | |
aligns paired-end fastq files from WXS using bwa mem and processes them according to GATK best practices. | |
bwa-mem -> samblaster (on the fly duplicate marking) -> BQSR -> analysis_ready_bam | |
Indel re-alignment has been deprecated. | |
positional arguments: | |
output_fn Basename for output file. Ususally sample name. | |
bwaidx Reference fasta file indexed with bwa. | |
fwd.fq Forward reads | |
rev.fq Reverse reads | |
optional arguments: | |
-D Output directory to store results. Optional. Default ./bams | |
-R Read Group of format '@RG\tID:foo\tSM:bar'. Default generates from output_fn. | |
-t threads to use. Default 4. | |
Example: fq2bam foo hg19.fa foo_1.fq.gz foo_2.fq.gz | |
--------------------------------------------------------------------------------------------------------------------------------------------------- | |
" | |
tput sgr0 | |
} | |
#************************************************************************************************************************************* | |
function vScan_usage() { | |
echo "$(tput setaf 3) | |
--------------------------------------------------------------------------------------------------------------------------------------------------- | |
usage: somatictoolkit vScan [-o] [-D] tumor normal | |
wrapper around VarScan somatic command. Parameteres are hard coded to values which we found to work the best. | |
Protocol: Varscan somatic -> Varscan processSomatic -> bam-readcount -> fpFilter -> analysis_ready_variants | |
Output : maflite (MAF with necessary fields required for annotation), which can be directly passed to maf2maf for variant annotation via VEP. | |
positional arguments: | |
tumor Tumor BAM file. Required. | |
normal Normal BAM file. Required. | |
optional arguments: | |
-o basename for output file. Optional. Default parses from tumor BAM file. | |
-D Output directory to store results. Optional. Default ./vScan_op | |
Example: vScan.sh tumor.bam normal.bam | |
--------------------------------------------------------------------------------------------------------------------------------------------------- | |
" | |
tput sgr0 | |
} | |
#************************************************************************************************************************************* | |
function makePon_usage (){ | |
echo "$(tput setaf 3) | |
--------------------------------------------------------------------------------------------------------------------------------------------------- | |
usage: somatictoolkit makePon [output_fn] [output_dir] normal_bam_list | |
positional arguments: | |
normal_bam_list file containing paths to bam files (one bam file per line) | |
optional arguments: | |
-o basename for output file. Optional. Default PoN | |
-D Output directory to store results. Optional. Default ./cnvScan/PoN/ | |
-R Exome bait file. 3 column BED file. | |
Example: somatictoolkit makePon normal_bams_list.tsv | |
# normal_bams_list.tsv must contain file paths to bam files (one bam file per line) in the following format: | |
# normal1.bam | |
# normal2.bam | |
# ... | |
--------------------------------------------------------------------------------------------------------------------------------------------------- | |
" | |
tput sgr0 | |
} | |
#************************************************************************************************************************************* | |
function cnvScan_usage (){ | |
echo "$(tput setaf 3) | |
--------------------------------------------------------------------------------------------------------------------------------------------------- | |
Program: cnvScan - Wraper around GATK4 tumor-normal pair Allelic Copy Number (ACNV) protocol. Requires GATK4 | |
positional arguments: | |
pon Panel of Normal file; .pon file created from makePon. Required | |
tumor Tumor BAM file. Required. | |
optional arguments: | |
-o basename for output files. Optional. Default parses from tumor BAM file. | |
-D Output directory to store results. Optional. Default ./cnvScan/ | |
Example: cnvScan.sh [-o] [-D] <PoN.pon> <tumor.bam> | |
--------------------------------------------------------------------------------------------------------------------------------------------------- | |
" | |
tput sgr0 | |
} | |
#************************************************************************************************************************************* | |
function setPaths() { | |
echo "$(tput setaf 3) | |
Set paths to GATK jar file, resource bundle and exome bait regions from line 7 to 13 of this script. | |
GATK jar file: https://software.broadinstitute.org/gatk/download/ | |
Download GATK resource bundle for your reference build from below ftp with your favourite ftp client (e.g; filezilla): | |
Host : ftp.broadinstitute.org | |
Username: gsapubftp-anonymous | |
" | |
tput sgr0 | |
} | |
#************************************************************************************************************************************* | |
function fq2bam_check() { | |
samb=`which samblaster || true` | |
if [ ! -f "$samb" ];then | |
echo "$(tput setaf 1)samblaster not found. Get it from here: https://github.com/GregoryFaust/samblaster" | |
tput sgr0 | |
exit | |
fi | |
samt=`which samtools || true` | |
if [ ! -f "$samt" ];then | |
echo "$(tput setaf 1)samtools not found. Get it from here: http://www.htslib.org/download/" | |
tput sgr0 | |
exit | |
fi | |
if [ ! -f "$GATK" ];then | |
echo "GATK $GATK file does not exists!" | |
setPaths | |
exit | |
fi | |
if [ ! -f "$millsIndels" ];then | |
setPaths | |
echo "$millsIndels file does not exists!" | |
exit | |
fi | |
if [ ! -f "$KGIndels" ];then | |
setPaths | |
echo "$(tput setaf 3)$KGIndels does not exists!" | |
tput sgr0 | |
exit | |
fi | |
if [ ! -f "$dbSNP138" ];then | |
setPaths | |
echo "$dbSNP138 does not exists!" | |
exit | |
fi | |
if [ ! -f "$REGIONS" ];then | |
echo "$(tput setaf 3)$REGIONS does not exists!" | |
tput sgr0 | |
exit | |
fi | |
} | |
#************************************************************************************************************************************* | |
function vscan_check() { | |
#Check if samtools is installed | |
samt=`which samtools || true` | |
if [ ! -f "$samt" ];then | |
echo "$(tput setaf 1)samtools not found. Try: sudo apt-get install samtools" | |
tput sgr0 | |
exit | |
fi | |
#Check if bam-readcount is installed | |
bamrc=`which bam-readcount || true` | |
if [ ! -f "$bamrc" ];then | |
echo "$(tput setaf 1)bam-readcount not found. Make sure its installed and under path. Get it from here: https://github.com/genome/bam-readcount" | |
tput sgr0 | |
exit | |
fi | |
if [ ! -f "$REFFILE" ];then | |
echo "$(tput setaf 3)$REFFILE does not exists! Set correct path to reference fasta at line 9 of this script." | |
tput sgr0 | |
exit | |
fi | |
if [ ! -f "$VARSCAN" ];then | |
echo "$(tput setaf 3)$VARSCAN does not exists! Set correct path to VarScan jar file at line 16-18 of this script." | |
tput sgr0 | |
exit | |
fi | |
if [ ! -f "$FPFILTER" ];then | |
echo "$(tput setaf 3)$FPFILTER does not exists! Get it from here https://github.com/ckandoth/variant-filter and set correct path at line 16-18 of this script." | |
tput sgr0 | |
exit | |
fi | |
if [ ! -f "$INDELPARSER" ];then | |
echo "$(tput setaf 3)$INDELPARSER does not exists! Get it from here https://github.com/PoisonAlien/varscan_accessories and set correct path at line 16-18 of this script." | |
tput sgr0 | |
exit | |
fi | |
} | |
#************************************************************************************************************************************* | |
function createPon_check() { | |
if [ ! -f "$REFFILE" ];then | |
echo "$(tput setaf 3)$REFFILE file does not exists! Set correct path to reference fasta at line 46 of this script." | |
tput sgr0 | |
exit | |
fi | |
if [ ! -f "$GATK" ];then | |
echo "$(tput setaf 3)$GATK does not exists! Set correct path to GATK4 jar file at line 47 of this script." | |
tput sgr0 | |
exit | |
fi | |
} | |
#************************************************************************************************************************************* | |
function fq2bam() { | |
if test -z "$2" | |
then | |
fq2bam_usage | |
echo "$2" | |
exit 1 | |
fi | |
fq2bam_check | |
output_dir="./bams/" | |
threads=4 | |
dt=`date --iso-8601` | |
rdgrp="" | |
while getopts ":hw:D:R:t:" OPTION | |
do | |
case "${OPTION}" in | |
h) | |
fq2bam_usage | |
exit 1 | |
;; | |
D) | |
output_dir="$OPTARG" | |
;; | |
R) | |
rdgrp="$OPTARG" | |
;; | |
t) | |
threads="$OPTARG" | |
;; | |
?) | |
unkOpt="$OPTARG" | |
fq2bam_usage | |
echo -e "Unknown option: ${unkOpt}" | |
exit | |
;; | |
esac | |
done | |
output_fn="${@:${OPTIND}:1}" | |
bwaidx="${@:$((${OPTIND}+1)):1}" | |
fq1="${@:$((${OPTIND}+2)):1}" | |
fq2="${@:$((${OPTIND}+3)):1}" | |
if [[ -z "$rdgrp" ]] | |
then | |
rdgrp="\"@RG\tID:${output_fn}\tSM:${output_fn}\tLB:${output_fn}\tPL:illumina\tPU:Unknown\tDT:${dt}\"" | |
fi | |
if [[ -z "$output_fn" ]] | |
then | |
fq2bam_usage | |
echo -e "Please provide a basename for output file. Usally sample name. \n" | |
exit 0 | |
elif [[ -z "$fq1" ]] || [[ ! -f "$fq1" ]] | |
then | |
fq2bam_usage | |
echo -e "forward fastq $fq1 does not exists!" | |
exit 0 | |
elif [[ -z "$fq2" ]] || [[ ! -f "$fq2" ]] | |
then | |
fq2bam_usage | |
echo -e "reverse fastq $fq2 does not exists!" | |
exit 0 | |
elif [[ -z "$bwaidx" ]] || [[ ! -f "$bwaidx" ]] | |
then | |
fq2bam_usage | |
echo -e "$bwaidx index does not exists" | |
exit 0 | |
fi | |
mkdir -p ${output_dir} | |
echo -e "$(tput setaf 3)$(date): Aligning and Removing duplicates.. \n" | |
algnCmd="bwa mem -M -v 1 -T 0 -R ${rdgrp} -t ${threads} ${bwaidx} ${fq1} ${fq2} 2> ${output_dir}/${output_fn}.bwa.log | samblaster -M 2> ${output_dir}/${output_fn}.samblaster.log | samtools view -@ ${threads} -bS - | samtools sort -@ ${threads} -o ${output_dir}/${output_fn}.raw.bam -" | |
eval $algnCmd | |
cat ${output_dir}/${output_fn}.samblaster.log | |
echo -e "\n$(tput setaf 3)$(date): Indexing raw BAM.. \n" | |
samtools index -@ ${threads} ${output_dir}/${output_fn}.raw.bam | |
cat ${REGIONS} | grep -v "^#" | awk '{print $1":"$2"-"$3}' > ${output_dir}/${output_fn}.exome.intervals | |
echo -e "$(tput setaf 3)$(date): Performing base quality recalibration.. \n" | |
echo -e "----------------------------------------------------------------" | |
bqsrCmd="java -d64 -Xmx4g -jar ${GATK} BaseRecalibrator --interval_padding 250 --intervals ${output_dir}/${output_fn}.exome.intervals --secondsBetweenProgressUpdates 30.0 --reference ${REFFILE} \ | |
--output ${output_dir}/${output_fn}.recal.table --knownSites ${dbSNP138} --knownSites ${KGIndels} --knownSites ${millsIndels} \ | |
--input ${output_dir}/${output_fn}.raw.bam > ${output_dir}/${output_fn}.bqsr.log" | |
eval ${bqsrCmd} | |
echo -e "----------------------------------------------------------------" | |
echo -e "\n$(tput setaf 3)$(date): Printing final BAM file.. \n" | |
echo -e "---------------------------------------------------" | |
applyCmd="java -d64 -Xmx4g -jar ${GATK} ApplyBQSR --input ${output_dir}/${output_fn}.raw.bam --output ${output_dir}/${output_fn}.bam | |
--bqsr_recal_file ${output_dir}/${output_fn}.recal.table --interval_padding 250 --intervals ${output_dir}/${output_fn}.exome.intervals > ${output_dir}/${output_fn}.ApplyBQSR.log" | |
eval ${applyCmd} | |
echo -e "---------------------------------------------------" | |
echo -e "\n$(tput setaf 3)$(date): Removing intermediate files and cleaning.. \n" | |
rm ${output_dir}/${output_fn}.raw.bam | |
rm ${output_dir}/${output_fn}.raw.bam.bai | |
rm ${output_dir}/${output_fn}.exome.intervals | |
echo -e "$(tput setaf 3)$(date): DONE \n" | |
} | |
#************************************************************************************************************************************* | |
function vScan() { | |
if test -z "$2" | |
then | |
vScan_usage | |
exit 1 | |
fi | |
#Default values for arguments | |
output_fn="" | |
output_dir="" | |
while getopts ":hw:D:o:" OPTION | |
do | |
case "${OPTION}" in | |
h) | |
vScan_usage | |
exit 1 | |
;; | |
D) | |
output_dir="$OPTARG" | |
;; | |
o) | |
output_fn="$OPTARG" | |
;; | |
?) | |
unkOpt="$OPTARG" | |
vScan_usage | |
echo -e "Unknown option: ${unkOpt}" | |
exit | |
;; | |
esac | |
done | |
vscan_check | |
#positional arguments | |
tumor="${@:${OPTIND}:1}" | |
normal="${@:$((${OPTIND}+1)):1}" | |
if [[ -z "$tumor" ]] | |
then | |
vScan_usage | |
echo -e "Missing positional arguments. \n" | |
exit 0 | |
elif [[ -z "$tumor" ]] || [[ ! -f "$tumor" ]] | |
then | |
vScan_usage | |
echo -e "$tumor BAM does not exists!" | |
exit 0 | |
elif [[ -z "$normal" ]] || [[ ! -f "$normal" ]] | |
then | |
vScan_usage | |
echo -e "$normal BAM does not exists!" | |
exit 0 | |
fi | |
if [[ -z "$output_fn" ]] | |
then | |
output_fn=`basename $tumor | sed 's/.bam$//I'` | |
fi | |
if [[ -z "$output_dir" ]] | |
then | |
output_dir="./varscan_op/$output_fn/" | |
fi | |
#------------------Run VarScan commands ----------------------------------- | |
mkdir -p $output_dir"/raw_somatic_op/" | |
echo -e "#Main command:" > $output_dir"/$output_fn.commands.log" | |
echo -e "vScan.sh $tumor $normal $output_fn $output_dir\n" >> $output_dir"/$output_fn.commands.log" | |
echo -e "$(tput setaf 3)$(date): Running Varscan Somatic.. \n" | |
tput sgr0 | |
vsCmd="samtools mpileup -B -f $REFFILE -q 10 -L 10000 -d 10000 $normal $tumor | | |
java -Xmx4g -d64 -jar $VARSCAN somatic -mpileup $output_dir"/raw_somatic_op/$output_fn" --min-coverage-normal 10 --min-coverage-tumor 14 --min-var-freq 0.02 --strand-filter 1" | |
echo -e "#Internal sub commands at each step:" >> $output_dir"/$output_fn.commands.log" | |
echo -e "#VarScan somatic command" >> $output_dir"/$output_fn.commands.log" | |
echo -e $vsCmd >> $output_dir"/$output_fn.commands.log" | |
eval $vsCmd | |
echo -e "$(tput setaf 3)\n$(date): Running Varscan processSomatic for SNPs.. \n" | |
tput sgr0 | |
vsPsSNPcmd="java -jar $VARSCAN processSomatic $output_dir"/raw_somatic_op/$output_fn.snp" --min-tumor-freq 0.03 --max-normal-freq 0.01 --p-value 0.05" | |
echo -e "\n#VarScan processSomatic command" >> $output_dir"/$output_fn.commands.log" | |
echo -e $vsPsSNPcmd >> $output_dir"/$output_fn.commands.log" | |
eval $vsPsSNPcmd | |
echo -e "$(tput setaf 3)\n$(date): Running Varscan processSomatic for INDELS.. \n" | |
tput sgr0 | |
vsPsINDELcmd="java -jar $VARSCAN processSomatic $output_dir"/raw_somatic_op/$output_fn.indel" --min-tumor-freq 0.03 --max-normal-freq 0.01 --p-value 0.01" | |
echo -e $vsPsINDELcmd >> $output_dir"/$output_fn.commands.log" | |
eval $vsPsINDELcmd | |
#------------------ Move files into clean directories.---------------------------------------------------- | |
mkdir -p $output_dir"/procesSomatic_op/SNP/" | |
mkdir -p $output_dir"/procesSomatic_op/SNP/LOH/" | |
mkdir -p $output_dir"/procesSomatic_op/SNP/Germline/" | |
mkdir -p $output_dir"/procesSomatic_op/SNP/Somatic/" | |
mv $output_dir"/raw_somatic_op/$output_fn.snp.LOH"* $output_dir"/procesSomatic_op/SNP/LOH/" | |
mv $output_dir"/raw_somatic_op/$output_fn.snp.Germline"* $output_dir"/procesSomatic_op/SNP/Germline/" | |
mv $output_dir"/raw_somatic_op/$output_fn.snp.Somatic"* $output_dir"/procesSomatic_op/SNP/Somatic/" | |
mkdir -p $output_dir"/procesSomatic_op/INDEL/" | |
mkdir -p $output_dir"/procesSomatic_op/INDEL/LOH/" | |
mkdir -p $output_dir"/procesSomatic_op/INDEL/Germline/" | |
mkdir -p $output_dir"/procesSomatic_op/INDEL/Somatic/" | |
mv $output_dir"/raw_somatic_op/$output_fn.indel.LOH"* $output_dir"/procesSomatic_op/INDEL/LOH/" | |
mv $output_dir"/raw_somatic_op/$output_fn.indel.Germline"* $output_dir"/procesSomatic_op/INDEL/Germline/" | |
mv $output_dir"/raw_somatic_op/$output_fn.indel.Somatic"* $output_dir"/procesSomatic_op/INDEL/Somatic/" | |
#------------------ Process INDELS-------------------------------------------------------------------------- | |
echo -e "$(tput setaf 3)\n$(date): Preparing INDEL file for VEP annotation.. \n" | |
tput sgr0 | |
echo -e "Chromosome\tStart_Position\tReference_Allele\tTumor_Seq_Allele2\tt_depth\tt_vaf\tTumor_Sample_Barcode\tCenter\tVerification_Status\tValidation_Status\tMutation_Status\tValidation_Method\tBAM_File\tSequencer" > $output_dir"/procesSomatic_op/INDEL/Somatic/$output_fn.indel.Somatic.hc.mafLite" | |
indelCmd="python $INDELPARSER --min-var-count 4 --min-depth 14 --min-var-frac 0.03 --min-depth-normal 10 $output_dir"/procesSomatic_op/INDEL/Somatic/$output_fn.indel.Somatic.hc"" | |
echo -e "\n#Preparing INDEL file for VEP annotation" >> $output_dir"/$output_fn.commands.log" | |
echo -e $indelCmd >> $output_dir"/$output_fn.commands.log" | |
eval $indelCmd | awk -v tsb="$output_fn" -v bam="$tumor" '{OFS="\t"; print $1,$2+1,$4,$5,$9+$10,$11/100,tsb,"CSI-NUS","Unknown","Untested","Somatic","No",bam,"Illumina HiSeq"}' >> $output_dir"/procesSomatic_op/INDEL/Somatic/$output_fn.indel.Somatic.hc.mafLite" #+1 to loci to void issues with zero based cordinates from Indels | |
#------------------ Process SNPS-------------------------------------------------------------------------- | |
echo -e "$(tput setaf 3)$(date): Filtering SNPs for false-positives.. \n" | |
tput sgr0 | |
mkdir -p $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/" | |
sed 1d $output_dir"/procesSomatic_op/SNP/Somatic/$output_fn.snp.Somatic.hc" | awk '{OFS="\t"; print $1,$2,$2,$3,$4}' > $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.var" | |
sed 1d $output_dir"/procesSomatic_op/SNP/Somatic/$output_fn.snp.Somatic.hc" | awk '{OFS="\t"; print $1,$2,$3,$4}' > $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.varFile" | |
echo -e "$(tput setaf 3)$(date): Extracting read-counts for high confident Somatic SNPs.. \n" | |
tput sgr0 | |
bamrcCmd="bam-readcount -q10 -w1 -b10 -l $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.var" -f $REFFILE $tumor > $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.readcounts"" | |
echo -e "\n#Bam-readcount command" >> $output_dir"/$output_fn.commands.log" | |
echo -e $bamrcCmd >> $output_dir"/$output_fn.commands.log" | |
eval $bamrcCmd | |
echo -e "$(tput setaf 3)\n$(date): Running fpfilter.. \n" | |
tput sgr0 | |
fpFilterCmd="perl $FPFILTER --var-file $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.varFile" --readcount-file $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.readcounts" --min-depth 14 --min-read-pos 0.1 --min-strandedness 0.01 --max-mmqs-diff 90 --min-var-count 4 --min-var-frac 0.03 --min-var-dist-3 0.1 --max-var-mmqs 100 --output-file $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.fpfilter"" | |
echo -e "\n#FpFilter command" >> $output_dir"/$output_fn.commands.log" | |
echo -e $fpFilterCmd >> $output_dir"/$output_fn.commands.log" | |
eval $fpFilterCmd | |
echo -e "Chromosome\tStart_Position\tReference_Allele\tTumor_Seq_Allele2\tt_depth\tt_vaf\tTumor_Sample_Barcode\tCenter\tVerification_Status\tValidation_Status\tMutation_Status\tValidation_Method\tBAM_File\tSequencer" > $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.fpfilter.mafLite" | |
grep 'PASS' $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.fpfilter" | awk -v tsb="$output_fn" -v bam="$tumor" '{OFS="\t" ; print $1,$2,$3,$4,$5,$7,tsb,"CSI-NUS","Unknown","Untested","Somatic","No",bam,"Illumina HiSeq"}' >> $output_dir"/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.fpfilter.mafLite" | |
echo -e "\n#Only PASS variants from fpFilter output are used as final results for SNPs" >> $output_dir"/$output_fn.commands.log" | |
#mkdir -p $output_dir"final_vars" | |
echo -e "Chromosome\tStart_Position\tReference_Allele\tTumor_Seq_Allele2\tt_depth\tt_vaf\tTumor_Sample_Barcode\tCenter\tVerification_Status\tValidation_Status\tMutation_Status\tValidation_Method\tBAM_File\tSequencer" > $output_dir"/$output_fn.somatic.mafLite" | |
#------------------ Write final Somatic MAF-------------------------------------------------------------------------- | |
cat <(sed 1d "$output_dir/procesSomatic_op/SNP/Somatic/fpFilter/$output_fn.fpfilter.mafLite") <(sed 1d "$output_dir/procesSomatic_op/INDEL/Somatic/$output_fn.indel.Somatic.hc.mafLite") | sed 's/^chr//' >> $output_dir"/$output_fn.somatic.mafLite" | |
vars=`sed 1d $output_dir"/$output_fn.somatic.mafLite" | wc -l` | |
echo -e "$(tput setaf 3)\n$(date): Number of high confidence, false positive filtered variants (INDELS are tricky): $vars [$output_dir"/$output_fn.somatic.mafLite"]" | |
echo -e "$(tput setaf 3)\n$(date): Done!\n" | |
tput sgr0 | |
} | |
#************************************************************************************************************************************* | |
function makePon() { | |
if test -z "$1" | |
then | |
makePon_usage | |
exit 1 | |
fi | |
output_fn="" | |
output_dir="" | |
while getopts ":hw:D:R:o:" OPTION | |
do | |
case "${OPTION}" in | |
h) | |
makePon_usage | |
exit 1 | |
;; | |
D) | |
output_dir="$OPTARG" | |
;; | |
o) | |
output_fn="$OPTARG" | |
;; | |
R) | |
REGIONS="$OPTARG" | |
;; | |
?) | |
unkOpt="$OPTARG" | |
makePon_usage | |
echo -e "Unknown option: ${unkOpt}" | |
exit | |
;; | |
esac | |
done | |
#Default values for arguments | |
if [[ -z "$output_fn" ]] | |
then | |
output_fn="PoN" | |
fi | |
if [[ -z "$output_dir" ]] | |
then | |
output_dir="./cnvScan/$output_fn/" | |
fi | |
createPon_check | |
if [ ! -f "$REGIONS" ];then | |
echo "$(tput setaf 3)Exome baits $REGIONS does not exists! Set correct path at line 8 of this script or provide one with argument -R." | |
tput sgr0 | |
exit | |
fi | |
if [ ! -f "$REFDICT" ];then | |
echo "$(tput setaf 3)Referece dictionary $REFDICT does not exists! Set correct path to Reference dict file at line 21 of this script. Cerating one for now.." | |
tput sgr0 | |
$REFDICT=$REFFILE.dict | |
java -jar $GATK CreateSequenceDictionary -R $REFFILE --output $REFDICT --QUIET true | |
fi | |
#positional arguments | |
normal="${*: -1:1}" | |
#echo -e "$output_fn $output_dir $REGIONS $REFDICT $normal" | |
#exit | |
mkdir -p $output_dir | |
#Make a tragets file from bait BED file | |
echo -e "contig\tstart\tstop\tname" > $output_dir/$output_fn".targets" | |
grep -v "^#" $REGIONS | sort -V -k1,2 | cut -f 1-3 | awk '{OFS="\t"; print $0,"target_"NR}' >> $output_dir/$output_fn".targets" | |
echo -e "$(tput setaf 3)\n$(date): Collecting target coverage for all normal samples.. \n" | |
touch $output_dir/$output_fn.colCov.temp.txt | |
cat $normal | grep -v "^#" | while read line | |
do | |
if [ ! -f "$line" ] | |
then | |
echo "$line does not exists! Skipping.." | |
else | |
fbn=`basename $line | sed 's/.bam//I'` | |
echo -e "java -Xmx4g -d64 -jar ${GATK} CalculateTargetCoverage --input $line --reference $REFFILE --targets $output_dir/$output_fn.targets --groupBy SAMPLE --transform PCOV --targetInformationColumns FULL --interval_set_rule UNION --interval_padding 250 --secondsBetweenProgressUpdates 30.0 --disableToolDefaultReadFilters false --disableSequenceDictionaryValidation true --disableReadFilter NotDuplicateReadFilter --output $output_dir/$fbn.coverage.tsv" >> $output_dir/$output_fn.colCov.temp.txt | |
fi | |
done | |
tput sgr0 | |
cat $output_dir/$output_fn.colCov.temp.txt | parallel -j 20 --progress > $output_dir/$output_fn.colCov.log | |
rm $output_dir/$output_fn.colCov.temp.txt | |
ls $output_dir | grep .coverage.tsv$ | awk -v var="$output_dir/" '{print var$0}' > $output_dir/$output_fn.coverageFileList.tsv | |
echo -e "$(tput setaf 3)\n$(date): Merging per-sample coverage info into single file.. \n" | |
java -d64 -Xmx4g -jar $GATK CombineReadCounts \ | |
--inputList $output_dir/$output_fn.coverageFileList.tsv \ | |
--maxOpenFiles 100 \ | |
--output $output_dir/$output_fn.tsv > $output_dir/$output_fn.CombineReadCounts.log | |
echo -e "$(tput setaf 3)\n$(date): Creating targets with GC annotations.. \n" | |
java -Xmx4g -jar $GATK AnnotateTargets \ | |
--targets $output_dir/$output_fn".targets" \ | |
--reference $REFFILE \ | |
--output $output_dir/$output_fn".annotated.tsv" > $output_dir/$output_fn.AnnotateTargets.log | |
echo -e "$(tput setaf 3)\n$(date): Correcting coverage profile for sample-specific GC bias.. \n" | |
java -Xmx4g -jar $GATK CorrectGCBias \ | |
--input $output_dir/$output_fn.tsv \ | |
--targets $output_dir/$output_fn".annotated.tsv" \ | |
--output $output_dir/$output_fn.gc_corrected.tsv > $output_dir/$output_fn.CorrectGCBias.log | |
echo -e "$(tput setaf 3)\n$(date): Create panel of normals from combined, GC-corrected coverage profiles.. \n" | |
java -Xmx4g -jar $GATK CreatePanelOfNormals \ | |
--input $output_dir/$output_fn.gc_corrected.tsv \ | |
--extremeColumnMedianCountPercentileThreshold 2.5 \ | |
--truncatePercentileThreshold 0.1 \ | |
--noQC false \ | |
--output $output_dir/$output_fn.pon > $output_dir/$output_fn.CreatePanelOfNormals.log | |
echo -e "$(tput setaf 3)\n$(date): DONE!" | |
tput sgr0 | |
} | |
#************************************************************************************************************************************* | |
function cnvScan() { | |
if test -z "$1" | |
then | |
cnvScan_usage | |
exit 1 | |
fi | |
output_fn="" | |
output_dir="" | |
while getopts ":hw:D:o:" OPTION | |
do | |
case "${OPTION}" in | |
h) | |
cnvScan_usage | |
exit 1 | |
;; | |
D) | |
output_dir="$OPTARG" | |
;; | |
o) | |
output_fn="$OPTARG" | |
;; | |
?) | |
unkOpt="$OPTARG" | |
makePon_usage | |
echo -e "Unknown option: ${unkOpt}" | |
exit | |
;; | |
esac | |
done | |
#positional arguments | |
pon="${@:${OPTIND}:1}" | |
tumor="${@:$((${OPTIND}+1)):1}" | |
if [[ -z "$pon" ]] || [[ ! -f "$pon" ]] | |
then | |
cnvScan_usage | |
echo -e "Panel of Normal file $pon does not exists" | |
exit | |
fi | |
if [[ -z "$tumor" ]] || [[ ! -f "$tumor" ]] | |
then | |
cnvScan_usage | |
echo -e "Tumor $tumor does not exists" | |
exit | |
fi | |
#Default values for arguments | |
if [[ -z "$output_fn" ]] | |
then | |
output_fn=`basename $tumor | sed 's/.bam//I'` | |
fi | |
if [[ -z "$output_dir" ]] | |
then | |
output_dir="./cnvScan/$output_fn" | |
fi | |
if [ ! -f "$REFDICT" ];then | |
echo "$(tput setaf 3)Referece dictionary $REFDICT does not exists! Set correct path to Reference dict file at line 21 of this script. Cerating one for now.." | |
tput sgr0 | |
$REFDICT=$REFFILE.dict | |
java -jar $GATK CreateSequenceDictionary -R $REFFILE --output $REFDICT --QUIET true | |
fi | |
mkdir -p $output_dir | |
mkdir -p $output_dir/$output_fn"_log" | |
#Make a tragets file from bait BED file | |
echo -e "contig\tstart\tstop\tname" > $output_dir/$output_fn".targets" | |
grep -v "^#" $REGIONS | sort -V -k1,2 | cut -f 1-3 | awk '{OFS="\t"; print $0,"target_"NR}' >> $output_dir/$output_fn".targets" | |
#make picard interval list for heterozygous SNPs from normal | |
#echo "Creating Interval List for heterozygous SNPs" | |
#cat $snp | sed 1d | sed 's/%//g' | grep -v LOH | awk '{if($7 > 10 && $7 < 75) print $0}' | awk '{if($14 < 1E-2) print $0}' | awk '{OFS="\t" ; if($5+$6 > 25) print $1,$2,$2,$13":"$1":"$2}' > $output_dir/$output_fn".common_sites.bed" | |
#java -Xmx4G -d64 -jar $GATK BedToIntervalList --input $output_dir/$output_fn".common_sites.bed" --output $output_dir/$output_fn".common_sites.IntervalList" --SEQUENCE_DICTIONARY $REFDICT --QUIET true | |
echo -e "$(tput setaf 3)\n$(date): Padding 250bp around exome bait regions.. \n" | |
java -d64 -Xmx1g -jar $GATK PadTargets \ | |
--targets $output_dir/$output_fn".targets" \ | |
--padding 250 \ | |
--output $output_dir/$output_fn".targets.padded.tsv" > $output_dir/$output_fn"_log"/PadTargets.log | |
echo -e "$(tput setaf 3)\n$(date): Collecting target coverage for $tumor.. \n" | |
java -d64 -Xmx4g -jar $GATK CalculateTargetCoverage \ | |
--input $tumor \ | |
--reference $REFFILE \ | |
--targets $output_dir/$output_fn".targets.padded.tsv" \ | |
--groupBy SAMPLE \ | |
--transform PCOV \ | |
--targetInformationColumns FULL \ | |
--interval_set_rule UNION \ | |
--interval_padding 0 \ | |
--secondsBetweenProgressUpdates 10.0 \ | |
--disableToolDefaultReadFilters false \ | |
--disableSequenceDictionaryValidation true \ | |
--disableReadFilter NotDuplicateReadFilter \ | |
--output $output_dir/$output_fn.tumor.coverage.tsv > $output_dir/$output_fn"_log"/CalculateTargetCoverage.log | |
echo -e "$(tput setaf 3)\n$(date): Annotating targets.. \n" | |
java -Xmx4g -jar $GATK AnnotateTargets \ | |
--targets $output_dir/$output_fn".targets.padded.tsv" \ | |
--reference $REFFILE \ | |
--output $output_dir/$output_fn.annotated.tsv > $output_dir/$output_fn"_log"/AnnotateTargets.log | |
echo -e "$(tput setaf 3)\n$(date): Correcting for GCBias.. \n" | |
java -Xmx4g -jar $GATK CorrectGCBias \ | |
--input $output_dir/$output_fn.tumor.coverage.tsv \ | |
--targets $output_dir/$output_fn.annotated.tsv \ | |
--output $output_dir/$output_fn.gc_corrected.tsv > $output_dir/$output_fn"_log"/CorrectGCBias.log | |
echo -e "$(tput setaf 3)\n$(date): Normalizing Readcounts.. \n" | |
java -Xmx4g -jar $GATK NormalizeSomaticReadCounts \ | |
--input $output_dir/$output_fn.gc_corrected.tsv \ | |
--targets $output_dir/$output_fn".targets.padded.tsv" \ | |
--panelOfNormals $pon \ | |
--tangentNormalized $output_dir/$output_fn.tn.tsv \ | |
--factorNormalizedOutput $output_dir/$output_fn.fnt.tsv \ | |
--preTangentNormalized $output_dir/$output_fn.preTN.tsv \ | |
--betaHatsOutput $output_dir/$output_fn.betaHats.tsv > $output_dir/$output_fn"_log"/NormalizeSomaticReadCounts.log | |
echo -e "$(tput setaf 3)\n$(date): Performing Segmentation.. \n" | |
java -Xmx4g -jar $GATK PerformSegmentation \ | |
--tangentNormalized $output_dir/$output_fn.tn.tsv \ | |
--log2Input true \ | |
--alpha 0.01 \ | |
--eta 0.05 \ | |
--kmax 25 \ | |
--minWidth 2 \ | |
--nmin 200 \ | |
--nperm 10000 \ | |
--pmethod HYBRID \ | |
--trim 0.025 \ | |
--undoPrune 0.05 \ | |
--undoSD 2 \ | |
--undoSplits NONE \ | |
--output $output_dir/$output_fn.seg > $output_dir/$output_fn"_log"/PerformSegmentation.log | |
echo -e "$(tput setaf 3)\n$(date): Annotating Segments.. \n" | |
java -Xmx4g -jar $GATK CallSegments \ | |
--tangentNormalized $output_dir/$output_fn.tn.tsv \ | |
--segments $output_dir/$output_fn.seg \ | |
--legacy false \ | |
--output $output_dir/$output_fn.seg.called > $output_dir/$output_fn"_log"/CallSegments.log | |
echo -e "$(tput setaf 3)\n$(date): Plotting Segments.. \n" | |
java -Xmx4g -jar $GATK PlotSegmentedCopyRatio \ | |
--tangentNormalized $output_dir/$output_fn.tn.tsv \ | |
--preTangentNormalized $output_dir/$output_fn.preTN.tsv \ | |
--segments $output_dir/$output_fn.seg.called \ | |
-SD $REFDICT \ | |
--output $output_dir \ | |
--outputPrefix $output_fn > $output_dir/$output_fn"_log"/PlotSegmentedCopyRatio.log | |
echo -e "$(tput setaf 3)\n$(date): DONE! \n" | |
tput sgr0 | |
} | |
#************************************************************************************************************************************* | |
case "$1" in | |
'fq2bam') | |
fq2bam "${@:2}" | |
;; | |
'vScan') | |
vScan "${@:2}" | |
;; | |
'somatic') | |
somatic "${@:2}" | |
;; | |
'makePon') | |
makePon "${@:2}" | |
;; | |
'cnvScan') | |
cnvScan "${@:2}" | |
;; | |
*) | |
usage | |
echo -e "Error: command \"$1\" not recognized\n" | |
exit 1 | |
esac |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment