edawson · May 27, 2022 06:10
diff --git a/v3.7_accelerated_annotation_filtering_and_qc.sh b/v3.7_accelerated_annotation_filtering_and_qc.sh
 #!/bin/bash

 ########################
 ## In this gist, we'll reuse the commands from our 3.6 tutorial to align reads and generate BAM files.
 ## Check out the full post at https://medium.com/@johnnyisraeli/accelerating-germline-and-somatic-genomic-analysis-of-whole-genomes-and-exomes-with-nvidia-clara-e3deeae2acc9
 and Gists at:
 ## https://gist.github.com/edawson/e84b2785db75d3c0aea9cc6a59969d45#file-full_pipeline_and_data_prep_parabricks3-6-sh
 ## and 
 ## https://gist.github.com/edawson/e84b2785db75d3c0aea9cc6a59969d45#file-step_1_align_reads_parabricks3-6-sh
 ###########

 ###########
 ## We'll run this tutorial on a GCP VM with 64 vCPUs, 240GB of RAM, and 8x NVIDIA V100 GPUs
 ## To save costs, you can also run this on a GCP VM with 32 vCPUS, 120GB of RAM, and 4x V100 GPUs
 ###########


 ## After aligning our reads, we'll rerun the variant calling stages of our past gist
 ## since we've updated the haplotypecaller and DeepVariant tools. We'll
 ## also run Strelka2 as an additional variant caller.
 ##
 ## After that, we'll merge our VCFs to generate a union callset and an intersection VCF
 ## with variants called by all three variant callers, annotate our new intersection VCF,
 ## and remove variants that fail certain criteria for population frequency.
 ## Finally, we'll run our vcfqc and vcfqcbybam tools to generate simple quality control reports.
 #############

  
 ################
 ## HaplotypeCaller
 ## This step should take roughly 15 minutes on our 8xV100 VM.
 ################
 time pbrun haplotypecaller \
 --ref ~/refs/Homo_sapiens_assembly38.fasta \
 --in-bam   HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
 --in-recal-file HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.BQSR-REPORT.txt \
 --out-variants HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf
  
 ################
 ## DeepVariant
 ## This step should take approximately 20 minutes on an 8xV100 VM
 ################
 time pbrun deepvariant \
 --ref Homo_sapiens_assembly38.fasta  \
 --in-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam  \
 --out-variants HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf
   
 ###############
 ## Strelka
 ## This step should take ~10 minutes on a 64-core VM.
 ###############
 time pbrun strelka \
 --ref Homo_sapiens_assembly38.fasta \
 --in-bams HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
 --out-prefix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.strelka \
 --num-threads 64
 
 
 ## Copy strelka results to current directory.
 cp HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.strelka.strelka_work/results/variants/variants.vcf.gz* .
   
 ## BGZIP and tabix-index the deepvariant VCFs
 bgzip -@16 HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf
 tabix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf.gz
   
 ## BGZIP and tabix index the haplotypecaller VCFs
 bgzip -@16 HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf
 tabix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf.gz
  
  
 ## Run the votebasedvcfmerger tool to generate a union and intersection VCF.
 time pbrun votebasedvcfmerger \
 --in-vcf strelka:variants.vcf.gz \
 --in-vcf deepvariant:HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf.gz \
 --in-vcf haplotypecaller:HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf.gz \
 --min-votes 3
 --out-dir HG002.realign.vbvm
 
 ## The HG002.realign.vbvm directory should now contain a
 ## unionVCF.vcf file with the union callset of HaplotypeCaller, Strelka, and DeepVariant
 ## and aa filteredVCF.vcf file with only calls produced by all three callers.
   
 ## Annotate the intersection VCF with gnomAD, ClinVar, 1000 Genomes
 ## Download our annotation VCFs and tabix indices
 wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz
 wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi
   
 wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
 wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi
   
 wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz
 wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz.tbi

 ## Download an Ensembl GTF to annotate the VCF file with gene names
 wget http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz
 ## Unzip the GTF file and add the "chr" prefix to the chromosome names (Ensembl excludes this prefix by default.
 gunzip Homo_sapiens.GRCh38.105.gtf.gz
 awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' Homo_sapiens.GRCh38.105.gtf > Homo_sapiens.GRCh38.105.chr.gtf

 time pbrun snpswift \
 --input-vcf HG002.realign.vbvm/filteredVCF.vcf \
 --anno-vcf 1000Genomes:ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz \
 --anno-vcf gnomad_v2.1.1:gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz \
 --anno-vcf ClinVar:clinvar.vcf.gz \
 --ensembl Homo_sapiens.GRCh38.105.chr.gtf  \
 --output-vcf HG002.realign.3callers.annotated.vcf

 ##################
 ## frequencyfiltration
 ## Next we'll filter our VCF to remove variants with 1000Genomes allele frequency > 0.05
 ## and gnomAD AF < 0.05
 ##################
 time pbrun frequencyfiltration \
 --in-vcf HG002.realign.3callers.annotated.vcf \
 --and-expression "1000Genomes_AF < 0.05" \
 --and-expression "gnomad_v2.1.1_AF < 0.05" \
 --out-vcf HG002.realign.3callers.annotated.filtered.vcf

 ##################
 ## Finally, we'll run our automated vcfqc tool to generate some
 ## basic QC stats. The vcfqcbybam tool could also be run 
 ## to produce QC stats using an auxilliary BAM file (e.g., when variant calls don't have the desired fields).
 ##################
 time pbrun vcfqc --in-vcf HG002.realign.3callers.annotated.filtered.vcf \
 --output-dir HG002.realign.3callers.annotated.filtered.qc \
 --depth haplotypecaller_DP --allele-depth deepvariant_AD
	#!/bin/bash

	########################
	## In this gist, we'll reuse the commands from our 3.6 tutorial to align reads and generate BAM files.
	## Check out the full post at https://medium.com/@johnnyisraeli/accelerating-germline-and-somatic-genomic-analysis-of-whole-genomes-and-exomes-with-nvidia-clara-e3deeae2acc9
	and Gists at:
	## https://gist.github.com/edawson/e84b2785db75d3c0aea9cc6a59969d45#file-full_pipeline_and_data_prep_parabricks3-6-sh
	## and
	## https://gist.github.com/edawson/e84b2785db75d3c0aea9cc6a59969d45#file-step_1_align_reads_parabricks3-6-sh
	###########

	###########
	## We'll run this tutorial on a GCP VM with 64 vCPUs, 240GB of RAM, and 8x NVIDIA V100 GPUs
	## To save costs, you can also run this on a GCP VM with 32 vCPUS, 120GB of RAM, and 4x V100 GPUs
	###########


	## After aligning our reads, we'll rerun the variant calling stages of our past gist
	## since we've updated the haplotypecaller and DeepVariant tools. We'll
	## also run Strelka2 as an additional variant caller.
	##
	## After that, we'll merge our VCFs to generate a union callset and an intersection VCF
	## with variants called by all three variant callers, annotate our new intersection VCF,
	## and remove variants that fail certain criteria for population frequency.
	## Finally, we'll run our vcfqc and vcfqcbybam tools to generate simple quality control reports.
	#############


	################
	## HaplotypeCaller
	## This step should take roughly 15 minutes on our 8xV100 VM.
	################
	time pbrun haplotypecaller \
	--ref ~/refs/Homo_sapiens_assembly38.fasta \
	--in-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
	--in-recal-file HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.BQSR-REPORT.txt \
	--out-variants HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf

	################
	## DeepVariant
	## This step should take approximately 20 minutes on an 8xV100 VM
	################
	time pbrun deepvariant \
	--ref Homo_sapiens_assembly38.fasta \
	--in-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
	--out-variants HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf

	###############
	## Strelka
	## This step should take ~10 minutes on a 64-core VM.
	###############
	time pbrun strelka \
	--ref Homo_sapiens_assembly38.fasta \
	--in-bams HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
	--out-prefix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.strelka \
	--num-threads 64


	## Copy strelka results to current directory.
	cp HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.strelka.strelka_work/results/variants/variants.vcf.gz* .

	## BGZIP and tabix-index the deepvariant VCFs
	bgzip -@16 HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf
	tabix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf.gz

	## BGZIP and tabix index the haplotypecaller VCFs
	bgzip -@16 HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf
	tabix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf.gz


	## Run the votebasedvcfmerger tool to generate a union and intersection VCF.
	time pbrun votebasedvcfmerger \
	--in-vcf strelka:variants.vcf.gz \
	--in-vcf deepvariant:HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf.gz \
	--in-vcf haplotypecaller:HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf.gz \
	--min-votes 3
	--out-dir HG002.realign.vbvm

	## The HG002.realign.vbvm directory should now contain a
	## unionVCF.vcf file with the union callset of HaplotypeCaller, Strelka, and DeepVariant
	## and aa filteredVCF.vcf file with only calls produced by all three callers.

	## Annotate the intersection VCF with gnomAD, ClinVar, 1000 Genomes
	## Download our annotation VCFs and tabix indices
	wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz
	wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi

	wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
	wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi

	wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz
	wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz.tbi

	## Download an Ensembl GTF to annotate the VCF file with gene names
	wget http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz
	## Unzip the GTF file and add the "chr" prefix to the chromosome names (Ensembl excludes this prefix by default.
	gunzip Homo_sapiens.GRCh38.105.gtf.gz
	awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' Homo_sapiens.GRCh38.105.gtf > Homo_sapiens.GRCh38.105.chr.gtf

	time pbrun snpswift \
	--input-vcf HG002.realign.vbvm/filteredVCF.vcf \
	--anno-vcf 1000Genomes:ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz \
	--anno-vcf gnomad_v2.1.1:gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz \
	--anno-vcf ClinVar:clinvar.vcf.gz \
	--ensembl Homo_sapiens.GRCh38.105.chr.gtf \
	--output-vcf HG002.realign.3callers.annotated.vcf

	##################
	## frequencyfiltration
	## Next we'll filter our VCF to remove variants with 1000Genomes allele frequency > 0.05
	## and gnomAD AF < 0.05
	##################
	time pbrun frequencyfiltration \
	--in-vcf HG002.realign.3callers.annotated.vcf \
	--and-expression "1000Genomes_AF < 0.05" \
	--and-expression "gnomad_v2.1.1_AF < 0.05" \
	--out-vcf HG002.realign.3callers.annotated.filtered.vcf

	##################
	## Finally, we'll run our automated vcfqc tool to generate some
	## basic QC stats. The vcfqcbybam tool could also be run
	## to produce QC stats using an auxilliary BAM file (e.g., when variant calls don't have the desired fields).
	##################
	time pbrun vcfqc --in-vcf HG002.realign.3callers.annotated.filtered.vcf \
	--output-dir HG002.realign.3callers.annotated.filtered.qc \
	--depth haplotypecaller_DP --allele-depth deepvariant_AD