# GOAL: Prototype a bioinformatics pipeline to calculate Polygenic Risk Scores (PRS) using WGS gVCFs.
# Let's use pgscalc (https://github.com/pgscatalog/pgsc_calc), a Nextflow pipeline to calculate PRS, given variant
# weights and a multi-sample VCF. Variant allele weights can be specified as PGS Catalog IDs (--pgs_id) and/or as custom
# scoring files (--scorefile). pgscalc can also perform liftover if the scoring files use a different reference genome
# build than the input VCFs (--liftover --target_build).
# Steps below were tested on Ubuntu 24.04, but should work on any Linux server with bash.
# ----- #
# TOOLS #
# ----- #
# Download and install conda to help us manage our tools and dependencies.
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
bash /tmp/miniconda.sh -bup ~/miniconda3 && rm /tmp/miniconda.sh
# Modify ~/.bashrc to load conda on login, and then exit.
cat >> ~/.bashrc << 'EOF'
# Add conda to PATH if found
if [ -f "$HOME/miniconda3/etc/profile.d/conda.sh" ]; then
    . "$HOME/miniconda3/etc/profile.d/conda.sh"
fi
EOF
exit
# Log back in, accept conda TOS, update it to the latest version, and configure it to use libmamba, a faster dependency solver.
conda tos accept --channel defaults
conda update -y conda
conda config --set solver libmamba
# Use conda to install the tools we need and their dependencies.
conda create -y -n prs -c bioconda -c conda-forge -c defaults bcftools==1.20 samtools==1.20 htslib==1.20 parallel==20250622 nextflow==25.04.6
# bs (BaseSpace CLI) cannot be installed using conda, so let's just do it manually.
curl -L https://launch.basespace.illumina.com/CLI/latest/amd64-linux/bs -o $HOME/miniconda3/envs/prs/bin/bs
chmod +x $HOME/miniconda3/envs/prs/bin/bs
# Every time we resume work on this project, we'll need to activate this conda env.
conda activate prs
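# Optionally, sanity-check that the tools are on PATH (the version flags below are the usual ones for each tool):
bcftools --version | head -n1
nextflow -version
bs --version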
# ------ #
# ASSETS #
# ------ #
# Download the recommended ancestry reference database (HGDP + 1000 Genomes in hg38) for use with pgscalc.
mkdir assets
wget -P assets https://ftp.ebi.ac.uk/pub/databases/spot/pgs/resources/pgsc_HGDP+1kGP_v1.tar.zst
# Make a VCF of all autosomal ancestry alleles. Add a fake sample "null_sample" for use with "bcftools merge" later.
echo -e "##fileformat=VCFv4.5\n##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tnull_sample" | bgzip > assets/HGDP+1kGP_ALL.hg38.vcf.gz
tar --zstd -xOf assets/pgsc_HGDP+1kGP_v1.tar.zst GRCh38_HGDP+1kGP_ALL.pvar.zst | zstd -d | awk 'OFS="\t" {if ($1 ~ /^[0-9]/) print "chr"$1, $2, ".", $4, $5, ".", ".", ".", "GT", "0/0"}' | bgzip >> assets/HGDP+1kGP_ALL.hg38.vcf.gz
tabix -p vcf assets/HGDP+1kGP_ALL.hg38.vcf.gz
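# Optionally, use the index to confirm the number of variants in this VCF; it should match the count mentioned below:
bcftools index --nrecords assets/HGDP+1kGP_ALL.hg38.vcf.gz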
# There are 81144432 variants in the resulting VCF. By default, pgscalc requires at least 75% of these to be found in
# our WGS gVCFs. For folks with microarray or exome-seq variants, this is accomplished by imputation. Or they can
# reduce the "--min_overlap 0.75" parameter, though the pgscalc authors recommend against it:
# https://pgsc-calc.readthedocs.io/en/latest/explanation/match.html#adjusting-min-overlap-is-a-bad-idea
# Let's choose these 11 diseases to calculate PRS for, using weights available from PGS catalog.
# PMID: 29273806, Asthma, PGS Catalog ID: PGS002727
# PMID: 30655379, Type I Diabetes, PGS Catalog ID: PGS000024
# PMID: 34594039, Type II Diabetes, PGS Catalog ID: PGS002308
# PMID: 39323095, Atrial Fibrillation, PGS Catalog ID: PGS005072
# PMID: 31152163, Chronic Kidney Disease, PGS Catalog ID: PGS002237
# PMID: 30554720, Breast Cancer, PGS Catalog ID: PGS000004
# PMID: 35915156, Coronary Heart Disease, PGS Catalog ID: PGS003446
# PMID: 34887591, Hypercholesterolemia, PGS Catalog ID: PGS000889
# PMID: 33398198, Prostate Cancer, PGS Catalog ID: PGS000662
# PMID: 38508198, BMI (Body Mass Index), PGS Catalog ID: PGS004736
# PMID: 38872215, Colorectal cancer, PGS Catalog ID: PGS004912
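# For convenience (optional; PGS_IDS is just a shell variable we name here), these IDs can be collected once and passed
# as "--pgs_id $PGS_IDS" in the nextflow command further below:
PGS_IDS=PGS002727,PGS000024,PGS002308,PGS005072,PGS002237,PGS000004,PGS003446,PGS000889,PGS000662,PGS004736,PGS004912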
# Download the hg38 FASTA compatible with our hg38 data.
wget -P assets https://ilmn-dragen-giab-samples.s3.amazonaws.com/FASTA/hg38.fa
bgzip --threads 2 assets/hg38.fa && samtools faidx assets/hg38.fa.gz
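# Optionally, confirm that the FASTA contig names are chr-prefixed, matching the "chr" prefix added to the ancestry VCF above:
cut -f1 assets/hg38.fa.gz.fai | head -n5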
# ----- #
# INPUT #
# ----- #
# For testing, we'll download some publicly available gVCFs from Illumina, but you can try your own WGS gVCFs. If you
# are starting with FASTQs, then I strongly recommend Illumina's Dragen PAYG images on Azure or AWS. Here are my notes
# on using Azure NP10 VMs at a cost of less than $20 per sample, from WGS FASTQs to genome-wide gVCFs:
# https://gist.github.com/ckandoth/f6a52ae704a170f9b67e80e0aa5d4b23
# Visit basespace.illumina.com and login. Create a free account if needed. Then visit basespace.illumina.com/s/htWXpgEKrRu6
# and click "Accept" to add the demo WGS data to your BaseSpace account. Use "bs auth" to login before using command below.
# Download WGS gVCFs generated by Dragen 4.3.13 on the GIAB Ashkenazi Jewish Trio (HG002, HG003, HG004):
mkdir data
bs list dataset --terse --project-name "TruSeq-PCRfree-HG001-HG007-10B-2-v13" --is-type "illumina.dragen.complete.v0.4.3" --filter-term "TSPF-HG002-10B-2-v13-Rep1|TSPF-HG003-10B-2-v13-Rep1|TSPF-HG004-10B-2-v13-Rep1" |\
xargs -L1 bs contents dataset --terse --extension gvcf.gz,gvcf.gz.tbi --id |\
xargs -L1 bs download file --no-metadata -o data --id
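# Optionally, confirm that all 6 expected files (3 gVCFs and their .tbi indexes) were downloaded:
ls -lh data/*.gvcf.gz*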
# -------- #
# ANALYSIS #
# -------- #
# Convert the gVCFs into VCFs with only FMT/GT, subset to Ancestry sites, trim ALT alleles, realign indels, fix ref alleles, and remove duplicates.
find data -name "*.hard-filtered.gvcf.gz" |\
parallel -j1 --dry-run "bcftools convert --no-version --threads 1 --gvcf2vcf --fasta-ref assets/hg38.fa.gz {} |\
bcftools annotate --no-version --threads 1 --remove QUAL,INFO,^FORMAT/GT |\
bcftools view --no-version --threads 1 --targets-file assets/HGDP+1kGP_ALL.hg38.vcf.gz --targets-overlap 2 --trim-alt-alleles --no-update |\
bcftools norm --no-version --threads 1 --rm-dup all --check-ref s --fasta-ref assets/hg38.fa.gz |\
bcftools sort --output-type z --write-index=tbi --output {= s/.hard-filtered.gvcf.gz$/.pgsc.vcf.gz/ =}"
# Note that "--dry-run" above makes parallel print the commands without running them; review them, then remove
# "--dry-run" to execute. Dragen gVCFs might use IUPAC codes for some REF alleles (e.g. Y for C/T at chr13:100972571),
# whereas our ancestry VCF uses REF allele N. This breaks "bcftools merge" below, which is why we include
# "bcftools norm --check-ref s" in the unix pipes above. We also need "--rm-dup all" to remove duplicate entries that
# can occur when using Dragen's targeted caller for paralogous regions or its force-genotyping feature. The pipeline
# above takes around 1 hour per gVCF. If you have more CPU cores, increase the "--threads" parameters and/or
# parallel's "-j" parameter.
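# Once the conversions finish, optionally spot-check each per-sample VCF; record counts should be similar across samples:
for vcf in data/*.pgsc.vcf.gz; do echo -n "$vcf: "; bcftools index --nrecords "$vcf"; done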
# Merge per-sample VCFs into a multi-sample VCF with only GT fields. Use the null_sample to set GT=./. for missing variants.
mkdir msvcfs
find data -name "*.pgsc.vcf.gz" |\
xargs bcftools merge --no-version --threads 1 --filter-logic x --merge none assets/HGDP+1kGP_ALL.hg38.vcf.gz |\
bcftools norm --no-version --threads 1 --rm-dup all |\
bcftools view --no-version --threads 1 --samples ^null_sample --no-update --output-type z --write-index=tbi --output msvcfs/ajtrio.pgsc.hg38.vcf.gz
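# Optionally, verify the merge: null_sample should be gone from the sample list, and the record count should match the one below:
bcftools query --list-samples msvcfs/ajtrio.pgsc.hg38.vcf.gz
bcftools index --nrecords msvcfs/ajtrio.pgsc.hg38.vcf.gz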
# 81269802 entries are in the resulting multi-sample VCF, which is more than the 81144432 Ancestry variants. Some sites
# with different alleles are saved as separate entries. But we expect pgscatalog-match to match up variants properly:
# https://pygscatalog.readthedocs.io/en/latest/how-to/guides/match.html
# Make a CSV samplesheet compatible with pgscalc. Note that path_prefix must be the absolute path on your machine:
echo -e "sampleset,path_prefix,chrom,format\najtrio,/mnt/drop/prs/msvcfs/ajtrio.pgsc.hg38,,vcf" > msvcfs/ajtrio.pgsc.csv
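# If your working directory differs from /mnt/drop/prs, regenerate the CSV with your own absolute path instead
# (assumes GNU coreutils realpath, standard on Ubuntu):
echo -e "sampleset,path_prefix,chrom,format\najtrio,$(realpath msvcfs)/ajtrio.pgsc.hg38,,vcf" > msvcfs/ajtrio.pgsc.csv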
# Create a nextflow config to optimize pgscalc for your machine (pgsc-calc.readthedocs.io/en/latest/how-to/bigjob.html)
# The config below allows two "process_low" jobs to run in parallel on an Azure FX2ms_v2 VM (2 cores, 42 GB RAM), or one
# "process_medium" job at a time. The MATCH_VARIANTS step (process_medium) needs more RAM to handle more data: it needed
# 34 GB RAM for this trio VCF with 81M variants, and 102 GB RAM for a 336-sample VCF with 84M variants. If you don't
# have that much RAM to spare, you must split the VCF by chromosome following the pgscalc docs.
cat > msvcfs/ajtrio.pgsc.config << 'EOF'
process {
    executor = 'local'
    withLabel:process_low {
        cpus = 1
        memory = 20.GB
    }
    withLabel:process_medium {
        cpus = 2
        memory = 40.GB
    }
}
EOF
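# To pick cpus/memory values for your own machine, first check what's available (standard Linux tools):
nproc
free -h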
# Run pgscalc on this VCF using the PGS Catalog IDs for diseases shortlisted earlier. Set CPUs/Mem per your machine specs.
mkdir -p pgscalc/ajtrio
nextflow -log pgscalc/ajtrio.log run pgscatalog/pgsc_calc -revision v2.1.0 -profile docker -c msvcfs/ajtrio.pgsc.config -work-dir pgscalc/nf --max_cpus 2 --max_memory 40.GB --input msvcfs/ajtrio.pgsc.csv --outdir pgscalc/ajtrio --target_build GRCh38 --pgs_id PGS002727,PGS000024,PGS002308,PGS005072,PGS002237,PGS000004,PGS003446,PGS000889,PGS000662,PGS004736,PGS004912 --run_ancestry assets/pgsc_HGDP+1kGP_v1.tar.zst
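# When the run completes, per-sample scores and an HTML report land under --outdir. The exact layout can vary by
# pgsc_calc version, but in v2.x the aggregated scores should be at a path like the one below:
zcat pgscalc/ajtrio/score/aggregated_scores.txt.gz | head -n3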
# pgscalc reports an 86.6% match between the input VCF and the HGDP+1000g variants, which is over the recommended
# "--min_overlap" of 75%. The match % to sites in each scoring file is also over 75%, except for Type 1 Diabetes (52%),
# which may include sites absent from the ancestry DB.
# ::TODO:: Add hg38 loci from the PGS Catalog variants to improve the match % to the scoring files.
# ::TODO:: We skipped alleles on sex chromosomes to avoid plink2's requirement for sex information within pgscalc. But
# there are 8 chrX alleles for prostate cancer that we could restore using inferred biological sex from the upstream pipeline.