Skip to content

Instantly share code, notes, and snippets.

@indraniel
Last active June 30, 2023 03:45
Show Gist options
  • Select an option

  • Save indraniel/d1a199431c764bd2c6153e155520e930 to your computer and use it in GitHub Desktop.

Select an option

Save indraniel/d1a199431c764bd2c6153e155520e930 to your computer and use it in GitHub Desktop.
A bash script to annotate a large VCF (i.e. containing many samples) with gnomAD (build 37 edition)
#!/bin/bash
# Usage:
# bash b37-annotate-w-gnomAD.sh </path/to/input.vcf.gz> </path/to/output.vcf.gz>
#
# Note:
# 1. This script with create a "scratch" directory to hold intermediate file
# states created during the annotation process. The scratch directory will
# be created next to the /path/to/outvcf.vcf.gz .
#
# 2. This script assumes that the input vcf has been bgzipped and tabix has been
# run on the bgzipped vcf
#
# 3. Currently this scratch directory WILL NOT BE DELETED!
set -eo pipefail
# http://stackoverflow.com/questions/9893667/is-there-a-way-to-write-a-bash-function-which-aborts-the-whole-execution-no-mat
trap "exit 1" TERM
export TOP_PID=$$
AWK=/usr/bin/awk
LN=/bin/ln
UNIQ=/usr/bin/uniq
# Heng Li's modified sort (https://github.com/lh3/foreign/tree/master/sort)
SORT=/gscmnt/gc2802/halllab/idas/software/local/bin/lh3-sort
BGZIP=/gscmnt/gc2802/halllab/idas/software/local/bin/bgzip
TABIX=/gscmnt/gc2802/halllab/idas/software/local/bin/tabix
BCFTOOLS=/gscmnt/gc2802/halllab/idas/software/local/bin/bcftools1.4
function join_by { local IFS="$1"; shift; echo "$*"; }
function die {
local timestamp=$(date +"%Y-%m-%d %T")
echo "[ ${timestamp} ] ERROR: $@" >&2
kill -s TERM ${TOP_PID}
}
function log {
local timestamp=$(date +"%Y-%m-%d %T")
echo "---> [ ${timestamp} ] $@" >&2
}
function run_cmd {
local cmd=$1
log "EXEC: ${cmd}"
eval "${cmd}"
if [[ $? -ne 0 ]]; then
die "[err] Problem running command: ${cmd} !"
exit 1;
fi
}
function is_empty_vcf {
local invcf=$1
local count=$(${BCFTOOLS} view -H ${invcf} | head -n 1000 | wc -l)
if [[ "${count}" -gt "0" ]]; then
return 1
else
return 0
fi
}
function copy_over_vcf {
local invcf=$1
local outvcf=$2
cp -v ${invcf} ${outvcf}
cp -v ${invcf}.tbi ${outvcf}.tbi
}
function vcf_subtract_samples {
local vcf=$1
log "(vcf_subtract_samples) calling bcftools subprocess"
local cmd="
cat <(${BCFTOOLS} view -h ${vcf} | head -n -1) \
<(${BCFTOOLS} view -h ${vcf} | tail -n 1 | cut -f1-8) \
<(${BCFTOOLS} view -H ${vcf} | cut -f1-8)
"
run_cmd "${cmd}"
}
function vcf_add_samples {
local no_samples_vcf=$1
local samples_vcf=$2
log "(vcf_add_samples) calling bcftools subprocess"
local cmd="
cat <(${BCFTOOLS} view -h ${no_samples_vcf} | head -n -1) \
<(${BCFTOOLS} view -h ${samples_vcf} | tail -n 1 ) \
<(paste <(${BCFTOOLS} view -H ${no_samples_vcf}) \
<(${BCFTOOLS} view -H ${samples_vcf} | cut -f9-))
"
run_cmd "${cmd}"
}
function tabix_and_finalize_vcf {
local tmpvcf=$1
local finalvcf=$2
local cmd="
${TABIX} -p vcf -f ${tmpvcf} \
&& mv ${tmpvcf}.tbi ${finalvcf}.tbi \
&& mv ${tmpvcf} ${finalvcf}
"
run_cmd "${cmd}"
}
function get_chrom {
local region=$1
local chrom=
if echo "${region}" | grep -q ':'; then
chrom=$(echo ${region} | ${AWK} -F':' '{ print $1 }' | sed 's/chr//g')
#interval=$(echo ${region} | ${AWK} -F':' '{ print $2 }')
else
chrom=${region}
fi
if [[ -z "${chrom}" ]]; then
die "Could not obtain a chrom from region : ${region}"
fi
log "identified chrom '${chrom}' on region '${region}'"
echo ${chrom}
}
function prune_samples_on_b37_vcf {
local invcf=$1
local outdir=${2:-$(dirname ${invcf})}
local outvcf=${outdir}/b37.nosamples.vcf.gz
if [[ -e "${outvcf}" ]]; then
log "shortcutting prune_samples_on_b37_vcf"
echo ${outvcf}
return 0;
fi
local tmpvcf=${outvcf}.tmp
local cmd1="vcf_subtract_samples ${invcf} | ${BGZIP} -c > ${tmpvcf}"
run_cmd "${cmd1}"
tabix_and_finalize_vcf ${tmpvcf} ${outvcf}
echo ${outvcf}
}
function run_gnomAD_exome_annotation {
local invcf=$1
local outdir=$(dirname ${invcf})
local outvcf=${outdir}/b37-gnomAD-exome-annotation-final.vcf.gz
if [[ -e "${outvcf}" ]]; then
log "shortcutting run_gnomAD_exome_annotation"
echo ${outvcf}
return 0;
fi
# annotation columns of interest
local -a anno_cols=(
'ID'
'INFO/GNOMAD_EXOME_AC_AFR'
'INFO/GNOMAD_EXOME_AC_AMR'
'INFO/GNOMAD_EXOME_AC_ASJ'
'INFO/GNOMAD_EXOME_AC_EAS'
'INFO/GNOMAD_EXOME_AC_FIN'
'INFO/GNOMAD_EXOME_AC_NFE'
'INFO/GNOMAD_EXOME_AC_OTH'
'INFO/GNOMAD_EXOME_AC_SAS'
'INFO/GNOMAD_EXOME_AN_AFR'
'INFO/GNOMAD_EXOME_AN_AMR'
'INFO/GNOMAD_EXOME_AN_ASJ'
'INFO/GNOMAD_EXOME_AN_EAS'
'INFO/GNOMAD_EXOME_AN_FIN'
'INFO/GNOMAD_EXOME_AN_NFE'
'INFO/GNOMAD_EXOME_AN_OTH'
'INFO/GNOMAD_EXOME_AN_SAS'
'INFO/GNOMAD_EXOME_AC_raw'
'INFO/GNOMAD_EXOME_AN_raw'
'INFO/GNOMAD_EXOME_AC_POPMAX'
'INFO/GNOMAD_EXOME_AN_POPMAX'
'INFO/GNOMAD_EXOME_AS_RF'
'INFO/GNOMAD_EXOME_AS_FilterStatus'
'INFO/GNOMAD_EXOME_AS_RF_POSITIVE_TRAIN'
'INFO/GNOMAD_EXOME_AS_RF_NEGATIVE_TRAIN'
)
local annotations=$(join_by , "${anno_cols[@]}")
# the gnomAD exome annotation vcfs base location
local base=/gscmnt/gc2802/halllab/gnomAD/release-170228/processed/post-vqsr-pipeline/exome
# Figure out which gnomAD chromosomes vcfs are needed
local -a gnomAD_chroms=($(${TABIX} --list-chroms ${invcf} | ${SORT} -N | ${UNIQ} | grep -v -P '^(GL|MT)'))
log "[exome] gnomAD chromosomal vcfs to process: ${gnomAD_chroms[@]} ( items: ${#gnomAD_chroms[@]} )"
local enter_vcf=${invcf}
local exit_vcf=
for chr in ${gnomAD_chroms[@]}; do
log "Annotating with gnomAD exome chromosome: ${chr}"
exit_vcf=${outdir}/b37-gnomAD-exome-annotation-c${chr}.vcf.gz
# determine the annotation vcf
local anno_vcf=${base}/${chr}/
anno_vcf+=c${chr}.gnomad.exomes.r2.0.1.sites.decompose.normalized.uniq.namespaced.vcf.gz
# log "enter_vcf : ${enter_vcf}"
# log "exit_vcf : ${exit_vcf}"
if [[ ! -e ${anno_vcf} ]]; then
die "Did not find annotation vcf: '${anno_vcf}'"
fi
# perform the chromosomal annotation
local tmpvcf=${exit_vcf}.tmp
local cmd1="
${BCFTOOLS} annotate \
-a ${anno_vcf} \
-c ${annotations} \
-O z \
-o ${tmpvcf} \
${enter_vcf} \
"
run_cmd "${cmd1}"
tabix_and_finalize_vcf ${tmpvcf} ${exit_vcf}
# setup for the next iteration
enter_vcf=${exit_vcf}
done
log "final exome exit_vcf is: ${exit_vcf}"
local cmd2="mv ${exit_vcf}.tbi ${outvcf}.tbi && mv ${exit_vcf} ${outvcf}"
run_cmd "${cmd2}"
echo ${outvcf}
}
function run_gnomAD_genome_annotation {
local invcf=$1
local outdir=$(dirname ${invcf})
local outvcf=${outdir}/b37-gnomAD-genome-exome-annotation-final.vcf.gz
if [[ -e "${outvcf}" ]]; then
log "shortcutting run_gnomAD_genome_annotation"
echo ${outvcf}
return 0;
fi
# annotation columns of interest
local -a anno_cols=(
'ID'
'INFO/GNOMAD_GENOME_AC_AFR'
'INFO/GNOMAD_GENOME_AC_AMR'
'INFO/GNOMAD_GENOME_AC_ASJ'
'INFO/GNOMAD_GENOME_AC_EAS'
'INFO/GNOMAD_GENOME_AC_FIN'
'INFO/GNOMAD_GENOME_AC_NFE'
'INFO/GNOMAD_GENOME_AC_OTH'
'INFO/GNOMAD_GENOME_AN_AFR'
'INFO/GNOMAD_GENOME_AN_AMR'
'INFO/GNOMAD_GENOME_AN_ASJ'
'INFO/GNOMAD_GENOME_AN_EAS'
'INFO/GNOMAD_GENOME_AN_FIN'
'INFO/GNOMAD_GENOME_AN_NFE'
'INFO/GNOMAD_GENOME_AN_OTH'
'INFO/GNOMAD_GENOME_AC_raw'
'INFO/GNOMAD_GENOME_AN_raw'
'INFO/GNOMAD_GENOME_AC_POPMAX'
'INFO/GNOMAD_GENOME_AN_POPMAX'
'INFO/GNOMAD_GENOME_AS_RF'
'INFO/GNOMAD_GENOME_AS_FilterStatus'
'INFO/GNOMAD_GENOME_AS_RF_POSITIVE_TRAIN'
'INFO/GNOMAD_GENOME_AS_RF_NEGATIVE_TRAIN'
)
local annotations=$(join_by , "${anno_cols[@]}")
# the gnomAD genome annotation vcfs base location
local base=/gscmnt/gc2802/halllab/gnomAD/release-170228/processed/post-vqsr-pipeline/genome
# Figure out which gnomAD chromosomes vcfs are needed
local -a gnomAD_chroms=($(${TABIX} --list-chroms ${invcf} | ${SORT} -N | ${UNIQ} | grep -v -P '^(GL|MT)'))
log "[genome] gnomAD chromosomal vcfs to process: ${gnomAD_chroms[@]} ( items: ${#gnomAD_chroms[@]} )"
local enter_vcf=${invcf}
local exit_vcf=
for chr in ${gnomAD_chroms[@]}; do
log "Annotating with gnomAD genome chromosome: ${chr}"
exit_vcf=${outdir}/b37-gnomAD-genome-annotation-c${chr}.vcf.gz
# log "enter_vcf : ${enter_vcf}"
# log "exit_vcf : ${exit_vcf}"
# perform the chromosomal annotation
if [[ "${chr}" == "Y" ]]; then
# specially handle chromosome Y (there are no chr Y gnomAD annotations)
log "gnomAD genome special chromosome Y handling"
local cmd0="
${LN} -s ${enter_vcf} ${exit_vcf} && ${LN} -s ${enter_vcf}.tbi ${exit_vcf}.tbi
"
run_cmd "${cmd0}"
else
# determine the annotation vcf
local anno_vcf=${base}/${chr}/
anno_vcf+=c${chr}.gnomad.genomes.r2.0.1.sites.decompose.normalized.uniq.namespaced.vcf.gz
if [[ ! -e ${anno_vcf} ]]; then
die "Did not find annotation vcf: '${anno_vcf}'"
fi
local tmpvcf=${exit_vcf}.tmp
local cmd1="
${BCFTOOLS} annotate \
-a ${anno_vcf} \
-c ${annotations} \
-O z \
-o ${tmpvcf} \
${enter_vcf} \
"
run_cmd "${cmd1}"
tabix_and_finalize_vcf ${tmpvcf} ${exit_vcf}
fi
# setup for the next iteration
enter_vcf=${exit_vcf}
done
log "final genome exit_vcf is: ${exit_vcf}"
local cmd2="mv ${exit_vcf}.tbi ${outvcf}.tbi && mv ${exit_vcf} ${outvcf}"
run_cmd "${cmd2}"
echo ${outvcf}
}
function add_samples_on_b37_annotated_vcf {
local original_b37_input_vcf=$1
local b37_annotated_no_samples_vcf=$2
local final_b37_output_vcf=$3
if [[ -e "${final_b37_output_vcf}" ]]; then
log "shortcutting add_samples_on_b37_cadd_vcf"
return 0;
fi
local tmpvcf=${final_b37_output_vcf}.tmp
local cmd1="
vcf_add_samples ${b37_annotated_no_samples_vcf} ${original_b37_input_vcf} \
| ${BGZIP} -c \
> ${tmpvcf}
"
run_cmd "${cmd1}"
tabix_and_finalize_vcf ${tmpvcf} ${final_b37_output_vcf}
}
function annotate_vcf {
local b37_invcf=$1
local b37_outvcf=$2
local scratch_dir=$(dirname ${b37_outvcf})/scratch
mkdir -p ${scratch_dir}
log "Remove samples on b37 input vcf"
local b37_invcf_no_samples=$(prune_samples_on_b37_vcf ${b37_invcf} ${scratch_dir})
log "Entering run_gnomAD_exome_annotation"
local b37_gnomAD_exome_vcf=$(run_gnomAD_exome_annotation ${b37_invcf_no_samples})
log "Entering run_gnomAD_genome_annotation"
local b37_anno_vcf=$(run_gnomAD_genome_annotation ${b37_gnomAD_exome_vcf})
log "Add samples on b37 gnomAD annotated vcf"
add_samples_on_b37_annotated_vcf ${b37_invcf} ${b37_anno_vcf} ${b37_outvcf}
}
function main {
local invcf=$1
local outvcf=$2
if is_empty_vcf ${invcf} ; then
log "No variants to process. Copying files over..."
copy_over_vcf ${invcf} ${outvcf} ;
else
annotate_vcf ${invcf} ${outvcf} ;
fi
log 'All Done'
}
INVCF=$1
OUTVCF=$2
main ${INVCF} ${OUTVCF} ;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment