Skip to content

Instantly share code, notes, and snippets.

@ckandoth
Created April 3, 2025 03:02
Show Gist options
  • Save ckandoth/4006866209475ae558ead88a53e6b59f to your computer and use it in GitHub Desktop.
Use an Azure NP10 Dragen PAYG server to align FASTQs in blob storage
#!/bin/bash
# Abort on unset variables and let a failure anywhere in a pipeline be visible.
# NOTE: -e is intentionally omitted; every critical command below is checked
# explicitly with "|| error" instead.
set -uo pipefail

# Print an error message to stderr and terminate the script with status 1.
#   $1 - message to report (printed as "Error: <message>")
error() {
  printf 'Error: %s\n' "$1" >&2
  exit 1
}
# Guard clause: require exactly three positional arguments, else print usage.
if [[ $# -ne 3 ]]; then
  cat >&2 << EOM_USAGE
Usage: ./align_fastqs.sh [FASTQ_BLOB_DIR] [REF_BLOB_DIR] [OUTPUT_BLOB_DIR]
Purpose: Process a single sample whose FASTQs are stored in a blob storage folder, and upload results back to blob storage
Command-line arguments:
FASTQ_BLOB_DIR - e.g. "fqs/ajtrio/mom" where "fqs" is the container and "ajtrio/mom" is the subfolder containing FASTQs to process
REF_BLOB_DIR - e.g. "ref/hg38" where "ref" is the container and "hg38" is the subfolder containing Dragen reference data as a tar file
OUTPUT_BLOB_DIR - e.g. "dgn/ajtrio-mom" where "dgn" is the container and "ajtrio-mom" is the sample name and subfolder to upload outputs
Environment variables:
AZURE_STORAGE_ACCOUNT - the name of the ADLS Gen2 storage account we will use
AZURE_STORAGE_KEY - one of the keys returned by "az storage account keys list"
EOM_USAGE
  exit 1
fi
# Both storage credentials must be exported before this script is run; the az
# CLI reads them from the environment.
[[ -n "${AZURE_STORAGE_ACCOUNT:-}" && -n "${AZURE_STORAGE_KEY:-}" ]] || error "AZURE_STORAGE_ACCOUNT and/or AZURE_STORAGE_KEY environment variables are not set"
# Positional arguments: each is "<container>/<subfolder>" within blob storage.
FASTQ_BLOB_DIR="$1"
REF_BLOB_DIR="$2"
OUTPUT_BLOB_DIR="$3"

# Assemble the fully-qualified blob storage URLs used for download/upload.
STORAGE_ACCT_ENDPOINT="https://${AZURE_STORAGE_ACCOUNT}.blob.core.windows.net"
FASTQ_BLOB_URL="${STORAGE_ACCT_ENDPOINT}/${FASTQ_BLOB_DIR}"
REF_BLOB_URL="${STORAGE_ACCT_ENDPOINT}/${REF_BLOB_DIR}"
OUTPUT_BLOB_URL="${STORAGE_ACCT_ENDPOINT}/${OUTPUT_BLOB_DIR}"
# Check for available space in /mnt and create the directories we'll need.
REQUIRED_GB=400
sudo chown -R "${USER}:${GROUPS}" /mnt || error "Failed to change ownership of /mnt"
# df reports 1024-byte blocks by default, so convert with 1024, not 1000.
AVAILABLE_KB=$(df --output=avail /mnt | tail -n1)
AVAILABLE_GB=$((AVAILABLE_KB / 1024 / 1024))
if ((AVAILABLE_GB < REQUIRED_GB)); then
  error "Insufficient disk space in /mnt. Required: $REQUIRED_GB GB, Available: $AVAILABLE_GB GB"
fi
# Derive the top-level dirs from the arguments (first path component is the
# container name) instead of hard-coding "fqs" and "dgn", and use -p so a
# rerun after a partial failure does not abort here.
mkdir -p /mnt/tmp "/mnt/${FASTQ_BLOB_DIR%%/*}" "/mnt/${OUTPUT_BLOB_DIR%%/*}" || error "Failed to create directories in /mnt"
# Download FASTQs and parse headers to make a FASTQ list for use with Dragen.
FQS_DIR="/mnt/${FASTQ_BLOB_DIR}"
SAMPLE=$(basename "$OUTPUT_BLOB_DIR")
echo "Downloading FASTQs and creating a FASTQ list for Dragen under ${FQS_DIR}..."
# The container (filesystem) name is the first path component of the blob dir.
FASTQ_FS=${FASTQ_BLOB_DIR%%/*}
# Short-lived read/list SAS token for the FASTQ container.
SAS_EXPIRY=$(date -u -d "20 mins" '+%Y-%m-%dT%H:%MZ')
FASTQ_SAS=$(az storage container generate-sas --name "${FASTQ_FS}" --permissions lr --expiry "${SAS_EXPIRY}" --https-only -o tsv) || error "Failed to generate a read SAS for container ${FASTQ_FS}"
azcopy cp "${FASTQ_BLOB_URL}/*?${FASTQ_SAS}" "${FQS_DIR}" --output-level=quiet || error "Failed to download FASTQs"
FASTQ_LIST="${FQS_DIR}/fastq_list.csv"
echo "RGID,RGSM,RGLB,Lane,Read1File,Read2File" > "${FASTQ_LIST}"
# Collect R1 files via nullglob so an empty download is reported cleanly
# instead of the literal glob pattern reaching gzip.
shopt -s nullglob
R1_FASTQS=("${FQS_DIR}"/*_R1*.fastq.gz)
shopt -u nullglob
(( ${#R1_FASTQS[@]} > 0 )) || error "No R1 FASTQs found under ${FQS_DIR}"
for fq1 in "${R1_FASTQS[@]}"; do
  # Flowcell and lane are fields 3 and 4 of the colon-delimited read header.
  HEADER=$(gzip -dc "$fq1" | head -n1)
  FLOWCELL=$(echo "$HEADER" | cut -f3 -d:)
  LANE=$(echo "$HEADER" | cut -f4 -d:)
  RGID="${FLOWCELL}.${LANE}.${SAMPLE}"
  # Pair each R1 with its R2 (first _R1 occurrence swapped, as sed did).
  fq2=${fq1/_R1/_R2}
  if [[ -f "${fq2}" ]]; then
    echo "$RGID,$SAMPLE,UnknownLibrary,$LANE,$fq1,$fq2" >> "${FASTQ_LIST}"
  else
    error "Could not find R2 FASTQ for $fq1"
  fi
done
# Download reference data unless a previous run of this script already did.
REF_DIR="/mnt/${REF_BLOB_DIR}"
if [[ ! -d "${REF_DIR}" ]]; then
  echo "Downloading reference data into ${REF_DIR}..."
  # The container (filesystem) name is the first path component of the blob dir.
  REF_FS=${REF_BLOB_DIR%%/*}
  SAS_EXPIRY=$(date -u -d "10 mins" '+%Y-%m-%dT%H:%MZ')
  REF_SAS=$(az storage container generate-sas --name "${REF_FS}" --permissions lr --expiry "${SAS_EXPIRY}" --https-only -o tsv) || error "Failed to generate a read SAS for container ${REF_FS}"
  azcopy cp "${REF_BLOB_URL}/*?${REF_SAS}" "${REF_DIR}" --output-level=quiet || error "Failed to download reference data"
  # Unpack the reference tar(s) in place, then drop the archives to save space.
  find "${REF_DIR}" -name "*.tar" -print0 | xargs -0 -I {} tar -xf {} -C "${REF_DIR}" || error "Unable to extract tar file"
  find "${REF_DIR}" -name "*.tar" -delete
else
  echo "Reusing existing reference data under ${REF_DIR}..."
fi
echo "Running Dragen on sample ${SAMPLE} using ${FASTQ_LIST}..."
OUTPUT_DIR="/mnt/${OUTPUT_BLOB_DIR}"
mkdir -p "$OUTPUT_DIR" || error "Failed to create output directory ${OUTPUT_DIR}"
# Dragen settings: map/align to sorted, duplicate-marked CRAM; small-variant
# calling (GVCF + VCF); SV, CNV (self-normalized), HLA class I+II typing, and
# repeat-expansion genotyping — all in a single pass. Flags are unchanged from
# the original one-line invocation, just laid out readably in an array.
DRAGEN_ARGS=(
  --intermediate-results-dir /mnt/tmp
  --ref-dir "${REF_DIR}"
  --enable-map-align true
  --enable-map-align-output true
  --output-format CRAM
  --enable-duplicate-marking true
  --generate-sa-tags true
  --enable-sort true
  --soft-read-trimmers polyg,quality
  --trim-min-quality 2
  --qc-coverage-ignore-overlaps true
  --enable-variant-caller true
  --vc-emit-ref-confidence GVCF
  --vc-enable-vcf-output true
  --vc-combine-phased-variants-distance 6
  --enable-targeted true
  --targeted-merge-vc true
  --enable-sv true
  --enable-cnv true
  --cnv-interval-width 1000
  --cnv-enable-self-normalization true
  --cnv-enable-gcbias-correction true
  --cnv-counts-method start
  --cnv-filter-qual 20
  --cnv-enable-segdups-extension true
  --cnv-enable-tracks false
  --enable-hla true
  --hla-enable-class-2 true
  --repeat-genotype-enable true
  --fastq-list "${FASTQ_LIST}"
  --fastq-list-sample-id "${SAMPLE}"
  --output-directory "${OUTPUT_DIR}"
  --output-file-prefix "${SAMPLE}"
)
dragen "${DRAGEN_ARGS[@]}" || error "Dragen run failed."
echo "Dragen run successful. Uploading outputs to ${OUTPUT_BLOB_URL}..."
# The container (filesystem) name is the first path component of the blob dir.
OUT_FS=${OUTPUT_BLOB_DIR%%/*}
# Short-lived create/write SAS token for the output container.
SAS_EXPIRY=$(date -u -d "20 mins" '+%Y-%m-%dT%H:%MZ')
OUTPUT_SAS=$(az storage container generate-sas --name "${OUT_FS}" --permissions cw --expiry "${SAS_EXPIRY}" --https-only -o tsv) || error "Failed to generate a write SAS for container ${OUT_FS}"
azcopy cp "${OUTPUT_DIR}/*" "${OUTPUT_BLOB_URL}?${OUTPUT_SAS}" --output-level=quiet || error "Failed to upload outputs"
# Clean up scratch space. Derive the dir names from the arguments rather than
# hard-coding container names; ":?" aborts rather than run "rm -rf /mnt/" if a
# variable is somehow empty.
FASTQ_FS=${FASTQ_BLOB_DIR%%/*}
rm -rf /mnt/tmp "/mnt/${FASTQ_FS:?}" "/mnt/${OUT_FS:?}" || error "Failed to delete directories in /mnt"
echo "Finished with sample ${SAMPLE}."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment