Created
April 3, 2025 03:02
-
-
Save ckandoth/4006866209475ae558ead88a53e6b59f to your computer and use it in GitHub Desktop.
Use an Azure NP10 Dragen PAYG server to align FASTQs in blob storage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Process a single sample whose FASTQs are stored in a blob storage folder,
# and upload the results back to blob storage. Run without arguments to see
# the full usage text and the environment variables that must be set.
#
# -u: expanding an unset variable is an error.
# -o pipefail: a pipeline fails if any of its stages fails.
# errexit (-e) is not enabled: the commands whose failure matters are
# followed by an explicit `|| error "..."` instead.
set -uo pipefail
#######################################
# Print an error message to stderr and terminate the script.
# Arguments: the message text; all arguments are joined with spaces, so a
#            call without any argument does not trip `set -u`.
# Outputs:   writes "Error: <message>" to stderr
# Returns:   never returns; exits the shell with status 1
#######################################
error() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}
# Exactly three positional arguments are required; otherwise show the usage
# text on stderr and stop. The heredoc delimiter is quoted so the text is
# printed literally, with no expansion.
if (( $# != 3 )); then
  cat >&2 << 'EOM_USAGE'
Usage: ./align_fastqs.sh [FASTQ_BLOB_DIR] [REF_BLOB_DIR] [OUTPUT_BLOB_DIR]
Purpose: Process a single sample whose FASTQs are stored in a blob storage folder, and upload results back to blob storage
Command-line arguments:
FASTQ_BLOB_DIR - e.g. "fqs/ajtrio/mom" where "fqs" is the container and "ajtrio/mom" is the subfolder containing FASTQs to process
REF_BLOB_DIR - e.g. "ref/hg38" where "ref" is the container and "hg38" is the subfolder containing Dragen reference data as a tar file
OUTPUT_BLOB_DIR - e.g. "dgn/ajtrio-mom" where "dgn" is the container and "ajtrio-mom" is the sample name and subfolder to upload outputs
Environment variables:
AZURE_STORAGE_ACCOUNT - the name of the ADLS Gen2 storage account we will use
AZURE_STORAGE_KEY - one of the keys returned by "az storage account keys list"
EOM_USAGE
  exit 1
fi

# The `:-` defaults keep `set -u` from aborting before we can print a
# readable message when either variable is missing.
if [[ -z "${AZURE_STORAGE_ACCOUNT:-}" || -z "${AZURE_STORAGE_KEY:-}" ]]; then
  error "AZURE_STORAGE_ACCOUNT and/or AZURE_STORAGE_KEY environment variables are not set"
fi

# Drop one trailing slash so that paths and URLs built from these values
# (e.g. "<dir>/*") never contain an empty "//" component.
FASTQ_BLOB_DIR=${1%/}
REF_BLOB_DIR=${2%/}
OUTPUT_BLOB_DIR=${3%/}
if [[ -z "${FASTQ_BLOB_DIR}" || -z "${REF_BLOB_DIR}" || -z "${OUTPUT_BLOB_DIR}" ]]; then
  error "FASTQ_BLOB_DIR, REF_BLOB_DIR and OUTPUT_BLOB_DIR must all be non-empty"
fi
# Piece together the full blob storage URLs we will need.
# Each *_BLOB_DIR is "<container>/<subfolder>", so appending it to the
# account endpoint gives the URL of that folder.
STORAGE_ACCT_ENDPOINT="https://${AZURE_STORAGE_ACCOUNT}.blob.core.windows.net"
FASTQ_BLOB_URL="${STORAGE_ACCT_ENDPOINT}/${FASTQ_BLOB_DIR}"
REF_BLOB_URL="${STORAGE_ACCT_ENDPOINT}/${REF_BLOB_DIR}"
OUTPUT_BLOB_URL="${STORAGE_ACCT_ENDPOINT}/${OUTPUT_BLOB_DIR}"
# Check for available space in /mnt and create some directories we'll need
REQUIRED_GB=400

# Take ownership of /mnt for the current user and primary group. `id` is
# used instead of $USER/$GROUPS so this works even when $USER is unset
# (which would abort the script under `set -u`).
sudo chown -R "$(id -un):$(id -gn)" /mnt || error "Failed to change ownership of /mnt"

# The last line of `df --output=avail` is the available space in df's block
# units (1K blocks by default — TODO confirm no BLOCK_SIZE override is set).
AVAILABLE_KB=$(df --output=avail /mnt | tail -n1 | tr -d '[:space:]')
# An empty or non-numeric value would otherwise evaluate to 0 in arithmetic
# and be misreported as "insufficient disk space".
[[ "${AVAILABLE_KB}" =~ ^[0-9]+$ ]] || error "Failed to determine available disk space in /mnt"
AVAILABLE_GB=$((AVAILABLE_KB / 1000 / 1000))
if ((AVAILABLE_GB < REQUIRED_GB)); then
  error "Insufficient disk space in /mnt. Required: $REQUIRED_GB GB, Available: $AVAILABLE_GB GB"
fi

# -p: these directories are only removed at the end of a successful run, so
# they may still exist after a failed one; that must not block a rerun.
mkdir -p /mnt/{fqs,tmp,dgn} || error "Failed to create directories in /mnt"
# Download FASTQs and parse headers to make a FASTQ list for use with Dragen
FQS_DIR="/mnt/${FASTQ_BLOB_DIR}"
# The last path component of the output folder doubles as the sample name
SAMPLE=$(basename -- "${OUTPUT_BLOB_DIR}")
echo "Downloading FASTQs and creating a FASTQ list for Dragen under ${FQS_DIR}..."

# The container is the first path component of the blob folder
FASTQ_FS=${FASTQ_BLOB_DIR%%/*}
# Short-lived, read+list SAS token for the container (GNU date syntax)
SAS_EXPIRY=$(date -u -d "20 mins" '+%Y-%m-%dT%H:%MZ')
FASTQ_SAS=$(az storage container generate-sas --name "${FASTQ_FS}" --permissions lr \
  --expiry "${SAS_EXPIRY}" --https-only -o tsv) \
  || error "Failed to generate a SAS token for container ${FASTQ_FS}"
[[ -n "${FASTQ_SAS}" ]] || error "Failed to generate a SAS token for container ${FASTQ_FS}"
azcopy cp "${FASTQ_BLOB_URL}/*?${FASTQ_SAS}" "${FQS_DIR}" --output-level=quiet \
  || error "Failed to download FASTQs"

# One CSV row per R1/R2 pair, in the column order given by the header row
FASTQ_LIST="${FQS_DIR}/fastq_list.csv"
echo "RGID,RGSM,RGLB,Lane,Read1File,Read2File" > "${FASTQ_LIST}" \
  || error "Failed to create ${FASTQ_LIST}"
NUM_PAIRS=0
for fq1 in "${FQS_DIR}"/*_R1*.fastq.gz; do
  # An unmatched glob is left as the literal pattern; skip it here and
  # report the real problem after the loop.
  [[ -f "${fq1}" ]] || continue

  # Fields 3 and 4 of the colon-separated first header line are used as
  # flowcell and lane (assumes Illumina-style read names — TODO confirm).
  HEADER=$(gzip -dc "${fq1}" | head -n1)
  IFS=: read -r _ _ FLOWCELL LANE _ <<< "${HEADER}"
  [[ -n "${FLOWCELL}" && -n "${LANE}" ]] \
    || error "Could not parse flowcell and lane from the first header of ${fq1}"
  RGID="${FLOWCELL}.${LANE}.${SAMPLE}"

  # Derive the mate by swapping the LAST "_R1" of the file name only, so
  # an "_R1" in a directory or sample name is never rewritten.
  fq1_name=${fq1##*/}
  fq2="${fq1%/*}/${fq1_name%_R1*}_R2${fq1_name##*_R1}"
  [[ -f "${fq2}" ]] || error "Could not find R2 FASTQ for $fq1"

  echo "$RGID,$SAMPLE,UnknownLibrary,$LANE,$fq1,$fq2" >> "${FASTQ_LIST}"
  NUM_PAIRS=$((NUM_PAIRS + 1))
done
((NUM_PAIRS > 0)) || error "No *_R1*.fastq.gz files found under ${FQS_DIR}"
# Download reference data if it wasn't already downloaded by a previous run of this script
REF_DIR="/mnt/${REF_BLOB_DIR}"
if [[ ! -d "${REF_DIR}" ]]; then
  echo "Downloading reference data into ${REF_DIR}..."
  # The container is the first path component of the blob folder
  REF_FS=${REF_BLOB_DIR%%/*}
  SAS_EXPIRY=$(date -u -d "10 mins" '+%Y-%m-%dT%H:%MZ')
  REF_SAS=$(az storage container generate-sas --name "${REF_FS}" --permissions lr \
    --expiry "${SAS_EXPIRY}" --https-only -o tsv) \
    || error "Failed to generate a SAS token for container ${REF_FS}"
  [[ -n "${REF_SAS}" ]] || error "Failed to generate a SAS token for container ${REF_FS}"

  # The mere existence of REF_DIR is what marks the reference as usable on
  # later runs, so a partial download or extraction must not be left behind.
  if ! azcopy cp "${REF_BLOB_URL}/*?${REF_SAS}" "${REF_DIR}" --output-level=quiet; then
    rm -rf -- "${REF_DIR:?}"
    error "Failed to download reference data"
  fi
  # Unpack every tar file in place, then delete the archives
  if ! find "${REF_DIR}" -name "*.tar" -print0 | xargs -0 -I {} tar -xf {} -C "${REF_DIR}"; then
    rm -rf -- "${REF_DIR:?}"
    error "Unable to extract tar file"
  fi
  find "${REF_DIR}" -name "*.tar" -delete
else
  echo "Reusing existing reference data under ${REF_DIR}..."
fi
echo "Running Dragen on sample ${SAMPLE} using ${FASTQ_LIST}..."
OUTPUT_DIR="/mnt/${OUTPUT_BLOB_DIR}"
mkdir -p "${OUTPUT_DIR}" || error "Failed to create ${OUTPUT_DIR}"

# The Dragen command line, one option per line. Kept in an array so every
# value stays a single, correctly quoted argument.
DRAGEN_ARGS=(
  --intermediate-results-dir /mnt/tmp
  --ref-dir "${REF_DIR}"
  # Map/align, sort, mark duplicates; alignments are written as CRAM
  --enable-map-align true
  --enable-map-align-output true
  --output-format CRAM
  --enable-duplicate-marking true
  --generate-sa-tags true
  --enable-sort true
  --soft-read-trimmers polyg,quality
  --trim-min-quality 2
  --qc-coverage-ignore-overlaps true
  # Small variant caller (--vc-*) and targeted caller options
  --enable-variant-caller true
  --vc-emit-ref-confidence GVCF
  --vc-enable-vcf-output true
  --vc-combine-phased-variants-distance 6
  --enable-targeted true
  --targeted-merge-vc true
  # Structural and copy-number variant options
  --enable-sv true
  --enable-cnv true
  --cnv-interval-width 1000
  --cnv-enable-self-normalization true
  --cnv-enable-gcbias-correction true
  --cnv-counts-method start
  --cnv-filter-qual 20
  --cnv-enable-segdups-extension true
  --cnv-enable-tracks false
  # HLA and repeat genotyping options
  --enable-hla true
  --hla-enable-class-2 true
  --repeat-genotype-enable true
  # Inputs and outputs for this sample
  --fastq-list "${FASTQ_LIST}"
  --fastq-list-sample-id "${SAMPLE}"
  --output-directory "${OUTPUT_DIR}"
  --output-file-prefix "${SAMPLE}"
)
dragen "${DRAGEN_ARGS[@]}" || error "Dragen run failed."
echo "Dragen run successful. Uploading outputs to ${OUTPUT_BLOB_URL}..."
# The container is the first path component of the blob folder
OUT_FS=${OUTPUT_BLOB_DIR%%/*}
# Short-lived, create+write SAS token for the output container
SAS_EXPIRY=$(date -u -d "20 mins" '+%Y-%m-%dT%H:%MZ')
OUTPUT_SAS=$(az storage container generate-sas --name "${OUT_FS}" --permissions cw \
  --expiry "${SAS_EXPIRY}" --https-only -o tsv) \
  || error "Failed to generate a SAS token for container ${OUT_FS}"
[[ -n "${OUTPUT_SAS}" ]] || error "Failed to generate a SAS token for container ${OUT_FS}"
azcopy cp "${OUTPUT_DIR}/*" "${OUTPUT_BLOB_URL}?${OUTPUT_SAS}" --output-level=quiet \
  || error "Failed to upload outputs"

# Only reached after a successful upload: remove the per-sample working
# directories under /mnt (downloaded reference data is not touched here).
rm -rf /mnt/{fqs,tmp,dgn} || error "Failed to delete directories in /mnt"
echo "Finished with sample ${SAMPLE}."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment