Skip to content

Instantly share code, notes, and snippets.

@peterk87
Last active April 10, 2025 15:18
Show Gist options
  • Save peterk87/ce19e9994f9a135db6023458a96d8158 to your computer and use it in GitHub Desktop.
Save peterk87/ce19e9994f9a135db6023458a96d8158 to your computer and use it in GitHub Desktop.
Create ZIP file of selected nf-flu results
#!/bin/bash
# Package selected nf-flu results into zip and tar.gz archives.
#
# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Defaults; the option parser below may override each of them.
SAMPLESHEET=samplesheet.csv                  # -s: CSV whose first column holds sample names
OUTPUT_DIR=results                           # -o: directory holding the nf-flu results
ZIP_PACKAGE_DIR="$(date -I)-nf-flu-results"  # -d: staging directory, prefixed with today's ISO date
OVERWRITE=false                              # -O
VERBOSE=false                                # -v
# Print the usage text to stdout.
# Globals (read): SAMPLESHEET, OUTPUT_DIR, ZIP_PACKAGE_DIR, OVERWRITE — their
# current values are interpolated as the advertised defaults.
print_help() {
  cat <<EOF
Usage: $0 -s <samplesheet> -o <output_dir> -d <zip_package_dir>

Create a zip package of selected nf-flu results given a samplesheet CSV

Options:
 -s SAMPLESHEET Samplesheet file in CSV format. Used for getting sample names. First column should contain sample names (default: $SAMPLESHEET)
 -o OUTPUT_DIR Output directory containing the nf-flu results (default: $OUTPUT_DIR)
 -d ZIP_PACKAGE_DIR Directory to store the zip package (default: $ZIP_PACKAGE_DIR)
 -O Overwrite output files (default: $OVERWRITE)
 -v Verbose output
 -h Display this help message
EOF
}
# Parse command line options.
#
# parse_args OPTION...
#   Applies -s/-o/-d/-O/-v to the SAMPLESHEET, OUTPUT_DIR, ZIP_PACKAGE_DIR,
#   OVERWRITE and VERBOSE globals. -h prints the help text and exits 0.
#   An unknown option or a missing option argument prints a message plus the
#   help text to stderr and exits 2.
#
# getopts runs in silent mode (leading ':' in the optstring). In the default
# mode bash unsets OPTARG for a bad option, so reporting "-$OPTARG" aborts with
# "unbound variable" when `set -u` is active.
parse_args() {
  local opt OPTARG
  local OPTIND=1
  while getopts ':s:o:d:Ohv' opt; do
    case "$opt" in
      s) SAMPLESHEET=$OPTARG ;;
      o) OUTPUT_DIR=$OPTARG ;;
      d) ZIP_PACKAGE_DIR=$OPTARG ;;
      O) OVERWRITE=true ;;
      v) VERBOSE=true ;;
      h)
        print_help
        exit 0
        ;;
      :)
        echo "Option -$OPTARG requires an argument" >&2
        print_help >&2
        exit 2
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        print_help >&2
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
# ERR trap handler: print a blank line followed by a bold red message naming
# the failing line.
# Arguments: $1 - line number to report
error_handler() {
  local -r failed_line=$1
  printf '%b\n' "\n\033[1;31mError on line ${failed_line}\033[0m"
  # Any extra cleanup or logging on error belongs here.
}
# Remove the staging directory named by ZIP_PACKAGE_DIR.
# Globals (read): ZIP_PACKAGE_DIR
# Does nothing beyond the first message when ZIP_PACKAGE_DIR is unset or empty.
cleanup() {
  echo "Cleaning up before exiting..."
  if [[ -z "${ZIP_PACKAGE_DIR:-}" ]]; then
    return 0
  fi
  echo "Removing temporary directory at $ZIP_PACKAGE_DIR"
  # `--` ends option parsing so a directory name starting with '-' is treated
  # as a path rather than as rm options.
  rm -rf -- "$ZIP_PACKAGE_DIR"
}
# Handler for interrupting signals: report the interruption on stderr, run
# cleanup once, and exit with status 1.
handle_interrupt() {
  echo -e "\n\033[1;31mERROR:\033[1m Script interrupted...\033[0m" >&2
  # `exit` fires any EXIT trap; when that trap also runs cleanup the staging
  # directory would be cleaned twice. Clear it so cleanup runs exactly once.
  trap - EXIT
  cleanup
  exit 1
}
# Failure and exit hooks:
#   ERR       -> report the line number of the failing command
#   EXIT      -> remove the staging directory on every exit path
#   INT/TERM  -> report the interruption, clean up and exit
trap 'error_handler $LINENO' ERR
trap cleanup EXIT
trap handle_interrupt INT TERM
# Log an error message to stderr, prefixed with an ISO-8601 timestamp and a
# bold red "ERROR:" tag.
# Arguments: $1 - message, printed verbatim (backslashes in it, e.g. from
#                 paths, are not interpreted as escape sequences)
error() {
  printf '%s \033[1;31mERROR: \033[0m\033[1m%s\033[0m\n' "$(date -Is)" "$1" >&2
}
# Log a warning message to stderr, prefixed with an ISO-8601 timestamp and a
# bold yellow "WARNING:" tag.
# Arguments: $1 - message, printed verbatim (backslashes in it, e.g. from
#                 paths, are not interpreted as escape sequences)
warning() {
  printf '%s \033[1;33mWARNING: \033[0m\033[1m%s\033[0m\n' "$(date -Is)" "$1" >&2
}
# Log an informational message to stdout, prefixed with an ISO-8601 timestamp
# and a bold green "INFO:" tag.
# Arguments: $1 - message (backslash escapes in it are interpreted)
info() {
  local -r green_bold='\033[1;32m' bold='\033[1m' reset='\033[0m'
  printf '%b\n' "$(date -Is) ${green_bold}INFO: ${reset}${bold}$1${reset}"
}
# Validate the inputs before anything is created.
#
# The EXIT trap installed above runs cleanup, which removes $ZIP_PACKAGE_DIR.
# On the early exits below this script has not created that directory, so the
# trap is cleared first; otherwise a pre-existing directory that the script
# just refused to touch would be deleted on the way out.
if [[ ! -f "$SAMPLESHEET" ]]; then
  trap - EXIT
  error "Samplesheet not found at $SAMPLESHEET"
  exit 1
fi
if [[ ! -d "$OUTPUT_DIR" ]]; then
  trap - EXIT
  error "Output directory not found at $OUTPUT_DIR"
  exit 1
fi
if [[ -d "$ZIP_PACKAGE_DIR" ]]; then
  if [[ $OVERWRITE != true ]]; then
    trap - EXIT
    error "Directory already exists at $ZIP_PACKAGE_DIR"
    exit 1
  fi
  # -O given: start from an empty staging directory so links left over from an
  # earlier run are neither packaged nor collide with the links created below.
  # cleanup removes this directory at exit in any case, so nothing extra is
  # lost by removing it now.
  warning "Directory already exists at $ZIP_PACKAGE_DIR; replacing it"
  rm -rf -- "$ZIP_PACKAGE_DIR"
fi
info "Reading samplesheet from $SAMPLESHEET"
info "Output directory: $OUTPUT_DIR"
info "Creating temporary directory at $ZIP_PACKAGE_DIR"

# read_sample_names CSV
#   Print the distinct, non-empty values of the first column of CSV, one per
#   line, in order of first appearance. The first row is treated as a header
#   and skipped. A trailing carriage return (CRLF files) is stripped.
read_sample_names() {
  awk -F, '
    NR > 1 {
      name = $1
      sub(/\r$/, "", name)
      if (name != "" && !seen[name]++) print name
    }
  ' "$1"
}

# Read one array element per line so a sample name is never word-split or
# glob-expanded by the shell.
samples=()
mapfile -t samples < <(read_sample_names "$SAMPLESHEET")

samples_joined=""
if (( ${#samples[@]} == 0 )); then
  warning "No samples found in $SAMPLESHEET"
else
  # join sample names with ; delimiter
  samples_joined=$(printf '; %s' "${samples[@]}")
  info "Found the following samples: ${samples_joined:2}"
fi

# Refuse to clobber existing archives unless -O was given. This is checked
# before the staging directory is created.
if [[ -f "$ZIP_PACKAGE_DIR.zip" ]]; then
  if [[ $OVERWRITE != true ]]; then
    error "Zip package already exists at $ZIP_PACKAGE_DIR.zip!"
    exit 1
  fi
  warning "Zip package already exists at $ZIP_PACKAGE_DIR.zip and will be overwritten"
fi
if [[ -f "$ZIP_PACKAGE_DIR.tar.gz" ]]; then
  if [[ $OVERWRITE != true ]]; then
    error "tar.gz package already exists at $ZIP_PACKAGE_DIR.tar.gz!"
    exit 1
  fi
  warning "tar.gz package already exists at $ZIP_PACKAGE_DIR.tar.gz and will be overwritten"
fi

# Create a temporary directory to store the results
mkdir -p -- "$ZIP_PACKAGE_DIR"
# Link the run-level outputs (subtyping report, Nextclade TSV, FluMut files)
# into the top level of the staging directory.
#
# Existence is tested with -f before calling realpath: GNU realpath prints a
# path even when the final component is missing, so testing its output for
# emptiness never detects a missing report, and it fails outright when a parent
# directory is missing. A missing report is logged and skipped.

subtyping_report="$OUTPUT_DIR/nf-flu-subtyping-report.xlsx"
if [[ -f "$subtyping_report" ]]; then
  subtyping_report=$(realpath -- "$subtyping_report")
  if [[ $VERBOSE == true ]]; then
    info "Found subtyping report at $subtyping_report"
  fi
  ln -sf -- "$subtyping_report" "$ZIP_PACKAGE_DIR/nf-flu-subtyping-report.xlsx"
else
  error "Subtyping report not found"
fi

nextclade_tsv="$OUTPUT_DIR/nextclade/nextclade.tsv"
if [[ -f "$nextclade_tsv" ]]; then
  nextclade_tsv=$(realpath -- "$nextclade_tsv")
  if [[ $VERBOSE == true ]]; then
    info "Found Nextclade TSV at $nextclade_tsv"
  fi
  ln -sf -- "$nextclade_tsv" "$ZIP_PACKAGE_DIR/nextclade.tsv"
else
  error "Nextclade TSV not found"
fi

# Collect the absolute path of every entry in the flumut directory and link
# each one under FluMut/. Globbing in a for loop keeps paths containing
# whitespace intact.
mkdir -p -- "$ZIP_PACKAGE_DIR/FluMut"
flumut_outputs=()
for flumut_out in "$OUTPUT_DIR/flumut/"*; do
  # An unmatched glob is left as the literal pattern; skip it.
  [[ -e "$flumut_out" ]] || continue
  flumut_out=$(realpath -- "$flumut_out")
  flumut_outputs+=("$flumut_out")
  ln -sf -- "$flumut_out" "$ZIP_PACKAGE_DIR/FluMut/${flumut_out##*/}"
done

if (( ${#flumut_outputs[@]} == 0 )); then
  warning "No FluMut outputs found in $OUTPUT_DIR/flumut"
fi
if [[ $VERBOSE == true ]]; then
  info "Found ${#flumut_outputs[@]} Flumut outputs"
  # Index 0 only exists when something was found; referencing it on an empty
  # array is an error under `set -u`.
  if (( ${#flumut_outputs[@]} > 0 )); then
    info "1st Flumut output: ${flumut_outputs[0]}"
    info "1st Flumut output realpath: $(realpath -- "${flumut_outputs[0]}")"
  fi
fi
# Symlink selected results to the temporary directory
#
# For every sample with a consensus FASTA under consensus/bcftools, a
# "$ZIP_PACKAGE_DIR/<sample>/" tree is created and populated with links to the
# sample's consensus, the subtyping report, FluMut outputs (except *.fasta),
# VCFs, annotations, BLASTN mismatch report, GenoFLU outputs and Nextclade TSV.
# Samples without a consensus are skipped with a warning.

# link_result SRC DEST
#   Create (or replace) a symlink at DEST pointing at the absolute path of SRC.
#   Both paths are passed quoted, so whitespace in them is preserved.
link_result() {
  local target
  target=$(realpath -- "$1")
  ln -sf -- "$target" "$2"
}

# The ${arr[@]+"${arr[@]}"} form expands to nothing for an empty array instead
# of tripping `set -u` on bash releases older than 4.4.
for sample in ${samples[@]+"${samples[@]}"}; do
  sample_dir="$ZIP_PACKAGE_DIR/$sample"
  if [[ $VERBOSE == true ]]; then
    info "Symlinking sample '$sample' results to $ZIP_PACKAGE_DIR"
  fi

  # First matching consensus sequence for the sample; -quit stops at one match
  # so the result is always a single path.
  consensus=$(find "$OUTPUT_DIR/consensus/bcftools" -name "${sample}.consensus.fasta" -print -quit)
  if [[ -z "$consensus" ]]; then
    warning "Consensus sequence not found for sample '$sample'"
    continue
  fi
  if [[ $VERBOSE == true ]]; then
    info "Found consensus sequence for sample '$sample' at $consensus"
  fi

  mkdir -p -- "$sample_dir/"{Annotations,FluMut,Variants,GenoFLU}
  link_result "$consensus" "$sample_dir/${sample}.consensus.fasta"

  # Only link the subtyping report when it exists, to avoid a dangling link.
  if [[ -f "$subtyping_report" ]]; then
    link_result "$subtyping_report" "$sample_dir/${sample}-nf-flu-subtyping-report.xlsx"
  fi

  for flumut_out in ${flumut_outputs[@]+"${flumut_outputs[@]}"}; do
    # if FluMut output file extension is '.fasta', skip it
    if [[ "${flumut_out##*/}" == *.fasta ]]; then
      continue
    fi
    ln -sf -- "$flumut_out" "$sample_dir/FluMut/${sample}-${flumut_out##*/}"
  done

  # VCFs: read NUL-delimited find output so no path is word-split.
  vcfs=()
  if [[ -d "$OUTPUT_DIR/variants" ]]; then
    while IFS= read -r -d '' vcf; do
      vcfs+=("$vcf")
      link_result "$vcf" "$sample_dir/Variants/${vcf##*/}"
    done < <(find "$OUTPUT_DIR/variants/" \( -name "${sample}.*.clair3.vcf.gz" -o -name "${sample}.*.freebayes.vcf" \) -print0)
  fi
  if [[ $VERBOSE == true ]]; then
    info "Found ${#vcfs[@]} VCF files for sample '$sample'"
    if (( ${#vcfs[@]} > 0 )); then
      info "1st VCF: ${vcfs[0]}"
      info "1st VCF realpath: $(realpath -- "${vcfs[0]}")"
    fi
  fi

  # Annotations: the bcftools sub-tree is only consulted when the sample's main
  # annotation directory exists.
  if [[ -d "$OUTPUT_DIR/annotation/$sample" ]]; then
    annotation_dirs=("$OUTPUT_DIR/annotation/$sample")
    if [[ -d "$OUTPUT_DIR/annotation/bcftools/$sample" ]]; then
      annotation_dirs+=("$OUTPUT_DIR/annotation/bcftools/$sample")
    fi
    while IFS= read -r -d '' annotation; do
      link_result "$annotation" "$sample_dir/Annotations/${annotation##*/}"
    done < <(find "${annotation_dirs[@]}" -name "${sample}.*" -print0)
  fi

  if [[ -d "$OUTPUT_DIR/mismatch_report" ]]; then
    # -quit instead of `| head -n1`: no pipeline, so pipefail cannot turn an
    # early-closed pipe into a failure.
    mismatch_report=$(find "$OUTPUT_DIR/mismatch_report" -name "${sample}-blastn-report.xlsx" -print -quit)
    if [[ -z "$mismatch_report" ]]; then
      warning "BLASTN mismatch report not found for sample '$sample'"
    else
      if [[ $VERBOSE == true ]]; then
        info "Found mismatch report for sample '$sample' at '$mismatch_report'"
      fi
      link_result "$mismatch_report" "$sample_dir/${mismatch_report##*/}"
    fi
  fi

  if [[ -d "$OUTPUT_DIR/genoflu" ]]; then
    genoflu_outputs=()
    while IFS= read -r -d '' genoflu_out; do
      genoflu_outputs+=("$genoflu_out")
      link_result "$genoflu_out" "$sample_dir/GenoFLU/${genoflu_out##*/}"
    done < <(find "$OUTPUT_DIR/genoflu/" -name "${sample}.*" -print0)
    # Test the element count: expanding element 0 of an empty array is an
    # "unbound variable" error under `set -u`.
    if (( ${#genoflu_outputs[@]} == 0 )); then
      warning "GenoFLU output not found for sample '$sample'"
    elif [[ $VERBOSE == true ]]; then
      info "Found GenoFLU output for sample '$sample' at '${genoflu_outputs[0]}'"
    fi
  fi

  if [[ -f "$OUTPUT_DIR/nextclade/nextclade.tsv" ]]; then
    link_result "$OUTPUT_DIR/nextclade/nextclade.tsv" "$sample_dir/${sample}.nextclade.tsv"
  fi
done
# Package the staging directory as both a zip and a tar.gz archive.
if [[ $VERBOSE == true ]]; then
  info "Listing files in $ZIP_PACKAGE_DIR"
  # The listing is informational only, so a missing or failing `tree` must not
  # abort the script.
  tree "$ZIP_PACKAGE_DIR" || true
fi

info "Creating zip package '$ZIP_PACKAGE_DIR.zip'..."
# zip adds to or updates an archive that already exists, which would keep
# entries from an earlier run. Remove any old archive so the package reflects
# this run only.
rm -f -- "$ZIP_PACKAGE_DIR.zip"
# Without -y, zip stores the files the symlinks refer to rather than the links.
zip -r "$ZIP_PACKAGE_DIR.zip" "$ZIP_PACKAGE_DIR"
info "Created zip package at $(realpath "$ZIP_PACKAGE_DIR.zip")"

info "Creating tar.gz '${ZIP_PACKAGE_DIR}.tar.gz'"
# --dereference archives the link targets instead of the symlinks themselves.
tar --dereference -czf "${ZIP_PACKAGE_DIR}.tar.gz" "$ZIP_PACKAGE_DIR"
info "Created tar.gz package at $(realpath "$ZIP_PACKAGE_DIR.tar.gz")"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment