Last active
May 23, 2024 21:25
-
-
Save peterk87/54e29441a741265af7c98df1675b330f to your computer and use it in GitHub Desktop.
CFIA-NCFAD/nf-villumina Bash wrapper script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Handle Ctrl+C (SIGINT): report the interruption and abort the script.
# Globals:   none
# Arguments: none
# Outputs:   a bold red error message on stderr
# Returns:   never returns; exits the shell with status 1
handle_interrupt() {
  # Diagnostics go to stderr so they are never captured by a pipe or a
  # redirection of the script's regular output.
  printf '\n\033[1;31mERROR:\033[1m Script interrupted by user (Ctrl+C)\033[0m\n' >&2
  exit 1
}
# Set up the trap to catch SIGINT (Ctrl+C)
trap handle_interrupt SIGINT

# Define default values
TOTAL_CPUS=$(nproc)
DEFAULT_KRAKEN2_DB="/opt/DB/kraken2/nt_20231129"
DEFAULT_CENTRIFUGE_DB="/opt/DB/centrifuge/nt-2020-02-04/nt"
DEFAULT_BLASTN_NT_DB="/opt/DB/blast/nt/nt"

# Get the amount of available memory in KB. Kernels that do not report
# MemAvailable fall back to MemFree so the value is never empty.
FREE_MEM_KB=$(awk '
  /^MemAvailable:/ { print $2; found = 1; exit }
  /^MemFree:/      { free = $2 }
  END              { if (!found) print free + 0 }
' /proc/meminfo)

# Convert KB to GB with two decimals. awk is already required above, so this
# avoids an extra dependency on bc and always prints a leading zero (0.50, not
# .50) for values below 1 GB.
FREE_MEM_GB=$(awk -v kb="${FREE_MEM_KB:-0}" 'BEGIN { printf "%.2f", kb / 1024 / 1024 }')
# Print usage information and exit.
# Globals:   TOTAL_CPUS, FREE_MEM_GB, DEFAULT_BLASTN_NT_DB, DEFAULT_KRAKEN2_DB,
#            DEFAULT_CENTRIFUGE_DB (all read, shown as the defaults)
# Arguments: $1 - optional exit status (default: 0)
# Outputs:   the help text on stdout
# Returns:   never returns; exits the shell with the requested status
usage() {
  # A single heredoc replaces one echo per line; the emitted text is unchanged.
  cat <<EOF
Usage: $0 [-h] [-n NCPUS] [-m MEM_TO_USE_GB] [BLASTN_NT_DB] [KRAKEN2_DB] [CENTRIFUGE_DB]
Options:
 -h Display this help message
 -n NCPUS Number of CPUs to use (default: total CPUs; $TOTAL_CPUS)
 -m MEM_TO_USE_GB Amount of memory to use in GB (default: free memory; $FREE_MEM_GB GB)
Arguments:
 BLASTN_NT_DB Path to BLASTN NT database (default: $DEFAULT_BLASTN_NT_DB)
 KRAKEN2_DB Path to Kraken2 database (default: $DEFAULT_KRAKEN2_DB)
 CENTRIFUGE_DB Path to Centrifuge database (default: $DEFAULT_CENTRIFUGE_DB)
EOF
  exit "${1:-0}"
}
# Parse command-line options into NCPUS and MEM_TO_USE_GB.
# Globals:   NCPUS, MEM_TO_USE_GB (written when the option is given);
#            OPTIND (written; left pointing at the first positional argument)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   error text and the usage message on stderr for invalid input
# Returns:   0 on success; exits 0 after -h; exits 1 on invalid input
parse_options() {
  local opt
  local -r mem_re='^[0-9]+([.][0-9]+)?$'
  # OPTIND is deliberately NOT local: the caller uses it to shift the options.
  OPTIND=1
  while getopts ":hn:m:" opt; do
    case "${opt}" in
      h)
        usage
        ;;
      n)
        # Reject values that would silently evaluate to 0 in arithmetic.
        if [[ ! "$OPTARG" =~ ^[1-9][0-9]*$ ]]; then
          echo "Invalid value for -n: '$OPTARG' (expected a positive integer)" >&2
          exit 1
        fi
        NCPUS=$OPTARG
        ;;
      m)
        if [[ ! "$OPTARG" =~ $mem_re ]]; then
          echo "Invalid value for -m: '$OPTARG' (expected a number of GB)" >&2
          exit 1
        fi
        MEM_TO_USE_GB=$OPTARG
        ;;
      \?)
        echo "Invalid option: -$OPTARG" >&2
        # usage terminates the shell it runs in, so run it in a subshell and
        # exit non-zero here: a bad invocation must not report success.
        (usage) >&2
        exit 1
        ;;
      :)
        echo "Option -$OPTARG requires an argument." >&2
        (usage) >&2
        exit 1
        ;;
    esac
  done
}

parse_options "$@"
# Drop the parsed options so $1.. are the positional database arguments.
shift $((OPTIND - 1))
# Fall back to the machine-derived defaults for anything not given via options.
NCPUS=${NCPUS:-$TOTAL_CPUS}
# Half of the CPUs, but never fewer than one: integer division turns a single
# CPU into 0, which is not a usable CPU count for a process.
HALF_CPUS=$((NCPUS / 2))
if ((HALF_CPUS < 1)); then
  HALF_CPUS=1
fi
MEM_TO_USE_GB=${MEM_TO_USE_GB:-$FREE_MEM_GB}

# Set default database paths (positional arguments override the defaults)
BLASTN_NT_DB=${1:-$DEFAULT_BLASTN_NT_DB}
KRAKEN2_DB=${2:-$DEFAULT_KRAKEN2_DB}
CENTRIFUGE_DB=${3:-$DEFAULT_CENTRIFUGE_DB}
# Print a timestamped, colourised error message.
# Arguments: $1 - message text (printed verbatim; backslashes are not expanded)
# Outputs:   "<ISO-8601 timestamp> ERROR: <message>" on stderr
# Returns:   the exit status of printf
error() {
  # The colour codes live in the format string; the message goes through %s so
  # that backslashes in it (e.g. in paths) are never treated as escapes.
  printf '%s \033[1;31mERROR: \033[0m\033[1m%s\033[0m\n' "$(date -Is)" "$1" >&2
}
# Print a timestamped, colourised informational message.
# Arguments: $1 - message text (printed verbatim; backslashes are not expanded)
# Outputs:   "<ISO-8601 timestamp> INFO: <message>" on stdout
# Returns:   the exit status of printf
info() {
  # The colour codes live in the format string; the message goes through %s so
  # that backslashes in it (e.g. in paths) are never treated as escapes.
  printf '%s \033[1;32mINFO: \033[0m\033[1m%s\033[0m\n' "$(date -Is)" "$1"
}
# Check if the reads directory exists and contains at least two .fastq.gz files.
# The files are counted with a glob instead of parsing `ls` output; nullglob
# makes a pattern without matches expand to an empty list (also when the
# directory itself is missing).
shopt -s nullglob
fastq_files=(reads/*.fastq.gz)
shopt -u nullglob
if [[ ! -d "reads/" ]] || ((${#fastq_files[@]} < 2)); then
  error "'reads/' directory does not exist or contains fewer than two .fastq.gz files!"
  exit 1
fi
info "Using ${NCPUS} CPUs and ${MEM_TO_USE_GB} GB for nf-villumina analysis"
info "Creating nf-villumina.environment.yml"
# Write the Conda environment definition. The delimiter is quoted because the
# YAML contains nothing to expand. YAML nesting is significant: the entries
# below "- pip:" must be indented deeper than it, otherwise they are read as
# Conda packages.
# NOTE(review): all six entries after "- pip:" are assumed to be pip-installed
# packages — confirm against the intended environment.
cat > nf-villumina.environment.yml <<'EOL'
name: nf-villumina
channels:
  - conda-forge
  - bioconda
  - defaults
dependencies:
  - bbmap
  - blast
  - centrifuge-core
  - curl
  - fastp
  - fastqc
  - kraken2
  - mash
  - megahit
  - pbgzip
  - samtools
  - seqtk
  - shovill
  - spades
  - unicycler
  - openjdk=21
  - python
  - pip
  - pip:
    - filter_classified_reads
    - biopython
    - click
    - attrs
    - numpy
    - pandas
EOL
# Create the Conda env only when it does not exist yet: `conda env create`
# fails on an existing env, which would happen on every re-run of this script.
if conda env list | awk '$1 == "nf-villumina" { found = 1 } END { exit !found }'; then
  info "Conda env nf-villumina already exists. Skipping creation..."
else
  info "Creating nf-villumina Conda env"
  if ! conda env create -f nf-villumina.environment.yml; then
    echo "Failed to create conda environment nf-villumina." >&2
    exit 1
  fi
fi

# Source the conda.sh script to use conda in the script
# (it defines the `conda activate` shell function for non-interactive shells)
source "$(conda info --base)/etc/profile.d/conda.sh"

# Activate the conda environment
conda activate nf-villumina

# Check if the environment was activated successfully
if [[ "$CONDA_DEFAULT_ENV" == "nf-villumina" ]]; then
  echo "Conda environment nf-villumina activated!"
else
  echo "Failed to activate conda environment nf-villumina." >&2
  exit 1
fi
info "Exporting current Conda env for debugging"
# Overwrite the snapshot on each run: appending would concatenate several
# environment documents into one file that is no longer a valid env export.
conda env export | tee nf-villumina.conda-env.yml
info "Creating custom config for large centrifuge and kraken2 indexes"
# Write a Nextflow config with per-process resource overrides. The heredoc
# delimiter is unquoted on purpose so HALF_CPUS and NCPUS are expanded.
# NOTE(review): the KRAKEN2 memory is derived from NCPUS (one GB per CPU), not
# from MEM_TO_USE_GB — confirm this is intentional.
cat > nf-villumina.big-index.config <<EOL
trace.overwrite = true
dag.overwrite = true
report.overwrite = true
timeline.overwrite = true
process {
  withName:CENTRIFUGE {
    errorStrategy = 'retry'
    cpus = 1
    memory = 300.GB
    time = '2d'
  }
  withName:KRAKEN2 {
    errorStrategy = 'retry'
    cpus = ${HALF_CPUS}
    memory = ${NCPUS}.GB
    time = '3d'
  }
}
EOL
# Date-stamped list of virus (taxid 10239) species taxids.
TAXIDLIST="$(date -I)-viruses-10239.taxidlist"
# -s (exists AND non-empty) instead of -f: an empty list left behind by an
# earlier failed attempt must be regenerated, not reused.
if [[ -s "$TAXIDLIST" ]]; then
  info "Taxid list file '$TAXIDLIST' already exists. Skipping get_species_taxids.sh step..."
else
  info "Getting latest viruses taxids from NCBI with 'get_species_taxids.sh' and outputting to $TAXIDLIST"
  # Write to a temporary file first so that a failed or interrupted download
  # never leaves a partial list under the final name.
  if get_species_taxids.sh -t 10239 > "${TAXIDLIST}.tmp" && [[ -s "${TAXIDLIST}.tmp" ]]; then
    mv -- "${TAXIDLIST}.tmp" "$TAXIDLIST"
  else
    rm -f -- "${TAXIDLIST}.tmp"
    error "Failed to get viruses taxids with 'get_species_taxids.sh'"
    exit 1
  fi
fi
info "Pulling latest version of nf-villumina"
nextflow pull CFIA-NCFAD/nf-villumina

info "Running nf-villumina"
# Every expansion is quoted so database paths containing spaces or glob
# characters reach Nextflow as a single, unmodified argument. The --reads
# pattern is quoted so that Nextflow, not the shell, expands it.
nextflow run CFIA-NCFAD/nf-villumina \
  -c nf-villumina.big-index.config \
  -resume \
  --reads "reads/*R{1,2}*.fastq.gz" \
  --blastn_db "$BLASTN_NT_DB" \
  --centrifuge_db "$CENTRIFUGE_DB" \
  --kraken2_db "$KRAKEN2_DB" \
  --blastn_taxids "$TAXIDLIST" \
  --max_cpus "$NCPUS" \
  --max_memory "${MEM_TO_USE_GB} GB"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment