peterk87 · May 23, 2024 21:25
diff --git a/nf-villumina.sh b/nf-villumina.sh
 #!/bin/bash

 # SIGINT (Ctrl+C) handler.
 # Outputs: a coloured "Script interrupted" message on stderr (diagnostics do
 #          not belong on stdout, where they could end up in piped output).
 # Exits:   130, i.e. 128 + 2 (SIGINT), the conventional status of a process
 #          stopped by Ctrl+C; still non-zero for callers that only test for
 #          failure.
 handle_interrupt() {
    echo -e "\n\033[1;31mERROR:\033[1m Script interrupted by user (Ctrl+C)\033[0m" >&2
    exit 130
 }

 # Set up the trap to catch SIGINT (Ctrl+C)
 trap handle_interrupt SIGINT

 # Define default values
 TOTAL_CPUS=$(nproc)
 DEFAULT_KRAKEN2_DB="/opt/DB/kraken2/nt_20231129"
 DEFAULT_CENTRIFUGE_DB="/opt/DB/centrifuge/nt-2020-02-04/nt"
 DEFAULT_BLASTN_NT_DB="/opt/DB/blast/nt/nt"

 # Available memory in KB, taken from the MemAvailable line of /proc/meminfo.
 # The pattern is anchored so that only that exact field can match.
 FREE_MEM_KB=$(LC_ALL=C awk '/^MemAvailable:/ {print $2}' /proc/meminfo 2>/dev/null)
 if [[ -n "$FREE_MEM_KB" ]]; then
    # KB -> GB with two decimals. awk does the arithmetic so the script does
    # not depend on bc being installed, and values below 1 GB keep their
    # leading zero ("0.50" rather than ".50").
    FREE_MEM_GB=$(LC_ALL=C awk -v kb="$FREE_MEM_KB" 'BEGIN { printf "%.2f", kb / 1024 / 1024 }')
 else
    # No usable value: leave the default empty and tell the user how to proceed.
    echo "WARNING: could not read MemAvailable from /proc/meminfo; set the memory with -m" >&2
    FREE_MEM_GB=""
 fi

 # Show the command-line help on stdout and end the script with status 0.
 # Globals read: TOTAL_CPUS, FREE_MEM_GB, DEFAULT_BLASTN_NT_DB,
 #               DEFAULT_KRAKEN2_DB, DEFAULT_CENTRIFUGE_DB (shown as defaults).
 usage() {
    # One printf call emits every help line; each argument becomes one line.
    printf '%s\n' \
        "Usage: $0 [-h] [-n NCPUS] [-m MEM_TO_USE_GB] [BLASTN_NT_DB] [KRAKEN2_DB] [CENTRIFUGE_DB]" \
        "Options:" \
        "  -h                   Display this help message" \
        "  -n NCPUS             Number of CPUs to use (default: total CPUs; $TOTAL_CPUS)" \
        "  -m MEM_TO_USE_GB     Amount of memory to use in GB (default: free memory; $FREE_MEM_GB GB)" \
        "Arguments:" \
        "  BLASTN_NT_DB         Path to BLASTN NT database (default: $DEFAULT_BLASTN_NT_DB)" \
        "  KRAKEN2_DB           Path to Kraken2 database (default: $DEFAULT_KRAKEN2_DB)" \
        "  CENTRIFUGE_DB        Path to Centrifuge database (default: $DEFAULT_CENTRIFUGE_DB)"
    exit 0
 }

 # Parse command-line options.
 #   -h                 print the help text and exit 0
 #   -n NCPUS           store the CPU count in NCPUS
 #   -m MEM_TO_USE_GB   store the memory amount in MEM_TO_USE_GB
 # An unknown option or a missing option argument prints the help text on
 # stderr and exits with status 1, so callers can tell bad usage from -h.
 while getopts ":hn:m:" opt; do
    case "${opt}" in
        h)
            usage
            ;;
        n)
            NCPUS=$OPTARG
            ;;
        m)
            MEM_TO_USE_GB=$OPTARG
            ;;
        \?)
            echo "Invalid option: -$OPTARG" >&2
            # usage ends with `exit 0`; running it in a subshell confines that
            # exit to the subshell so this branch can still report failure.
            ( usage ) >&2
            exit 1
            ;;
        :)
            echo "Option -$OPTARG requires an argument." >&2
            ( usage ) >&2
            exit 1
            ;;
    esac
 done
 # Drop the parsed options; the positional database paths are left in $1..$3.
 shift $((OPTIND - 1))


 # Fall back to the detected totals when -n / -m were not given.
 NCPUS=${NCPUS:-$TOTAL_CPUS}
 MEM_TO_USE_GB=${MEM_TO_USE_GB:-$FREE_MEM_GB}

 # NCPUS is used in shell arithmetic and written into the generated Nextflow
 # config, so anything other than a positive integer is rejected up front.
 if ! [[ "$NCPUS" =~ ^[1-9][0-9]*$ ]]; then
    echo "Invalid number of CPUs: '$NCPUS' (expected a positive integer)" >&2
    exit 1
 fi

 # MEM_TO_USE_GB is handed to Nextflow as "<value> GB"; require a plain
 # non-negative decimal number (a bc-style ".50" is accepted as well).
 MEM_NUMBER_RE='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
 if ! [[ "$MEM_TO_USE_GB" =~ $MEM_NUMBER_RE ]]; then
    echo "Invalid memory amount: '$MEM_TO_USE_GB' (expected a number of GB)" >&2
    exit 1
 fi

 # Half of the CPUs, but never fewer than 1: integer division turns 1 / 2
 # into 0, which would end up as `cpus = 0` in the generated config.
 HALF_CPUS=$((NCPUS / 2))
 if (( HALF_CPUS < 1 )); then
    HALF_CPUS=1
 fi

 # Set default database paths
 BLASTN_NT_DB=${1:-$DEFAULT_BLASTN_NT_DB}
 KRAKEN2_DB=${2:-$DEFAULT_KRAKEN2_DB}
 CENTRIFUGE_DB=${3:-$DEFAULT_CENTRIFUGE_DB}


 # Print a timestamped, colourised ERROR line on stderr.
 # Arguments: $1 - message text (echo -e interprets backslash escapes in it)
 # Outputs:   "<ISO-8601 timestamp>  ERROR: <message>" with ANSI colour codes,
 #            written to stderr so it never mixes into captured stdout.
 error() {
  echo -e "$(date -Is)  \033[1;31mERROR: \033[0m\033[1m$1\033[0m" >&2
 }

 # Print a timestamped, colourised INFO line on stdout.
 # Arguments: $1 - message text (backslash escapes in it are interpreted)
 info() {
  local stamp
  stamp=$(date -Is)
  # %b expands backslash escapes the same way `echo -e` does.
  printf '%b\n' "${stamp}  \033[1;32mINFO: \033[0m\033[1m${1}\033[0m"
 }

 # Require a reads/ directory that holds at least two .fastq.gz files.
 # The files are counted by expanding the glob into an array rather than by
 # parsing `ls` output; nullglob makes a pattern without matches expand to
 # zero elements instead of the literal pattern.
 shopt -s nullglob
 READ_FILES=(reads/*.fastq.gz)
 shopt -u nullglob
 if [[ ! -d "reads/" ]] || (( ${#READ_FILES[@]} < 2 )); then
    error "'reads/' directory does not exist or contains fewer than two .fastq.gz files!"
    exit 1
 fi

 info "Using ${NCPUS} CPUs and ${MEM_TO_USE_GB} GB for nf-villumina analysis"

 # Write the Conda environment specification (env name, channels, Conda
 # packages and a nested pip package list) to nf-villumina.environment.yml in
 # the current directory, overwriting any existing file. The heredoc body
 # contains no `$` expansions, so it is written out as-is.
 info "Creating nf-villumina.environment.yml"

 cat > nf-villumina.environment.yml <<EOL
 name: nf-villumina
 channels:
  - conda-forge
  - bioconda
  - defaults
 dependencies:
  - bbmap
  - blast
  - centrifuge-core
  - curl
  - fastp
  - fastqc
  - kraken2
  - mash
  - megahit
  - pbgzip
  - samtools
  - seqtk
  - shovill
  - spades
  - unicycler
  - openjdk=21
  - python
  - pip
  - pip:
    - filter_classified_reads
    - biopython
    - click
    - attrs
    - numpy
    - pandas
 EOL

 # Source the conda.sh script so that `conda activate` is usable in this
 # non-interactive script.
 source "$(conda info --base)/etc/profile.d/conda.sh"

 # Create the env only when it does not exist yet, so that re-running the
 # script does not call `conda env create` on an already existing env.
 # The first column of `conda env list` is the env name.
 if conda env list | awk '{print $1}' | grep -qx 'nf-villumina'; then
    info "Conda env 'nf-villumina' already exists. Skipping creation..."
 else
    info "Creating nf-villumina Conda env"
    if ! conda env create -f nf-villumina.environment.yml; then
        error "Failed to create Conda env from nf-villumina.environment.yml"
        exit 1
    fi
 fi

 # Activate the conda environment
 conda activate nf-villumina

 # Check if the environment was activated successfully
 if [[ "$CONDA_DEFAULT_ENV" == "nf-villumina" ]]; then
    echo "Conda environment nf-villumina activated!"
 else
    echo "Failed to activate conda environment nf-villumina." >&2
    exit 1
 fi

 info "Exporting current Conda env for debugging"
 # NOTE(review): tee -a appends, so this file gains one export per run.
 conda env export | tee -a nf-villumina.conda-env.yml

 # Write nf-villumina.big-index.config: overwrite flags for the trace, dag,
 # report and timeline outputs plus resource overrides for the CENTRIFUGE and
 # KRAKEN2 processes. The heredoc delimiter is unquoted, so ${HALF_CPUS} and
 # ${NCPUS} are expanded by the shell at the moment the file is written.
 # NOTE(review): KRAKEN2 memory is set to NCPUS gigabytes (not MEM_TO_USE_GB)
 # and CENTRIFUGE memory is fixed at 300.GB whatever -m says; confirm that
 # both are intentional.
 info "Creating custom config for large centrifuge and kraken2 indexes"

 cat > nf-villumina.big-index.config <<EOL
 trace.overwrite = true
 dag.overwrite = true
 report.overwrite = true
 timeline.overwrite = true

 process {
    withName:CENTRIFUGE {
        errorStrategy = 'retry'
        cpus = 1
        memory = 300.GB
        time = '2d'
    }
    withName:KRAKEN2 {
        errorStrategy = 'retry'
        cpus = ${HALF_CPUS}
        memory = ${NCPUS}.GB
        time = '3d'
    }
 }
 EOL


 # The taxid list is cached under a file name that starts with today's date.
 TAXIDLIST="$(date -I)-viruses-10239.taxidlist"
 # -s (exists and is non-empty) rather than -f: an empty file left behind by
 # an earlier failed download must not be treated as a valid cache.
 if [[ -s "$TAXIDLIST" ]]; then
  info "Taxid list file '$TAXIDLIST' already exists. Skipping get_species_taxids.sh step..."
 else
  info "Getting latest viruses taxids from NCBI with 'get_species_taxids.sh' and outputting to $TAXIDLIST"
  # Download to a temporary name and rename only on success, so a failed or
  # interrupted run never leaves a truncated list under the final name.
  # NOTE(review): assumes get_species_taxids.sh exits 0 on success; the
  # non-empty check covers the case where it fails silently.
  if get_species_taxids.sh -t 10239 > "${TAXIDLIST}.tmp" && [[ -s "${TAXIDLIST}.tmp" ]]; then
    mv -- "${TAXIDLIST}.tmp" "$TAXIDLIST"
  else
    rm -f -- "${TAXIDLIST}.tmp"
    error "get_species_taxids.sh failed or produced no taxids"
    exit 1
  fi
 fi

 info "Pulling latest version of nf-villumina"

 nextflow pull CFIA-NCFAD/nf-villumina

 info "Running nf-villumina"

 # Every variable argument is quoted so that a database path or file name
 # containing whitespace or glob characters reaches Nextflow as one argument.
 # The --reads pattern is quoted as well, so the shell passes it on unexpanded.
 nextflow run CFIA-NCFAD/nf-villumina \
  -c nf-villumina.big-index.config \
  -resume \
  --reads "reads/*R{1,2}*.fastq.gz" \
  --blastn_db "$BLASTN_NT_DB" \
  --centrifuge_db "$CENTRIFUGE_DB" \
  --kraken2_db "$KRAKEN2_DB" \
  --blastn_taxids "$TAXIDLIST" \
  --max_cpus "$NCPUS" --max_memory "${MEM_TO_USE_GB} GB"
	#!/bin/bash

	# SIGINT (Ctrl+C) handler.
	# Outputs: a coloured "Script interrupted" message on stderr (diagnostics do
	#          not belong on stdout, where they could end up in piped output).
	# Exits:   130, i.e. 128 + 2 (SIGINT), the conventional status of a process
	#          stopped by Ctrl+C; still non-zero for callers that only test for
	#          failure.
	handle_interrupt() {
	    echo -e "\n\033[1;31mERROR:\033[1m Script interrupted by user (Ctrl+C)\033[0m" >&2
	    exit 130
	}

	# Set up the trap to catch SIGINT (Ctrl+C)
	trap handle_interrupt SIGINT

	# Define default values
	TOTAL_CPUS=$(nproc)
	DEFAULT_KRAKEN2_DB="/opt/DB/kraken2/nt_20231129"
	DEFAULT_CENTRIFUGE_DB="/opt/DB/centrifuge/nt-2020-02-04/nt"
	DEFAULT_BLASTN_NT_DB="/opt/DB/blast/nt/nt"

	# Available memory in KB, taken from the MemAvailable line of /proc/meminfo.
	# The pattern is anchored so that only that exact field can match.
	FREE_MEM_KB=$(LC_ALL=C awk '/^MemAvailable:/ {print $2}' /proc/meminfo 2>/dev/null)
	if [[ -n "$FREE_MEM_KB" ]]; then
	    # KB -> GB with two decimals, computed by awk. (The previous line passed
	    # an escaped "\|" to echo, so the value was never actually piped to bc.)
	    FREE_MEM_GB=$(LC_ALL=C awk -v kb="$FREE_MEM_KB" 'BEGIN { printf "%.2f", kb / 1024 / 1024 }')
	else
	    # No usable value: leave the default empty and tell the user how to proceed.
	    echo "WARNING: could not read MemAvailable from /proc/meminfo; set the memory with -m" >&2
	    FREE_MEM_GB=""
	fi

	# Show the command-line help on stdout and end the script with status 0.
	# Globals read: TOTAL_CPUS, FREE_MEM_GB, DEFAULT_BLASTN_NT_DB,
	#               DEFAULT_KRAKEN2_DB, DEFAULT_CENTRIFUGE_DB (shown as defaults).
	usage() {
	    # One printf call emits every help line; each argument becomes one line.
	    printf '%s\n' \
	        "Usage: $0 [-h] [-n NCPUS] [-m MEM_TO_USE_GB] [BLASTN_NT_DB] [KRAKEN2_DB] [CENTRIFUGE_DB]" \
	        "Options:" \
	        " -h Display this help message" \
	        " -n NCPUS Number of CPUs to use (default: total CPUs; $TOTAL_CPUS)" \
	        " -m MEM_TO_USE_GB Amount of memory to use in GB (default: free memory; $FREE_MEM_GB GB)" \
	        "Arguments:" \
	        " BLASTN_NT_DB Path to BLASTN NT database (default: $DEFAULT_BLASTN_NT_DB)" \
	        " KRAKEN2_DB Path to Kraken2 database (default: $DEFAULT_KRAKEN2_DB)" \
	        " CENTRIFUGE_DB Path to Centrifuge database (default: $DEFAULT_CENTRIFUGE_DB)"
	    exit 0
	}

	# Parse command-line options.
	#   -h                 print the help text and exit 0
	#   -n NCPUS           store the CPU count in NCPUS
	#   -m MEM_TO_USE_GB   store the memory amount in MEM_TO_USE_GB
	# An unknown option or a missing option argument prints the help text on
	# stderr and exits with status 1, so callers can tell bad usage from -h.
	while getopts ":hn:m:" opt; do
	    case "${opt}" in
	        h)
	            usage
	            ;;
	        n)
	            NCPUS=$OPTARG
	            ;;
	        m)
	            MEM_TO_USE_GB=$OPTARG
	            ;;
	        \?)
	            echo "Invalid option: -$OPTARG" >&2
	            # usage ends with `exit 0`; running it in a subshell confines that
	            # exit to the subshell so this branch can still report failure.
	            ( usage ) >&2
	            exit 1
	            ;;
	        :)
	            echo "Option -$OPTARG requires an argument." >&2
	            ( usage ) >&2
	            exit 1
	            ;;
	    esac
	done
	# Drop the parsed options; the positional database paths are left in $1..$3.
	shift $((OPTIND - 1))


	# Fall back to the detected totals when -n / -m were not given.
	NCPUS=${NCPUS:-$TOTAL_CPUS}
	MEM_TO_USE_GB=${MEM_TO_USE_GB:-$FREE_MEM_GB}

	# NCPUS is used in shell arithmetic and written into the generated Nextflow
	# config, so anything other than a positive integer is rejected up front.
	if ! [[ "$NCPUS" =~ ^[1-9][0-9]*$ ]]; then
	    echo "Invalid number of CPUs: '$NCPUS' (expected a positive integer)" >&2
	    exit 1
	fi

	# MEM_TO_USE_GB is handed to Nextflow as "<value> GB"; require a plain
	# non-negative decimal number (a bc-style ".50" is accepted as well).
	MEM_NUMBER_RE='^([0-9]+([.][0-9]*)?|[.][0-9]+)$'
	if ! [[ "$MEM_TO_USE_GB" =~ $MEM_NUMBER_RE ]]; then
	    echo "Invalid memory amount: '$MEM_TO_USE_GB' (expected a number of GB)" >&2
	    exit 1
	fi

	# Half of the CPUs, but never fewer than 1: integer division turns 1 / 2
	# into 0, which would end up as `cpus = 0` in the generated config.
	HALF_CPUS=$((NCPUS / 2))
	if (( HALF_CPUS < 1 )); then
	    HALF_CPUS=1
	fi

	# Set default database paths
	BLASTN_NT_DB=${1:-$DEFAULT_BLASTN_NT_DB}
	KRAKEN2_DB=${2:-$DEFAULT_KRAKEN2_DB}
	CENTRIFUGE_DB=${3:-$DEFAULT_CENTRIFUGE_DB}


	# Print a timestamped, colourised ERROR line on stderr.
	# Arguments: $1 - message text (echo -e interprets backslash escapes in it)
	# Outputs:   "<ISO-8601 timestamp> ERROR: <message>" with ANSI colour codes,
	#            written to stderr so it never mixes into captured stdout.
	error() {
	    echo -e "$(date -Is) \033[1;31mERROR: \033[0m\033[1m$1\033[0m" >&2
	}

	# Print a timestamped, colourised INFO line on stdout.
	# Arguments: $1 - message text (backslash escapes in it are interpreted)
	info() {
	    local stamp
	    stamp=$(date -Is)
	    # %b expands backslash escapes the same way `echo -e` does.
	    printf '%b\n' "${stamp} \033[1;32mINFO: \033[0m\033[1m${1}\033[0m"
	}

	# Require a reads/ directory that holds at least two .fastq.gz files.
	# The files are counted by expanding the glob into an array rather than by
	# parsing `ls` output; nullglob makes a pattern without matches expand to
	# zero elements instead of the literal pattern. (The previous test used the
	# escaped tokens "\|\|" and "\|", which the shell passes on as literal
	# arguments instead of treating them as the OR operator and a pipe.)
	shopt -s nullglob
	READ_FILES=(reads/*.fastq.gz)
	shopt -u nullglob
	if [[ ! -d "reads/" ]] || (( ${#READ_FILES[@]} < 2 )); then
	    error "'reads/' directory does not exist or contains fewer than two .fastq.gz files!"
	    exit 1
	fi

	info "Using ${NCPUS} CPUs and ${MEM_TO_USE_GB} GB for nf-villumina analysis"

	# Write the Conda environment specification (env name, channels, Conda
	# packages and a nested pip package list) to nf-villumina.environment.yml,
	# overwriting any existing file.
	info "Creating nf-villumina.environment.yml"

	# `<<-` strips the leading tab from every heredoc line and from the EOL
	# terminator, which is what lets this tab-indented heredoc terminate. The
	# YAML nesting itself is expressed with spaces after the tab; the pip
	# packages must sit one level below "- pip:" to be read as its sub-list.
	cat > nf-villumina.environment.yml <<-EOL
	name: nf-villumina
	channels:
	  - conda-forge
	  - bioconda
	  - defaults
	dependencies:
	  - bbmap
	  - blast
	  - centrifuge-core
	  - curl
	  - fastp
	  - fastqc
	  - kraken2
	  - mash
	  - megahit
	  - pbgzip
	  - samtools
	  - seqtk
	  - shovill
	  - spades
	  - unicycler
	  - openjdk=21
	  - python
	  - pip
	  - pip:
	    - filter_classified_reads
	    - biopython
	    - click
	    - attrs
	    - numpy
	    - pandas
	EOL

	# Source the conda.sh script so that `conda activate` is usable in this
	# non-interactive script.
	source "$(conda info --base)/etc/profile.d/conda.sh"

	# Create the env only when it does not exist yet, so that re-running the
	# script does not call `conda env create` on an already existing env.
	# The first column of `conda env list` is the env name.
	if conda env list | awk '{print $1}' | grep -qx 'nf-villumina'; then
	    info "Conda env 'nf-villumina' already exists. Skipping creation..."
	else
	    info "Creating nf-villumina Conda env"
	    if ! conda env create -f nf-villumina.environment.yml; then
	        error "Failed to create Conda env from nf-villumina.environment.yml"
	        exit 1
	    fi
	fi

	# Activate the conda environment
	conda activate nf-villumina

	# Check if the environment was activated successfully
	if [[ "$CONDA_DEFAULT_ENV" == "nf-villumina" ]]; then
	    echo "Conda environment nf-villumina activated!"
	else
	    echo "Failed to activate conda environment nf-villumina." >&2
	    exit 1
	fi

	info "Exporting current Conda env for debugging"
	# A real pipe into tee (the escaped "\|" was handed to conda as an argument).
	# NOTE(review): tee -a appends, so this file gains one export per run.
	conda env export | tee -a nf-villumina.conda-env.yml

	# Write nf-villumina.big-index.config: overwrite flags for the trace, dag,
	# report and timeline outputs plus resource overrides for the CENTRIFUGE and
	# KRAKEN2 processes. The heredoc delimiter is unquoted, so ${HALF_CPUS} and
	# ${NCPUS} are expanded by the shell at the moment the file is written.
	# NOTE(review): KRAKEN2 memory is set to NCPUS gigabytes (not MEM_TO_USE_GB)
	# and CENTRIFUGE memory is fixed at 300.GB whatever -m says; confirm that
	# both are intentional.
	info "Creating custom config for large centrifuge and kraken2 indexes"

	# `<<-` strips the leading tab from every heredoc line and from the EOL
	# terminator, which is what lets this tab-indented heredoc terminate.
	cat > nf-villumina.big-index.config <<-EOL
	trace.overwrite = true
	dag.overwrite = true
	report.overwrite = true
	timeline.overwrite = true

	process {
	    withName:CENTRIFUGE {
	        errorStrategy = 'retry'
	        cpus = 1
	        memory = 300.GB
	        time = '2d'
	    }
	    withName:KRAKEN2 {
	        errorStrategy = 'retry'
	        cpus = ${HALF_CPUS}
	        memory = ${NCPUS}.GB
	        time = '3d'
	    }
	}
	EOL


	# The taxid list is cached under a file name that starts with today's date.
	TAXIDLIST="$(date -I)-viruses-10239.taxidlist"
	# -s (exists and is non-empty) rather than -f: an empty file left behind by
	# an earlier failed download must not be treated as a valid cache.
	if [[ -s "$TAXIDLIST" ]]; then
	    info "Taxid list file '$TAXIDLIST' already exists. Skipping get_species_taxids.sh step..."
	else
	    info "Getting latest viruses taxids from NCBI with 'get_species_taxids.sh' and outputting to $TAXIDLIST"
	    # Download to a temporary name and rename only on success, so a failed or
	    # interrupted run never leaves a truncated list under the final name.
	    # NOTE(review): assumes get_species_taxids.sh exits 0 on success; the
	    # non-empty check covers the case where it fails silently.
	    if get_species_taxids.sh -t 10239 > "${TAXIDLIST}.tmp" && [[ -s "${TAXIDLIST}.tmp" ]]; then
	        mv -- "${TAXIDLIST}.tmp" "$TAXIDLIST"
	    else
	        rm -f -- "${TAXIDLIST}.tmp"
	        error "get_species_taxids.sh failed or produced no taxids"
	        exit 1
	    fi
	fi

	info "Pulling latest version of nf-villumina"

	nextflow pull CFIA-NCFAD/nf-villumina

	info "Running nf-villumina"

	# The --reads pattern carries `*` wildcards around R{1,2}; without them only
	# files literally named R1.fastq.gz / R2.fastq.gz would match, while the
	# reads check above accepts any reads/*.fastq.gz. The pattern stays quoted
	# so the shell passes it on unexpanded.
	# Every variable argument is quoted so that a database path or file name
	# containing whitespace or glob characters reaches Nextflow as one argument.
	nextflow run CFIA-NCFAD/nf-villumina \
	    -c nf-villumina.big-index.config \
	    -resume \
	    --reads "reads/*R{1,2}*.fastq.gz" \
	    --blastn_db "$BLASTN_NT_DB" \
	    --centrifuge_db "$CENTRIFUGE_DB" \
	    --kraken2_db "$KRAKEN2_DB" \
	    --blastn_taxids "$TAXIDLIST" \
	    --max_cpus "$NCPUS" --max_memory "${MEM_TO_USE_GB} GB"
No results found