Created
September 3, 2020 21:37
-
-
Save skchronicles/98e35e273a70ff53c79efe9f30903f2a to your computer and use it in GitHub Desktop.
Pipeline to characterize Human Endogenous Retrovirus (HERV) expression
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
set -euo pipefail | |
function usage() { cat << EOF | |
HERVx: Pipeline to characterize Human Endogenous Retrovirus (HERV) expression | |
USAGE: | |
HERVx.sh [OPTIONS] -r1 SRR4235541_1.fastq -r2 SRR4235541_2.fastq -o outdir_path | |
SYNOPSIS: | |
HERVx calculates Human Endogenous Retrovirus (HERV) expression in paired-end | |
RNA-seq data. Characterization of HERV retrotranscriptome is difficult due to | |
uncertainty that arises in fragment assignment because of sequence similarity. | |
Telescope directly addresses uncertainty in fragment assignment by reassigning | |
ambiguously mapped fragments to the most probable source transcript as | |
determined within a Bayesian statistical model. | |
The HERVx pipeline runs cutadapt to remove adapter sequences and to perform | |
quality-trimming, bowtie2 to align reads against the Human reference genome (hg38), | |
SAMtools to convert from SAM to BAM format and to sort reads by name, and Telescope | |
to characterize Human Endogenous Retrovirus (HERV) expression. | |
Required Arguments: | |
-r1, --read-1 [Type: File] Input R1 FastQ file. | |
-r2, --read-2 [Type: File] Input R2 FastQ file. | |
-o, --outdir [Type: Path] Path to output directory. | |
OPTIONS: | |
-b, --basename [Type: Str] Base name of the sample. This is the name of the | |
sample without the PATH and the file extension. | |
WHERE: | |
file extension = MateInfo + .fastq + .gz [opt] | |
Given: /tmp/S25_WT_1.fastq /tmp/S25_WT_2.fastq | |
the base name would be "S25_rep1". This string is | |
used to deterministically resolve output filenames. | |
If not provided, the base name will be resolved | |
automatically by cleaning the provided "-r1" input | |
filename against a list of common extensions. | |
-t, --threads [Type: Int] Number of threads (Default = 2). | |
-g, --gtf [Type: File] Input annotation file in GTF format. | |
NOTE: User must choose from one of the following | |
GTFs in the container ("/opt2/refs/"): | |
1. HERV_rmsk.hg38.v2.transcripts.gtf (Default) | |
2. HERV_rmsk.hg38.v2.genes.gtf | |
3. L1Base.hg38.v1.transcripts.gtf | |
4. retro.hg38.v1.transcripts.gtf | |
-p, --prior [Type: Int] Prior on theta for Telescope (Default = 200000). | |
Equivalent to adding N non-unique reads. | |
NOTE: It is recommended to set this prior to a | |
large value. This increases the penalty for | |
non-unique reads and improves accuracy. | |
-m, --max-iter [Type: Int] Maximum number of iterations to test convergence | |
of the EM algorithm (Default = 200). | |
-h, --help Displays usage and help information | |
Example: | |
$ HERVx.sh -r1 /tmp/SRR4235541_1.fastq -r2 /tmp/SRR4235541_2.fastq -outdir ERV_hg38 | |
$ singularity exec -B $PWD:$PWD ccbr_telescope.sif HERVx -r1 S25_1.fastq -r2 S25_2.fastq -o tmp | |
NOTES: | |
+ DockerHub: https://hub.docker.com/repository/docker/nciccbr/ccbr_telescope | |
+ Reference files are located in /opt2/ in the container's filesystem. | |
+ Dockerfile to build this image is located in /opt2/Dockerfile. | |
EOF | |
} | |
# Functions | |
function err() { cat <<< "$@" 1>&2; } | |
function fatal() { cat <<< "$@" 1>&2; usage; exit 1; } | |
function parser() { | |
# Adds parsed command-line args to GLOBAL $Arguments associative array | |
# + KEYS = short_cli_flag ("r1", "r2", "o", ...) | |
# + VALUES = parsed_user_value ("WT_1.fastq" "WT_2.fastq", "/scratch", ...) | |
# @INPUT "$@" = user command-line arguments | |
# @CALLS check() to see if the user provided all the required arguments | |
while [[ $# -gt 0 ]]; do | |
key="$1" | |
case $key in | |
-h | --help) usage && exit 0;; | |
-r1 | --read-1) provided "$key" "${2-}"; Arguments["r1"]="$2"; shift; shift;; | |
-r2 | --read-2) provided "$key" "${2-}"; Arguments["r2"]="$2"; shift; shift;; | |
-o | --outdir) provided "$key" "${2-}"; Arguments["o"]="$2"; shift; shift;; | |
-t | --threads) provided "$key" "${2-}"; Arguments["t"]="$2"; shift; shift;; | |
-g | --gtf) provided "$key" "${2-}"; Arguments["g"]="$2"; shift; shift;; | |
-p | --prior) provided "$key" "${2-}"; Arguments["p"]="$2"; shift; shift;; | |
-m | --max-iter) provided "$key" "${2-}"; Arguments["m"]="$2"; shift; shift;; | |
-b | --basename) provided "$key" "${2-}"; Arguments["b"]="$2"; shift; shift;; | |
-* | --*) err "Error: Failed to parse unsupported argument: '${key}'."; usage && exit 1;; | |
*) err "Error: Failed to parse unrecognized argument: '${key}'. Do any of your inputs have spaces?"; usage && exit 1;; | |
esac | |
done | |
# Check for required args | |
check | |
} | |
function provided() { | |
# Checks to see if the argument's value exists | |
# @INPUT $1 = name of user provided argument | |
# @INPUT $2 = value of user provided argument | |
# @CALLS fatal() if value is empty string or NULL | |
if [[ -z "${2-}" ]]; then | |
fatal "Fatal: Failed to provide value to '${1}'!"; | |
fi | |
} | |
function clean(){ | |
# Finds the base name of the sample | |
# @INPUT $1 = From optional basename argument | |
# @RETURN $bname = cleaned base name (PATH and EXT removed) | |
local bname=${1-} | |
local exts=("_1.fastq" "_2.fastq" ".R1.fastq" ".R2.fastq" "_R1.fastq" "_R2.fastq") | |
if [[ -z "$bname" ]]; then | |
bname="${Arguments[r1]}" # Determine base name from R1 input | |
fi | |
# Remove PATH and .gz extension | |
bname=$(basename $bname | sed 's/.gz$//g') | |
# Clean remaining extensions (MateInfo + ) | |
for ext in "${exts[@]}"; do | |
if [[ $bname == *${ext} ]]; then | |
bname=$(echo "$bname" | sed "s@$ext\$@@") | |
break # only remove one extension | |
fi | |
done | |
echo "$bname" | |
} | |
function check(){ | |
# Checks to see if user provided required arguments | |
# @INPUTS $Arguments = Global Associative Array | |
# @CALLS fatal() if user did NOT provide all the $required args | |
# List of required arguments | |
local required=("r1" "r2" "o") | |
echo -e "Provided Required Inputs" | |
for arg in "${required[@]}"; do | |
value=${Arguments[${arg}]-}; [[ -z "${value}" ]] && { | |
fatal "Failed to provide all required args.. missing ${arg}" | |
} | |
echo -e "\t-${arg}\t${value}" | |
done | |
} | |
function trim(){ | |
# Step 1.) Cutadapt: remove adapter sequences and quality-trimming | |
# @INPUT $1 = basename (sample name without PATH and file ext) | |
# @INPUT $2 = Read1 FastQ file | |
# @INPUT $3 = Read2 FastQ file | |
# @INPUT $4 = Outdir (trim directory) | |
# @INPUT $5 = Threads | |
# @CALLS cutadapt | |
echo -e "\nStarted Trimming at $(date) with INPUTS: $@" | |
mkdir -p "${4}" | |
cutadapt -j "${5}" -b file:/opt2/refs/trimmonatic_TruSeqv3_adapters.fa -B file:/opt2/refs/trimmonatic_TruSeqv3_adapters.fa \ | |
--nextseq-trim=2 --trim-n -n 5 -O 5 -q 10,10 -m 35:35 \ | |
-o ${4}/${1}_trimmed.R1.fastq.gz -p ${4}/${1}_trimmed.R2.fastq.gz \ | |
"$2" "$3" 1> ${4}/${1}_cutadapt.log | |
} | |
function align(){ | |
# Step 2.) Bowtie2: align against reference genome and sorts BAM file | |
# @INPUT $1 = basename (sample name without PATH and file ext) | |
# @INPUT $2 = Trimmed Read1 FastQ file | |
# @INPUT $3 = Trimmed Read2 FastQ file | |
# @INPUT $4 = Outdir (bam directory) | |
# @INPUT $5 = Threads | |
# @CALLS bowtie2, samtools | |
echo -e "\nStarting Alignment at $(date) with INPUTS: $@" | |
mkdir -p "${4}" | |
bowtie2 -p "${5}" -k 100 --very-sensitive-local --score-min "L,0,1.6" \ | |
-x /opt2/bowtie2/hg38 -1 "${2}" -2 "${3}" \ | |
-S ${4}/${1}_bowtie.sam | |
# Step 3.) SAMtools: covert SAM to BAM and sort by name | |
samtools view -@ "${5}" -b ${4}/${1}_bowtie.sam > ${4}/${1}_bowtie.bam | |
samtools sort -@ "${5}" -n ${4}/${1}_bowtie.bam -o ${4}/${1}_bowtie.sorted.bam | |
rm ${4}/${1}_bowtie.bam ${4}/${1}_bowtie.sam | |
} | |
function hervx(){ | |
# Step 3.) Telescope: characterizes of Human Endogenous Retrovirus (HERV) expression | |
# @INPUT $1 = basename (sample name without PATH and file ext) | |
# @INPUT $2 = BAM file (sorted by name) | |
# @INPUT $3 = Outdir (hervx directory) | |
# @INPUT $4 = GTF file | |
# @INPUT $5 = theta-prior (recommended value = 200000) | |
# @INPUT $6 = Max EM iterations (recommended value = 200) | |
# @CALLS telescope | |
echo -e "\nStarting Telescope at $(date) with INPUTS: $@" | |
mkdir -p "${3}" | |
telescope assign --theta_prior "${5}" --max_iter "${6}" \ | |
--outdir "$3" "${2}" "/opt2/refs/${4}" | |
} | |
function main(){ | |
# Parses args and runs trimming, aligment, and quantifies HERV expression | |
# @INPUT "$@" = command-line arguments | |
# @CALLS parser(), trim(), align(), hervx() | |
echo -e "\nHERVx (version 0.0.1)\t$(date)" | |
if [ $# -eq 0 ]; then usage; exit 1; fi | |
# Associative array to store parsed args | |
declare -Ag Arguments | |
# Parses user provided command-line arguments | |
parser "$@" | |
# Required Arguments | |
base=$(clean "${Arguments[b]-}") | |
read1="${Arguments[r1]}"; read2="${Arguments[r2]}" | |
outdir="${Arguments[o]%/}" | |
# Optional Arguments | |
threads="${Arguments[t]-2}" # default is 2 threads | |
gtf="${Arguments[g]-HERV_rmsk.hg38.v2.transcripts.gtf}" # default = HERV_rmsk.hg38.v2.transcripts.gtf | |
prior="${Arguments[p]-200000}" # default = 200000 | |
iter="${Arguments[m]-200}" # default = 200 | |
# HERVx Pipeline | |
trim "$base" "$read1" "$read2" "${outdir}/trim" "$threads" | |
align "$base" "${outdir}/trim/${base}_trimmed.R1.fastq.gz" "${outdir}/trim/${base}_trimmed.R2.fastq.gz" "${outdir}/bams" "$threads" | |
hervx "$base" "${outdir}/bams/${base}_bowtie.sorted.bam" "${outdir}/hervx/${base}" "${gtf}" "${prior}" "${iter}" | |
} | |
# Main: check usage, parse args, and run HERVx pipeline | |
main "$@" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment