Created
September 17, 2024 03:28
-
-
Save tomsing1/5e6d11bd6509cba8c3f2408b903a85a3 to your computer and use it in GitHub Desktop.
A script that retrieves BAM files from S3 and quantifies exon-level expression with featurecounts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash | |
set -e | |
set -o nounset | |
# This script uses a GTF file (e.g. generated by Stringtie) and | |
# 1. retrieves BAM files for individual samples from AWS S3 | |
# 2. quantifies the exons defined in the GTF file with featureCounts, | |
# e.g. for differential splicing analysis with limma::diffSplice | |
# | |
# Alignments are expected to be reverse-stranded and paired-end. | |
declare -r SAMPLES=(SAM1 SAM2 SAM3 \ | |
SAM4 SAM5 SAM6) # BAM file prefixes | |
declare -rx GTF_FILE="references/gene_annotations.gtf.gz" | |
declare -rx PREFIX="my_project" | |
declare -rx S3_ROOT="s3://my/bucket/and/prefix" | |
declare -rx S3_SOURCE="${S3_ROOT}/prefix/with/bam/files" | |
declare -rx S3_DEST="${S3_ROOT}/prefix/for/output/files" | |
declare -rx TMPDIR="$(mktemp -d tmp/XXXX)" | |
declare -rx CORES=$(nproc) | |
if which featureCounts >/dev/null; then | |
echo "Found featureCounts!" | |
else | |
echo "Can't find featureCounts, is it installed?" | |
exit 1 | |
fi | |
if ! test -f ${GTF_FILE}; then | |
echo "Cannot find file ${GTF_FILE}." | |
exit 1 | |
fi | |
decompress_gtf () { | |
gzip -cd ${GTF_FILE} > ${TMPDIR}/gene_annotation.gtf #| \ | |
#grep -v -P "\t\.\t.\t" \ | |
} | |
retrieve_bam () { | |
local SAMPLE=${1} | |
aws s3 sync \ | |
--exclude "*" \ | |
--include "${SAMPLE}*.bam" \ | |
--include "${SAMPLE}*.bam.bai" \ | |
"${S3_SOURCE}" \ | |
"${TMPDIR}/bam" | |
} | |
count () { | |
featureCounts \ | |
-F GTF \ | |
-a ${TMPDIR}/gene_annotation.gtf \ | |
-o ${PREFIX}_counts.txt \ | |
-T ${CORES} \ | |
-t exon \ | |
-f \ | |
-O \ | |
-s 2 \ | |
-p \ | |
-C \ | |
--countReadPairs \ | |
${TMPDIR}/bam/*.bam | |
} | |
decompress_gtf | |
for SAMPLE in ${SAMPLES[@]}; do retrieve_bam ${SAMPLE}; done | |
count | |
# cleanup | |
rm -rf "${TMPDIR}" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment