Skip to content

Instantly share code, notes, and snippets.

@mmterpstra
Last active December 1, 2025 11:00
Show Gist options
  • Select an option

  • Save mmterpstra/4f096a088223d598ad2caec0a7c00703 to your computer and use it in GitHub Desktop.

Select an option

Save mmterpstra/4f096a088223d598ad2caec0a7c00703 to your computer and use it in GitHub Desktop.
Downsample a bam file

This gist stores downsample scripts for downsampling either bams or fastqs using the gatk/picard toolkit.

  • SimpleDownSample.sh for downsampling bams without realignment
    • This includes downsampling, reverting remarking duplicates and remarking duplicates for downstream variant calling
  • PicardIlluminaBasespaceDownSample.sh for downsampling fastqs for use in illumina basespace application.
    • This assumes Illumina basespace compatible input files and returns 10% and 50% downsampled fastq files compatible with the illumina basespace app. The ds01* output files are the 10% downsampled and the ds05* are the 50% downsampled files.
#!/usr/bin/bash
ml purge
set -e -o pipefail
cd /path/to/project_downsample/
mkdir -p tmp/
ml picard/3.3.0-Java-17
for i in [^d]*_R1_*.fastq.gz; do
READ1="$i"
READ2=$(echo $i | perl -wpe 's/_R1_0/_R2_0/g')
TMPREAD1=./tmp/$(echo $READ1| perl -wpe 's/.*(S\d\d\d_L\d\d\d_R[12]_\d\d\d.fastq.gz)/$1/g')
TMPREAD2=./tmp/$(echo $READ2| perl -wpe 's/.*(S\d\d\d_L\d\d\d_R[12]_\d\d\d.fastq.gz)/$1/g')
TMPREADDOWNSAMPLE05_1=./tmp/ds05_$(echo $READ1| perl -wpe 's/.*(S\d\d\d_L\d\d\d_R[12]_\d\d\d.fastq.gz)/$1/g')
TMPREADDOWNSAMPLE05_2=./tmp/ds05_$(echo $READ2| perl -wpe 's/.*(S\d\d\d_L\d\d\d_R[12]_\d\d\d.fastq.gz)/$1/g')
TMPREADDOWNSAMPLE01_1=./tmp/ds01_$(echo $READ1| perl -wpe 's/.*(S\d\d\d_L\d\d\d_R[12]_\d\d\d.fastq.gz)/$1/g')
TMPREADDOWNSAMPLE01_2=./tmp/ds01_$(echo $READ2| perl -wpe 's/.*(S\d\d\d_L\d\d\d_R[12]_\d\d\d.fastq.gz)/$1/g')
READDOWNSAMPLE05_1=./ds05_$READ1
READDOWNSAMPLE05_2=./ds05_$READ2
READDOWNSAMPLE01_1=./ds01_$READ1
READDOWNSAMPLE01_2=./ds01_$READ2
if [ ! -e $TMPREAD1 ] ; then
ln --symbolic -T $(realpath $READ1) $TMPREAD1
ln --symbolic -T $(realpath $READ2) $TMPREAD2
fi
# O=./tmp/$(basename $TMPREAD1 .fastq.gz ).bam \
java -Xmx2g -jar $EBROOTPICARD/picard.jar FastqToSam \
-F1 $TMPREAD1 \
-F2 $TMPREAD1 \
--COMPRESSION_LEVEL 0 \
-O /dev/stdout \
-SM "$(echo $READ1| perl -wpe 's/.*(S\d\d\d)_L\d\d\d_R[12]_\d\d\d.fastq.gz/$1/g' )" \
-RG rg001 | \
java -Xmx2g -jar $EBROOTPICARD/picard.jar DownsampleSam \
-I /dev/stdin \
-O /dev/stdout \
--STRATEGY Chained \
-P 0.5 \
--ACCURACY 0.0001 \
--METRICS_FILE ./tmp/$(basename $TMPREAD1 .fastq.gz ).downsample_metrics \
--COMPRESSION_LEVEL 0 | \
java -Xmx2g -jar $EBROOTPICARD/picard.jar SamToFastq \
-I /dev/stdin \
-FASTQ $TMPREADDOWNSAMPLE05_1 \
-SECOND_END_FASTQ $TMPREADDOWNSAMPLE05_2
mv $TMPREADDOWNSAMPLE05_1 $READDOWNSAMPLE05_1
mv $TMPREADDOWNSAMPLE05_2 $READDOWNSAMPLE05_2
java -Xmx2g -jar $EBROOTPICARD/picard.jar FastqToSam \
-F1 $TMPREAD1 \
-F2 $TMPREAD1 \
--COMPRESSION_LEVEL 0 \
-O /dev/stdout \
-SM "$(echo $READ1| perl -wpe 's/.*(S\d\d\d)_L\d\d\d_R[12]_\d\d\d.fastq.gz/$1/g' )" \
-RG rg001 | \
java -Xmx2g -jar $EBROOTPICARD/picard.jar DownsampleSam \
-I /dev/stdin \
-O /dev/stdout \
--STRATEGY Chained \
-P 0.1 \
--ACCURACY 0.0001 \
--METRICS_FILE ./tmp/$(basename $TMPREAD1 .fastq.gz ).downsample_metrics \
--COMPRESSION_LEVEL 0 | \
java -Xmx2g -jar $EBROOTPICARD/picard.jar SamToFastq \
-I /dev/stdin \
-FASTQ $TMPREADDOWNSAMPLE01_1 \
-SECOND_END_FASTQ $TMPREADDOWNSAMPLE01_2
mv $TMPREADDOWNSAMPLE01_1 $READDOWNSAMPLE01_1
mv $TMPREADDOWNSAMPLE01_2 $READDOWNSAMPLE01_2
FIXHEADER1=" $({ gzip -dc $TMPREAD1 ||:; } |head -n 1| cut -f 2 -d \ )"
FIXHEADER2=" $({ gzip -dc $TMPREAD2 ||:; } |head -n 1| cut -f 2 -d \ )"
if [ ! -e $TMPREADDOWNSAMPLE05_1 ]; then
mv $READDOWNSAMPLE05_1 $TMPREADDOWNSAMPLE05_1
fi
if [ ! -e $TMPREADDOWNSAMPLE05_2 ]; then
mv $READDOWNSAMPLE05_2 $TMPREADDOWNSAMPLE05_2
fi
if [ ! -e $TMPREADDOWNSAMPLE01_1 ]; then
mv $READDOWNSAMPLE01_1 $TMPREADDOWNSAMPLE01_1
fi
if [ ! -e $TMPREADDOWNSAMPLE01_2 ]; then
mv $READDOWNSAMPLE01_2 $TMPREADDOWNSAMPLE01_2
fi
gzip -dc "$TMPREADDOWNSAMPLE05_1" | perl -wpe 's/\/[12]$/'"$FIXHEADER1"'/g if $.%4==1;' | gzip -c > "$TMPREADDOWNSAMPLE05_1.fixed.fastq.gz"
gzip -dc "$TMPREADDOWNSAMPLE05_2" | perl -wpe 's/\/[12]$/'"$FIXHEADER2"'/g if $.%4==1;' | gzip -c > "$TMPREADDOWNSAMPLE05_2.fixed.fastq.gz"
gzip -dc "$TMPREADDOWNSAMPLE01_1" | perl -wpe 's/\/[12]$/'"$FIXHEADER1"'/g if $.%4==1;' | gzip -c > "$TMPREADDOWNSAMPLE01_1.fixed.fastq.gz"
gzip -dc "$TMPREADDOWNSAMPLE01_2" | perl -wpe 's/\/[12]$/'"$FIXHEADER2"'/g if $.%4==1;' | gzip -c > "$TMPREADDOWNSAMPLE01_2.fixed.fastq.gz"
#fix nameing retrostpectively
READDOWNSAMPLE05_1=./ds05$READ1
READDOWNSAMPLE05_2=./ds05$READ2
READDOWNSAMPLE01_1=./ds01$READ1
READDOWNSAMPLE01_2=./ds01$READ2
mv $TMPREADDOWNSAMPLE01_1.fixed.fastq.gz $READDOWNSAMPLE01_1
mv $TMPREADDOWNSAMPLE01_2.fixed.fastq.gz $READDOWNSAMPLE01_2
mv $TMPREADDOWNSAMPLE05_1.fixed.fastq.gz $READDOWNSAMPLE05_1
mv $TMPREADDOWNSAMPLE05_2.fixed.fastq.gz $READDOWNSAMPLE05_2
done
#intro
#This shows how to downsample a bam without returning to fastq state of the file...
#pros: Fast
#cons: Might have alignment artifacts/info from the bigger subpool (like better indel alignments). Worse then completely stripping alignment info and aligning the reads again.
#how to use
#create a config and edit the downsample fractions as seen below and run the samples or just remove the leading part and create one yourselfs.
#parallelisation is done in a lazy way and shoud be fixed/checked.
#p values should be doubled in case of paired reads from multiqc report.
cat << EOF > config.sh
i=/path/to/bam/sample.bam;p=0.69
i=/path/to/bam/sample2.bam;p=0.42
EOF
threads=3
set -ex
for line in $(cat config.sh) ; do
(
export TMP_DIR="/scratch/$USER/projects/"
mkdir -p $TMP_DIR
. <(echo "$line")
mkdir -p $(dirname $i)/../downsample/
ml GATK/4.2.6.1-GCCcore-11.2.0-Java-11
gatk --java-options "-Xmx4G -Djava.io.tmpdir=$TMP_DIR" RevertSam I=$i \
REMOVE_DUPLICATE_INFORMATION=true \
RESTORE_ORIGINAL_QUALITIES=false \
REMOVE_ALIGNMENT_INFORMATION=false \
RESTORE_HARDCLIPS=false \
ATTRIBUTE_TO_CLEAR=null \
O=$(dirname $i)/../downsample/$(basename $i .bam).revertsam.bam
gatk --java-options "-Xmx4G -Djava.io.tmpdir=$TMP_DIR" DownsampleSam \
--INPUT $(dirname $i)/../downsample/$(basename $i .bam).revertsam.bam \
--OUTPUT $(dirname $i)/../downsample/$(basename $i .bam).downsample.bam \
--PROBABILITY $p \
--METRICS_FILE $(dirname $i)/../downsample/$(basename $i .bam ).downsample.metrics.tsv
gatk --java-options "-Xmx4G -Djava.io.tmpdir=$TMP_DIR" MarkDuplicates \
--INPUT $(dirname $i)/../downsample/$(basename $i .bam).downsample.bam \
--OUTPUT $(dirname $i)/../downsample/$(basename $i .bam).markduplicates.bam \
--METRICS_FILE $(dirname $i)/../downsample/$(basename $i .bam).markduplicates.metrics.tsv
gatk --java-options "-Xmx4G -Djava.io.tmpdir=$TMP_DIR" SortSam \
--INPUT $(dirname $i)/../downsample/$(basename $i .bam).markduplicates.bam \
--OUTPUT $(dirname $i)/../downsample/$(basename $i .bam).sorted.bam \
--SORT_ORDER coordinate \
--CREATE_INDEX true
)&
while [ $(jobs | wc -l ) -ge $threads ]; do echo "waiting"; jobs; sleep 30s ; done
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment