Last active
July 25, 2023 13:00
-
-
Save mmterpstra/4f096a088223d598ad2caec0a7c00703 to your computer and use it in GitHub Desktop.
Downsample a bam file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#intro | |
#This shows how to downsample a bam without returning to fastq state of the file... | |
#pros: Fast | |
#cons: Might have alignment artifacts/info from the bigger subpool (like better indel alignments). Worse then completely stripping alignment info and aligning the reads again. | |
#how to use | |
#create a config and edit the downsample fractions as seen below and run the samples or just remove the leading part and create one yourselfs. | |
#parallelisation is done in a lazy way and shoud be fixed/checked. | |
#p values should be doubled in case of paired reads from multiqc report. | |
cat << EOF > config.sh | |
i=/path/to/bam/sample.bam;p=0.69 | |
i=/path/to/bam/sample2.bam;p=0.42 | |
EOF | |
threads=3 | |
set -ex | |
for line in $(cat config.sh) ; do | |
( | |
export TMP_DIR="/scratch/$USER/projects/" | |
mkdir -p $TMP_DIR | |
. <(echo "$line") | |
mkdir -p $(dirname $i)/../downsample/ | |
ml GATK/4.2.6.1-GCCcore-11.2.0-Java-11 | |
gatk --java-options "-Xmx4G -Djava.io.tmpdir=$TMP_DIR" RevertSam I=$i \ | |
REMOVE_DUPLICATE_INFORMATION=true \ | |
RESTORE_ORIGINAL_QUALITIES=false \ | |
REMOVE_ALIGNMENT_INFORMATION=false \ | |
RESTORE_HARDCLIPS=false \ | |
ATTRIBUTE_TO_CLEAR=null \ | |
O=$(dirname $i)/../downsample/$(basename $i .bam).revertsam.bam | |
gatk --java-options "-Xmx4G -Djava.io.tmpdir=$TMP_DIR" DownsampleSam \ | |
--INPUT $(dirname $i)/../downsample/$(basename $i .bam).revertsam.bam \ | |
--OUTPUT $(dirname $i)/../downsample/$(basename $i .bam).downsample.bam \ | |
--PROBABILITY $p \ | |
--METRICS_FILE $(dirname $i)/../downsample/$(basename $i .bam ).downsample.metrics.tsv | |
gatk --java-options "-Xmx4G -Djava.io.tmpdir=$TMP_DIR" MarkDuplicates \ | |
--INPUT $(dirname $i)/../downsample/$(basename $i .bam).downsample.bam \ | |
--OUTPUT $(dirname $i)/../downsample/$(basename $i .bam).markduplicates.bam \ | |
--METRICS_FILE $(dirname $i)/../downsample/$(basename $i .bam).markduplicates.metrics.tsv | |
gatk --java-options "-Xmx4G -Djava.io.tmpdir=$TMP_DIR" SortSam \ | |
--INPUT $(dirname $i)/../downsample/$(basename $i .bam).markduplicates.bam \ | |
--OUTPUT $(dirname $i)/../downsample/$(basename $i .bam).sorted.bam \ | |
--SORT_ORDER coordinate \ | |
--CREATE_INDEX true | |
)& | |
while [ $(jobs | wc -l ) -ge $threads ]; do echo "waiting"; jobs; sleep 30s ; done | |
done |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment