Last active
February 17, 2025 09:10
-
-
Save ckandoth/9416c6e76ca9ca3f0e5716762e9e0b7a to your computer and use it in GitHub Desktop.
Create test data for CI/CD of a FASTQ to gVCF bioinformatics pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# GOAL: Create test data for CI/CD of a germline variant calling pipeline (FASTQ to gVCF)
# Steps below were performed on Ubuntu 24.04, but should be reproducible on any Linux distro

# Download and install micromamba under your home directory and logout:
"${SHELL}" <(curl -L micro.mamba.pm/install.sh)

# Log back in to add micromamba to your $PATH, and then use it to install the tools we need:
micromamba create -y -n bio -c conda-forge -c bioconda htslib==1.21 samtools==1.21 bcftools==1.21 picard-slim==3.3.0
micromamba activate bio

# bs (BaseSpace CLI) cannot be installed using conda, so let's do it manually.
# Dropping it into the env's bin/ keeps it on PATH whenever the env is active:
curl -L https://launch.basespace.illumina.com/CLI/latest/amd64-linux/bs -o "$HOME/micromamba/envs/bio/bin/bs"
chmod +x "$HOME/micromamba/envs/bio/bin/bs"
# Visit https://basespace.illumina.com and login. Create a free account if needed. Then visit https://basespace.illumina.com/s/htWXpgEKrRu6
# and click "Accept" to add the free demo WGS data to your BaseSpace account. Use "bs auth" to login before using the "bs" commands below.

# Download WGS CRAMs and gVCFs from Dragen 4.3.13 on the GIAB Ashkenazi Jewish Trio (HG002, HG003, HG004):
# 1) list the matching datasets, 2) expand each dataset into its file IDs, 3) download each file into data/
mkdir -p data
bs list dataset --terse --project-name "TruSeq-PCRfree-HG001-HG007-10B-2-v13" --is-type "illumina.dragen.complete.v0.4.3" --filter-term "TSPF-HG002-10B-2-v13-Rep1|TSPF-HG003-10B-2-v13-Rep1|TSPF-HG004-10B-2-v13-Rep1" \
  | xargs -L1 bs contents dataset --terse --extension cram,crai,gvcf.gz,gvcf.gz.tbi --id \
  | xargs -L1 bs download file --no-metadata -o data --id
# Download the hg38 reference genome and create a subset with chr21 plus a random unplaced contig:
mkdir -p ref
wget -P ref https://ilmn-dragen-giab-samples.s3.amazonaws.com/FASTA/hg38.fa
# Index the full genome so named regions can be extracted from it:
samtools faidx -@8 ref/hg38.fa
# Extract chr21 plus one unplaced contig into a small test reference, then index that too:
samtools faidx -@8 -o ref/hg38_chr21.fa ref/hg38.fa chr21 chrUn_GL000220v1
samtools faidx -@8 ref/hg38_chr21.fa
# The full genome is no longer needed; remove it to save disk space:
rm -f ref/hg38.fa ref/hg38.fa.fai
# Make smaller gVCFs limited to chr21 and the unplaced contig selected above:
# (-O z = bgzipped VCF output, -l 6 = compression level 6)
mkdir -p son dad mom
bcftools view -r chr21,chrUn_GL000220v1 -O z -l 6 -o son/son_wgs_chr21.gvcf.gz data/TSPF-HG002-10B-2-v13-Rep1.hard-filtered.gvcf.gz
bcftools view -r chr21,chrUn_GL000220v1 -O z -l 6 -o dad/dad_wgs_chr21.gvcf.gz data/TSPF-HG003-10B-2-v13-Rep1.hard-filtered.gvcf.gz
bcftools view -r chr21,chrUn_GL000220v1 -O z -l 6 -o mom/mom_wgs_chr21.gvcf.gz data/TSPF-HG004-10B-2-v13-Rep1.hard-filtered.gvcf.gz
# Index each subset gVCF; glob directly instead of parsing "ls" output:
for gvcf in {son,dad,mom}/*_wgs_chr21.gvcf.gz; do
  tabix -@8 -p vcf "$gvcf"
done
# Similarly, make smaller CRAMs restricted to chr21 and the unplaced contig (-C = CRAM output,
# -T = reference required for CRAM encoding/decoding):
samtools view -@8 -T ref/hg38_chr21.fa -C -o son/son_wgs_chr21.cram data/TSPF-HG002-10B-2-v13-Rep1.cram chr21 chrUn_GL000220v1
samtools view -@8 -T ref/hg38_chr21.fa -C -o dad/dad_wgs_chr21.cram data/TSPF-HG003-10B-2-v13-Rep1.cram chr21 chrUn_GL000220v1
samtools view -@8 -T ref/hg38_chr21.fa -C -o mom/mom_wgs_chr21.cram data/TSPF-HG004-10B-2-v13-Rep1.cram chr21 chrUn_GL000220v1
# Index each subset CRAM; glob directly instead of parsing "ls" output:
for cram in {son,dad,mom}/*_wgs_chr21.cram; do
  samtools index -@8 "$cram"
done
# Unpack paired-end reads into separate FASTQs per read-group:
for sm in son dad mom; do
  # Name-sort so mates are adjacent for SamToFastq; low compression (-l 1) since this is a throwaway intermediate:
  samtools sort -@8 -n -O CRAM -l 1 -o "$sm/${sm}_wgs_chr21.tmp.cram" "$sm/${sm}_wgs_chr21.cram"
  # Emit one gzipped FASTQ pair per read-group (split on the RG ID tag) into the sample's directory:
  picard SamToFastq --VALIDATION_STRINGENCY SILENT --MAX_RECORDS_IN_RAM 8000000 --REFERENCE_SEQUENCE ref/hg38_chr21.fa --INPUT "$sm/${sm}_wgs_chr21.tmp.cram" --RG_TAG ID --OUTPUT_PER_RG true --COMPRESS_OUTPUTS_PER_RG true --OUTPUT_DIR "$sm"
  rm -f "$sm/${sm}_wgs_chr21.tmp.cram"
done
# Rename FASTQs to follow Illumina's naming scheme:
# [SAMPLE NAME]_S[SAMPLE NUMBER]_L[LANE NUMBER (0-padded to 3 digits)]_R[READ NUMBER (either 1 or 2)]_001.fastq.gz
# The regex pulls lane/read numbers out of Picard's per-read-group filenames — it assumes RG IDs of the
# form <8 chars>.<8 chars>.<lane>_<read>; verify against the actual read-group IDs before reuse.
ls son/*.fastq.gz | perl -ne 'chomp; ($l,$r)=m/\w{8}.\w{8}.(\d)_(\d)/; print "mv $_ son/son_S2_L00${l}_R${r}_001.fastq.gz\n"' | bash
ls dad/*.fastq.gz | perl -ne 'chomp; ($l,$r)=m/\w{8}.\w{8}.(\d)_(\d)/; print "mv $_ dad/dad_S3_L00${l}_R${r}_001.fastq.gz\n"' | bash
ls mom/*.fastq.gz | perl -ne 'chomp; ($l,$r)=m/\w{8}.\w{8}.(\d)_(\d)/; print "mv $_ mom/mom_S4_L00${l}_R${r}_001.fastq.gz\n"' | bash
# Combine test dataset into a tar and upload for sharing:
# NOTE(review): readme.txt is not created by any step above — make sure it exists before running tar.
tar -cf test_trio_wgs.tar ref son dad mom readme.txt
scp test_trio_wgs.tar data.cyri.ac:/var/www/cyri.ac/html/
# These test data should now be easily downloadable from https://data.cyri.ac
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment