ckandoth · February 17, 2025 09:10
diff --git a/ngs_test_data.sh b/ngs_test_data.sh
 # GOAL: Create test data for CI/CD of a germline variant calling pipeline (FASTQ to gVCF)

 # Steps below were performed on Ubuntu 24.04, but should be reproducible on any Linux distro

 # Download and install micromamba under your home directory and logout:
 "${SHELL}" <(curl -L micro.mamba.pm/install.sh)

 # Log back in to add micromamba to your $PATH, and then use it to install the tools we need:
 micromamba create -y -n bio -c conda-forge -c bioconda htslib==1.21 samtools==1.21 bcftools==1.21 picard-slim==3.3.0
 micromamba activate bio

 # bs (BaseSpace CLI) cannot be installed using conda, so let's do it manually:
 curl -L https://launch.basespace.illumina.com/CLI/latest/amd64-linux/bs -o $HOME/micromamba/envs/bio/bin/bs
 chmod +x $HOME/micromamba/envs/bio/bin/bs

 # Visit https://basespace.illumina.com and login. Create a free account if needed. Then visit https://basespace.illumina.com/s/htWXpgEKrRu6
 # and click "Accept" to add the free demo WGS data to your BaseSpace account. Use "bs auth" to login before using the "bs" commands below.

 # Download WGS CRAMs and gVCFs from Dragen 4.3.13 on the GIAB Ashkenazi Jewish Trio (HG002, HG003, HG004):
 mkdir data
 bs list dataset --terse --project-name "TruSeq-PCRfree-HG001-HG007-10B-2-v13" --is-type "illumina.dragen.complete.v0.4.3" --filter-term "TSPF-HG002-10B-2-v13-Rep1|TSPF-HG003-10B-2-v13-Rep1|TSPF-HG004-10B-2-v13-Rep1" |\
    xargs -L1 bs contents dataset --terse --extension cram,crai,gvcf.gz,gvcf.gz.tbi --id |\
    xargs -L1 bs download file --no-metadata -o data --id

 # Download the hg38 reference genome and create a subset with chr21 plus a random unplaced contig:
 mkdir ref
 wget -P ref https://ilmn-dragen-giab-samples.s3.amazonaws.com/FASTA/hg38.fa
 samtools faidx -@8 ref/hg38.fa
 samtools faidx -@8 -o ref/hg38_chr21.fa ref/hg38.fa chr21 chrUn_GL000220v1
 samtools faidx -@8 ref/hg38_chr21.fa
 rm -f ref/hg38.fa ref/hg38.fa.fai

 # Make smaller gVCFs limited to chr21 and the unplaced contig selected above:
 mkdir son dad mom
 bcftools view -r chr21,chrUn_GL000220v1 -O z -l 6 -o son/son_wgs_chr21.gvcf.gz data/TSPF-HG002-10B-2-v13-Rep1.hard-filtered.gvcf.gz
 bcftools view -r chr21,chrUn_GL000220v1 -O z -l 6 -o dad/dad_wgs_chr21.gvcf.gz data/TSPF-HG003-10B-2-v13-Rep1.hard-filtered.gvcf.gz
 bcftools view -r chr21,chrUn_GL000220v1 -O z -l 6 -o mom/mom_wgs_chr21.gvcf.gz data/TSPF-HG004-10B-2-v13-Rep1.hard-filtered.gvcf.gz
 ls {son,dad,mom}/*_wgs_chr21.gvcf.gz | xargs -L1 tabix -@8 -p vcf

 # Similarly, make smaller CRAMs:
 samtools view -@8 -T ref/hg38_chr21.fa -C -o son/son_wgs_chr21.cram data/TSPF-HG002-10B-2-v13-Rep1.cram chr21 chrUn_GL000220v1
 samtools view -@8 -T ref/hg38_chr21.fa -C -o dad/dad_wgs_chr21.cram data/TSPF-HG003-10B-2-v13-Rep1.cram chr21 chrUn_GL000220v1
 samtools view -@8 -T ref/hg38_chr21.fa -C -o mom/mom_wgs_chr21.cram data/TSPF-HG004-10B-2-v13-Rep1.cram chr21 chrUn_GL000220v1
 ls {son,dad,mom}/*_wgs_chr21.cram | xargs -L1 samtools index -@8

 # Unpack paired-end reads into separate FASTQs per read-group:
 for sm in son dad mom; do
    samtools sort -@8 -n -O CRAM -l 1 -o $sm/$sm\_wgs_chr21.tmp.cram $sm/$sm\_wgs_chr21.cram
    picard SamToFastq --VALIDATION_STRINGENCY SILENT --MAX_RECORDS_IN_RAM 8000000 --REFERENCE_SEQUENCE ref/hg38_chr21.fa --INPUT $sm/$sm\_wgs_chr21.tmp.cram --RG_TAG ID --OUTPUT_PER_RG true --COMPRESS_OUTPUTS_PER_RG true --OUTPUT_DIR $sm
    rm -f $sm/$sm\_wgs_chr21.tmp.cram
 done

 # Rename FASTQs to follow Illumina's naming scheme:
 # [SAMPLE NAME]_S[SAMPLE NUMBER]_L[LANE NUMBER (0-padded to 3 digits)]_R[READ NUMBER (either 1 or 2)]_001.fastq.gz
 ls son/*.fastq.gz | perl -ne 'chomp; ($l,$r)=m/\w{8}.\w{8}.(\d)_(\d)/; print "mv $_ son/son_S2\_L00$l\_R$r\_001.fastq.gz\n"' | bash
 ls dad/*.fastq.gz | perl -ne 'chomp; ($l,$r)=m/\w{8}.\w{8}.(\d)_(\d)/; print "mv $_ dad/dad_S3\_L00$l\_R$r\_001.fastq.gz\n"' | bash
 ls mom/*.fastq.gz | perl -ne 'chomp; ($l,$r)=m/\w{8}.\w{8}.(\d)_(\d)/; print "mv $_ mom/mom_S4\_L00$l\_R$r\_001.fastq.gz\n"' | bash

 # Combine test dataset into a tar and upload for sharing:
 tar -cf test_trio_wgs.tar ref son dad mom readme.txt
 scp test_trio_wgs.tar data.cyri.ac:/var/www/cyri.ac/html/

 # These test data should now be easily downlodable from https://data.cyri.ac
	# GOAL: Create test data for CI/CD of a germline variant calling pipeline (FASTQ to gVCF)

	# Steps below were performed on Ubuntu 24.04, but should be reproducible on any Linux distro

	# Download and install micromamba under your home directory and logout:
	"${SHELL}" <(curl -L micro.mamba.pm/install.sh)

	# Log back in to add micromamba to your $PATH, and then use it to install the tools we need:
	micromamba create -y -n bio -c conda-forge -c bioconda htslib==1.21 samtools==1.21 bcftools==1.21 picard-slim==3.3.0
	micromamba activate bio

	# bs (BaseSpace CLI) cannot be installed using conda, so let's do it manually:
	curl -L https://launch.basespace.illumina.com/CLI/latest/amd64-linux/bs -o $HOME/micromamba/envs/bio/bin/bs
	chmod +x $HOME/micromamba/envs/bio/bin/bs

	# Visit https://basespace.illumina.com and login. Create a free account if needed. Then visit https://basespace.illumina.com/s/htWXpgEKrRu6
	# and click "Accept" to add the free demo WGS data to your BaseSpace account. Use "bs auth" to login before using the "bs" commands below.

	# Download WGS CRAMs and gVCFs from Dragen 4.3.13 on the GIAB Ashkenazi Jewish Trio (HG002, HG003, HG004):
	mkdir data
	bs list dataset --terse --project-name "TruSeq-PCRfree-HG001-HG007-10B-2-v13" --is-type "illumina.dragen.complete.v0.4.3" --filter-term "TSPF-HG002-10B-2-v13-Rep1\|TSPF-HG003-10B-2-v13-Rep1\|TSPF-HG004-10B-2-v13-Rep1" \|\
	xargs -L1 bs contents dataset --terse --extension cram,crai,gvcf.gz,gvcf.gz.tbi --id \|\
	xargs -L1 bs download file --no-metadata -o data --id

	# Download the hg38 reference genome and create a subset with chr21 plus a random unplaced contig:
	mkdir ref
	wget -P ref https://ilmn-dragen-giab-samples.s3.amazonaws.com/FASTA/hg38.fa
	samtools faidx -@8 ref/hg38.fa
	samtools faidx -@8 -o ref/hg38_chr21.fa ref/hg38.fa chr21 chrUn_GL000220v1
	samtools faidx -@8 ref/hg38_chr21.fa
	rm -f ref/hg38.fa ref/hg38.fa.fai

	# Make smaller gVCFs limited to chr21 and the unplaced contig selected above:
	mkdir son dad mom
	bcftools view -r chr21,chrUn_GL000220v1 -O z -l 6 -o son/son_wgs_chr21.gvcf.gz data/TSPF-HG002-10B-2-v13-Rep1.hard-filtered.gvcf.gz
	bcftools view -r chr21,chrUn_GL000220v1 -O z -l 6 -o dad/dad_wgs_chr21.gvcf.gz data/TSPF-HG003-10B-2-v13-Rep1.hard-filtered.gvcf.gz
	bcftools view -r chr21,chrUn_GL000220v1 -O z -l 6 -o mom/mom_wgs_chr21.gvcf.gz data/TSPF-HG004-10B-2-v13-Rep1.hard-filtered.gvcf.gz
	ls {son,dad,mom}/*_wgs_chr21.gvcf.gz \| xargs -L1 tabix -@8 -p vcf

	# Similarly, make smaller CRAMs:
	samtools view -@8 -T ref/hg38_chr21.fa -C -o son/son_wgs_chr21.cram data/TSPF-HG002-10B-2-v13-Rep1.cram chr21 chrUn_GL000220v1
	samtools view -@8 -T ref/hg38_chr21.fa -C -o dad/dad_wgs_chr21.cram data/TSPF-HG003-10B-2-v13-Rep1.cram chr21 chrUn_GL000220v1
	samtools view -@8 -T ref/hg38_chr21.fa -C -o mom/mom_wgs_chr21.cram data/TSPF-HG004-10B-2-v13-Rep1.cram chr21 chrUn_GL000220v1
	ls {son,dad,mom}/*_wgs_chr21.cram \| xargs -L1 samtools index -@8

	# Unpack paired-end reads into separate FASTQs per read-group:
	for sm in son dad mom; do
	samtools sort -@8 -n -O CRAM -l 1 -o $sm/$sm\_wgs_chr21.tmp.cram $sm/$sm\_wgs_chr21.cram
	picard SamToFastq --VALIDATION_STRINGENCY SILENT --MAX_RECORDS_IN_RAM 8000000 --REFERENCE_SEQUENCE ref/hg38_chr21.fa --INPUT $sm/$sm\_wgs_chr21.tmp.cram --RG_TAG ID --OUTPUT_PER_RG true --COMPRESS_OUTPUTS_PER_RG true --OUTPUT_DIR $sm
	rm -f $sm/$sm\_wgs_chr21.tmp.cram
	done

	# Rename FASTQs to follow Illumina's naming scheme:
	# [SAMPLE NAME]_S[SAMPLE NUMBER]_L[LANE NUMBER (0-padded to 3 digits)]_R[READ NUMBER (either 1 or 2)]_001.fastq.gz
	ls son/*.fastq.gz \| perl -ne 'chomp; ($l,$r)=m/\w{8}.\w{8}.(\d)_(\d)/; print "mv $_ son/son_S2\_L00$l\_R$r\_001.fastq.gz\n"' \| bash
	ls dad/*.fastq.gz \| perl -ne 'chomp; ($l,$r)=m/\w{8}.\w{8}.(\d)_(\d)/; print "mv $_ dad/dad_S3\_L00$l\_R$r\_001.fastq.gz\n"' \| bash
	ls mom/*.fastq.gz \| perl -ne 'chomp; ($l,$r)=m/\w{8}.\w{8}.(\d)_(\d)/; print "mv $_ mom/mom_S4\_L00$l\_R$r\_001.fastq.gz\n"' \| bash

	# Combine test dataset into a tar and upload for sharing:
	tar -cf test_trio_wgs.tar ref son dad mom readme.txt
	scp test_trio_wgs.tar data.cyri.ac:/var/www/cyri.ac/html/

	# These test data should now be easily downlodable from https://data.cyri.ac