Skip to content

Instantly share code, notes, and snippets.

@ckandoth
Last active March 1, 2023 19:43
Show Gist options
  • Save ckandoth/9416c6e76ca9ca3f0e5716762e9e0b7a to your computer and use it in GitHub Desktop.
Save ckandoth/9416c6e76ca9ca3f0e5716762e9e0b7a to your computer and use it in GitHub Desktop.
Create a small test dataset for CI/CD of cancer bioinformatics tools/pipelines
# GOAL: Create a small test dataset for CI/CD of cancer bioinformatics tools/pipelines
# Prerequisites: A clean Linux environment with a working internet connection
# Download and install mambaforge into a folder under your home directory:
curl -L https://github.com/conda-forge/miniforge/releases/download/4.14.0-0/Mambaforge-Linux-x86_64.sh -o /tmp/mambaforge.sh
sh /tmp/mambaforge.sh -bfp $HOME/mambaforge && rm -f /tmp/mambaforge.sh
# Add the following to your ~/.bashrc file to activate the base environment whenever you login:
if [ -f "$HOME/mambaforge/etc/profile.d/conda.sh" ]; then
. $HOME/mambaforge/etc/profile.d/conda.sh
conda activate
fi
# Logout and login to activate the base enivironment, and then install the required packages:
mamba install -y -c bioconda htslib==1.13 samtools==1.13 picard-slim==2.25.7 nextflow==21.04.0 singularity==3.7.1
# bs (BaseSpace CLI) cannot be installed using mamba, so let's do it manually:
curl -L https://launch.basespace.illumina.com/CLI/latest/amd64-linux/bs -o $HOME/mambaforge/bin/bs
chmod +x $HOME/mambaforge/bin/bs
# Visit https://basespace.illumina.com and login. Create a free account if needed. Then visit https://basespace.illumina.com/s/RJSCrml92sQt
# and click "Accept" to add the free demo WGS data to your BaseSpace account. Run "bs auth" to login to your command-line interface (CLI).
# Use the BaseSpace CLI to download the GRCh38 WGS BAMs for TN-pairs HCC1187C and HCC1187BL:
mkdir data
bs list dataset --terse --is-type "illumina.isaac.v8.1" --project-name "HiSeq 2000: Tumor Normal WGS (HCC1187 & HCC2218)" |\
xargs -L1 bs contents dataset --terse --extension bam,bai --id |\
xargs -L1 bs download file --no-metadata -o data --id
# Make smaller BAMs limited to 2Mbp around RUNX1 and sorted by read name:
samtools view -@8 -h data/HCC1187C_Proband_S1.bam chr21:34000000-36000000 | samtools sort -@8 -n -o data/chr21_tum_wgs.bam
samtools view -@8 -h data/HCC1187BL_Proband_S1.bam chr21:34000000-36000000 | samtools sort -@8 -n -o data/chr21_nrm_wgs.bam
# Unpack paired-end reads into separate FASTQs per read-group:
mkdir tum nrm
picard SamToFastq --VALIDATION_STRINGENCY SILENT --MAX_RECORDS_IN_RAM 8000000 --INPUT data/chr21_tum_wgs.bam --RG_TAG PU --OUTPUT_PER_RG true --COMPRESS_OUTPUTS_PER_RG true --OUTPUT_DIR tum
picard SamToFastq --VALIDATION_STRINGENCY SILENT --MAX_RECORDS_IN_RAM 8000000 --INPUT data/chr21_nrm_wgs.bam --RG_TAG PU --OUTPUT_PER_RG true --COMPRESS_OUTPUTS_PER_RG true --OUTPUT_DIR nrm
# Rename FASTQs to follow Illumina/Casava naming scheme:
# [SAMPLE NAME]_[BARCODE SEQUENCE]_L[LANE NUMBER (0-padded to 3 digits)]_R[READ NUMBER (either 1 or 2)]_[SET NUMBER (0-padded to 3 digits)].fastq.gz
ls tum/*.fastq.gz | perl -ne 'chomp; ($f,$l,$r)=m/(\w+).(\d)\w+(\d)/; print "mv $_ tum/tum_$f\_L00$l\_R$r\_001.fastq.gz\n"' | bash
ls nrm/*.fastq.gz | perl -ne 'chomp; ($f,$l,$r)=m/(\w+).(\d)\w+(\d)/; print "mv $_ nrm/nrm_$f\_L00$l\_R$r\_001.fastq.gz\n"' | bash
# The tumor has 8 lanes of reads and the normal has 4 lanes. Downsample them to 2 lanes for tumor and 1 lane for normal:
rm nrm/nrm_C0JD1ACXX_L00{2,3,4}* tum/tum_C0G5BACXX_L00{3,4,5,6,7,8}*
# Create a tab-delimited manifest of these FASTQs compatible with https://nf-co.re/sarek:
echo -e "HCC1187\tXX\t0\tnrm\tC0JD1ACXX.1\tnrm/nrm_C0JD1ACXX_L001_R1_001.fastq.gz\tnrm/nrm_C0JD1ACXX_L001_R2_001.fastq.gz" > tum_nrm_wgs.tsv
echo -e "HCC1187\tXX\t1\ttum\tC0G5BACXX.1\ttum/tum_C0G5BACXX_L001_R1_001.fastq.gz\ttum/tum_C0G5BACXX_L001_R2_001.fastq.gz" >> tum_nrm_wgs.tsv
echo -e "HCC1187\tXX\t1\ttum\tC0G5BACXX.2\ttum/tum_C0G5BACXX_L002_R1_001.fastq.gz\ttum/tum_C0G5BACXX_L002_R2_001.fastq.gz" >> tum_nrm_wgs.tsv
# Process this data using the nextflow Sarek pipeline which also downloads a GRCh38 resource bundle for us:
nextflow run -profile singularity -w data/nf -r 2.7.1 nf-core/sarek --genome GRCh38 --no_intervals --skip_qc all --tools FreeBayes --input tum_nrm_wgs.tsv --outdir data/sarek
# Find the GRCh38 reference, subset to chr21 and an unplaced contig with many mapped reads, and index:
mkdir ref
samtools faidx -o ref/GRCh38_chr21.fa $(find data/nf/stage -type f -name Homo_sapiens_assembly38.fasta) chr21 chrUn_GL000220v1
samtools faidx ref/GRCh38_chr21.fa
picard CreateSequenceDictionary --REFERENCE ref/GRCh38_chr21.fa
# Find the SNP/Indel reference VCFs, subset to chr21, and tabix index them:
bgzip -dc $(find data/nf/stage -type f -name dbsnp_146.hg38.vcf.gz) | grep -E "^#|^chr21" | bgzip -c > ref/dbsnp_146_GRCh38_chr21.vcf.gz
tabix -p vcf ref/dbsnp_146_GRCh38_chr21.vcf.gz
bgzip -dc $(find data/nf/stage -type f -name Homo_sapiens_assembly38.known_indels.vcf.gz) | grep -E "^#|^chr21" | bgzip -c > ref/known_indels_GRCh38_chr21.vcf.gz
tabix -p vcf ref/known_indels_GRCh38_chr21.vcf.gz
bgzip -dc $(find data/nf/stage -type f -name Mills_and_1000G_gold_standard.indels.hg38.vcf.gz) | grep -E "^#|^chr21" | bgzip -c > ref/gold_standard_indels_GRCh38_chr21.vcf.gz
tabix -p vcf ref/gold_standard_indels_GRCh38_chr21.vcf.gz
# Move the Sarek BAMs/VCFs and their indexes into new subfolders:
mkdir bam vcf
find data/sarek -type f -name *.recal.bam -exec mv {} bam/ \;
find data/sarek -type f -name *.recal.bam.bai -exec mv {} bam/ \;
find data/sarek -type f -name *.vcf.gz -exec mv {} vcf/ \;
find data/sarek -type f -name *.vcf.gz.tbi -exec mv {} vcf/ \;
# Combine test dataset into a tar and upload for sharing:
tar -cf test_tum_nrm_wgs.tar bam nrm ref tum vcf tum_nrm_wgs.tsv readme.txt
scp test_tum_nrm_wgs.tar data.cyri.ac:/var/www/cyri.ac/html/
# Separately upload one set of tumor FASTQs for easy access:
scp tum/tum_C0G5BACXX_L001_R1_001.fastq.gz data.cyri.ac:/var/www/cyri.ac/html/test_L001_R1_001.fastq.gz
scp tum/tum_C0G5BACXX_L001_R2_001.fastq.gz data.cyri.ac:/var/www/cyri.ac/html/test_L001_R2_001.fastq.gz
# These test data should now be easily downlodable from https://data.cyri.ac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment