# GOAL: Create a small test dataset for CI/CD of cancer bioinformatics tools/pipelines
# Prerequisites: A clean Linux environment with a working internet connection
# Download and install mambaforge into a folder under your home directory:
curl -L https://github.com/conda-forge/miniforge/releases/download/4.14.0-0/Mambaforge-Linux-x86_64.sh -o /tmp/mambaforge.sh
sh /tmp/mambaforge.sh -bfp $HOME/mambaforge && rm -f /tmp/mambaforge.sh
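# Optionally verify the install before continuing (assumes the install path used above):
$HOME/mambaforge/bin/mamba --version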
# Add the following to your ~/.bashrc file to activate the base environment whenever you log in:
if [ -f "$HOME/mambaforge/etc/profile.d/conda.sh" ]; then | |
. $HOME/mambaforge/etc/profile.d/conda.sh | |
conda activate | |
fi | |
# Log out and log back in to activate the base environment, then install the required packages:
mamba install -y -c bioconda htslib==1.13 samtools==1.13 picard-slim==2.25.7 nextflow==21.04.0 singularity==3.7.1
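# Optional sanity check that the pinned tools resolved correctly:
samtools --version | head -n1
nextflow -version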
# bs (BaseSpace CLI) cannot be installed using mamba, so let's do it manually:
curl -L https://launch.basespace.illumina.com/CLI/latest/amd64-linux/bs -o $HOME/mambaforge/bin/bs
chmod +x $HOME/mambaforge/bin/bs
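# Quick check that the downloaded binary runs (assuming your CLI build supports --version):
bs --version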
# Visit https://basespace.illumina.com and log in. Create a free account if needed. Then visit https://basespace.illumina.com/s/RJSCrml92sQt
# and click "Accept" to add the free demo WGS data to your BaseSpace account. Run "bs auth" to log in via the command-line interface (CLI).
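# Authenticate, then optionally confirm the account (bs whoami should work in recent CLI builds):
bs auth
bs whoami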
# Use the BaseSpace CLI to download the GRCh38 WGS BAMs for TN-pairs HCC1187C and HCC1187BL:
mkdir data
bs list dataset --terse --is-type "illumina.isaac.v8.1" --project-name "HiSeq 2000: Tumor Normal WGS (HCC1187 & HCC2218)" |\
xargs -L1 bs contents dataset --terse --extension bam,bai --id |\
xargs -L1 bs download file --no-metadata -o data --id
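# Confirm the BAMs and their .bai indexes downloaded completely before subsetting:
ls -lh data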
# Make smaller BAMs limited to 2Mbp around RUNX1 and sorted by read name:
samtools view -@8 -h data/HCC1187C_Proband_S1.bam chr21:34000000-36000000 | samtools sort -@8 -n -o data/chr21_tum_wgs.bam
samtools view -@8 -h data/HCC1187BL_Proband_S1.bam chr21:34000000-36000000 | samtools sort -@8 -n -o data/chr21_nrm_wgs.bam
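# Optional: count reads in each subset BAM to confirm the region extraction worked:
samtools view -c data/chr21_tum_wgs.bam
samtools view -c data/chr21_nrm_wgs.bam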
# Unpack paired-end reads into separate FASTQs per read-group:
mkdir tum nrm
picard SamToFastq --VALIDATION_STRINGENCY SILENT --MAX_RECORDS_IN_RAM 8000000 --INPUT data/chr21_tum_wgs.bam --RG_TAG PU --OUTPUT_PER_RG true --COMPRESS_OUTPUTS_PER_RG true --OUTPUT_DIR tum
picard SamToFastq --VALIDATION_STRINGENCY SILENT --MAX_RECORDS_IN_RAM 8000000 --INPUT data/chr21_nrm_wgs.bam --RG_TAG PU --OUTPUT_PER_RG true --COMPRESS_OUTPUTS_PER_RG true --OUTPUT_DIR nrm
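# Picard names each FASTQ pair after its read group's PU tag, e.g. C0G5BACXX.1_1.fastq.gz; list them to confirm:
ls tum nrm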
# Rename FASTQs to follow the Illumina/Casava naming scheme:
# [SAMPLE NAME]_[BARCODE SEQUENCE]_L[LANE NUMBER (0-padded to 3 digits)]_R[READ NUMBER (either 1 or 2)]_[SET NUMBER (0-padded to 3 digits)].fastq.gz
# The regex below captures flowcell ($f), lane ($l), and read number ($r) from names like C0G5BACXX.1_1.fastq.gz:
ls tum/*.fastq.gz | perl -ne 'chomp; ($f,$l,$r)=m/(\w+).(\d)\w+(\d)/; print "mv $_ tum/tum_$f\_L00$l\_R$r\_001.fastq.gz\n"' | bash
ls nrm/*.fastq.gz | perl -ne 'chomp; ($f,$l,$r)=m/(\w+).(\d)\w+(\d)/; print "mv $_ nrm/nrm_$f\_L00$l\_R$r\_001.fastq.gz\n"' | bash
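# Confirm the lane counts that the next step assumes (L001-L008 for tumor, L001-L004 for normal):
ls tum | cut -d_ -f3 | sort -u
ls nrm | cut -d_ -f3 | sort -u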
# The tumor has 8 lanes of reads and the normal has 4 lanes. Downsample them to 2 lanes for tumor and 1 lane for normal:
rm nrm/nrm_C0JD1ACXX_L00{2,3,4}* tum/tum_C0G5BACXX_L00{3,4,5,6,7,8}*
# Create a tab-delimited manifest of these FASTQs compatible with https://nf-co.re/sarek:
echo -e "HCC1187\tXX\t0\tnrm\tC0JD1ACXX.1\tnrm/nrm_C0JD1ACXX_L001_R1_001.fastq.gz\tnrm/nrm_C0JD1ACXX_L001_R2_001.fastq.gz" > tum_nrm_wgs.tsv
echo -e "HCC1187\tXX\t1\ttum\tC0G5BACXX.1\ttum/tum_C0G5BACXX_L001_R1_001.fastq.gz\ttum/tum_C0G5BACXX_L001_R2_001.fastq.gz" >> tum_nrm_wgs.tsv
echo -e "HCC1187\tXX\t1\ttum\tC0G5BACXX.2\ttum/tum_C0G5BACXX_L002_R1_001.fastq.gz\ttum/tum_C0G5BACXX_L002_R2_001.fastq.gz" >> tum_nrm_wgs.tsv
# Process this data using the nf-core/sarek Nextflow pipeline, which also downloads a GRCh38 resource bundle for us:
nextflow run -profile singularity -w data/nf -r 2.7.1 nf-core/sarek --genome GRCh38 --no_intervals --skip_qc all --tools FreeBayes --input tum_nrm_wgs.tsv --outdir data/sarek
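# If the run is interrupted, re-run the same command with -resume appended to reuse cached task results:
nextflow run -profile singularity -w data/nf -r 2.7.1 nf-core/sarek --genome GRCh38 --no_intervals --skip_qc all --tools FreeBayes --input tum_nrm_wgs.tsv --outdir data/sarek -resume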
# Find the GRCh38 reference, subset to chr21 and an unplaced contig with many mapped reads, and index:
mkdir ref
samtools faidx -o ref/GRCh38_chr21.fa $(find data/nf/stage -type f -name Homo_sapiens_assembly38.fasta) chr21 chrUn_GL000220v1
samtools faidx ref/GRCh38_chr21.fa
picard CreateSequenceDictionary --REFERENCE ref/GRCh38_chr21.fa
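# The .fai index doubles as a quick manifest of the contigs kept in the subset reference:
cat ref/GRCh38_chr21.fa.fai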
# Find the SNP/Indel reference VCFs, subset to chr21, and tabix index them:
bgzip -dc $(find data/nf/stage -type f -name dbsnp_146.hg38.vcf.gz) | grep -E "^#|^chr21" | bgzip -c > ref/dbsnp_146_GRCh38_chr21.vcf.gz
tabix -p vcf ref/dbsnp_146_GRCh38_chr21.vcf.gz
bgzip -dc $(find data/nf/stage -type f -name Homo_sapiens_assembly38.known_indels.vcf.gz) | grep -E "^#|^chr21" | bgzip -c > ref/known_indels_GRCh38_chr21.vcf.gz
tabix -p vcf ref/known_indels_GRCh38_chr21.vcf.gz
bgzip -dc $(find data/nf/stage -type f -name Mills_and_1000G_gold_standard.indels.hg38.vcf.gz) | grep -E "^#|^chr21" | bgzip -c > ref/gold_standard_indels_GRCh38_chr21.vcf.gz
tabix -p vcf ref/gold_standard_indels_GRCh38_chr21.vcf.gz
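# Optional: count the variant records retained in each chr21 subset VCF:
for v in ref/*_chr21.vcf.gz; do echo -n "$v: "; bgzip -dc $v | grep -vc "^#"; done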
# Move the Sarek BAMs/VCFs and their indexes into new subfolders (quote the globs so find, not the shell, expands them):
mkdir bam vcf
find data/sarek -type f -name "*.recal.bam" -exec mv {} bam/ \;
find data/sarek -type f -name "*.recal.bam.bai" -exec mv {} bam/ \;
find data/sarek -type f -name "*.vcf.gz" -exec mv {} vcf/ \;
find data/sarek -type f -name "*.vcf.gz.tbi" -exec mv {} vcf/ \;
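# The tar below includes a readme.txt, so write a brief one first (this wording is just a suggested example):
cat > readme.txt <<'EOF'
Small test dataset for CI/CD of cancer bioinformatics tools/pipelines.
HCC1187C (tumor) and HCC1187BL (normal) WGS reads subset to chr21:34000000-36000000 (around RUNX1),
with matching GRCh38 chr21 reference/VCF subsets, Sarek-recalibrated BAMs, and FreeBayes VCFs.
EOF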
# Combine the test dataset into a tar and upload it for sharing:
tar -cf test_tum_nrm_wgs.tar bam nrm ref tum vcf tum_nrm_wgs.tsv readme.txt
scp test_tum_nrm_wgs.tar data.cyri.ac:/var/www/cyri.ac/html/
# Separately upload one set of tumor FASTQs for easy access:
scp tum/tum_C0G5BACXX_L001_R1_001.fastq.gz data.cyri.ac:/var/www/cyri.ac/html/test_L001_R1_001.fastq.gz
scp tum/tum_C0G5BACXX_L001_R2_001.fastq.gz data.cyri.ac:/var/www/cyri.ac/html/test_L001_R2_001.fastq.gz
# These test data should now be easily downloadable from https://data.cyri.ac