edawson · February 5, 2022 15:15
diff --git a/v3.7_bam2fq_fq2bam b/v3.7_bam2fq_fq2bam

 #############
 ## Download the 30X hg19-aligned bam from Google's public sequencing of HG002
 ## and the respective BAI file.
 #############

 wget https://storage.googleapis.com/brain-genomics-public/research/sequencing/grch37/bam/hiseqx/wgs_pcr_free/30x/HG002.hiseqx.pcr-free.30x.dedup.grch37.bam
 wget https://storage.googleapis.com/brain-genomics-public/research/sequencing/grch37/bam/hiseqx/wgs_pcr_free/30x/HG002.hiseqx.pcr-free.30x.dedup.grch37.bam.bai


 #############
 ## Prepare the references so we can realign reads
 #############

 ## Download the original hg19 / hsd37d5 reference
 ## and create and FAI index
 wget ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz
 gunzip hs37d5.fa.gz
 samtools faidx hs37d5.fa

 ## Download GRCh38
 wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
 gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz

 ## Make a .fai index using samtools faidx
 samtools faidx GCA_000001405.15_GRCh38_no_alt_analysis_set.fna

 ## Create the BWA indices
 bwa index GCA_000001405.15_GRCh38_no_alt_analysis_set.fna

 ## Download the Gold Standard indels from 1kg to use as your known-sites file.
 wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz

 ## Also grab the tabix index for the file
 wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi


 ############
 ## Run the bam2fq tool to extract reads from the BAM file
 ## Adjust the --num-threads argument to reflect the number of cores on your system.
 ## With 8 GPUs and 64 vCPUs this should take ~45 minutes.
 ############
 time pbrun bam2fq \
 --ref hs37d5.fa \
 --in-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam \
 --out-prefix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq \
 --num-threads 64


 ##############
 ## Run the fq2bam tool to align reads to GRCh38
 ##############
 time pbrun fq2bam \
 --in-fq HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq_1.fastq.gz HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq_1.fastq.gz \
 --ref Homo_sapiens_assembly38.fasta \
 --knownSites Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
 --out-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
 --out-recal-file HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.BQSR-REPORT.txt

	#############
	## Download the 30X hg19-aligned bam from Google's public sequencing of HG002
	## and the respective BAI file.
	#############

	wget https://storage.googleapis.com/brain-genomics-public/research/sequencing/grch37/bam/hiseqx/wgs_pcr_free/30x/HG002.hiseqx.pcr-free.30x.dedup.grch37.bam
	wget https://storage.googleapis.com/brain-genomics-public/research/sequencing/grch37/bam/hiseqx/wgs_pcr_free/30x/HG002.hiseqx.pcr-free.30x.dedup.grch37.bam.bai


	#############
	## Prepare the references so we can realign reads
	#############

	## Download the original hg19 / hsd37d5 reference
	## and create and FAI index
	wget ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz
	gunzip hs37d5.fa.gz
	samtools faidx hs37d5.fa

	## Download GRCh38
	wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
	gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz

	## Make a .fai index using samtools faidx
	samtools faidx GCA_000001405.15_GRCh38_no_alt_analysis_set.fna

	## Create the BWA indices
	bwa index GCA_000001405.15_GRCh38_no_alt_analysis_set.fna

	## Download the Gold Standard indels from 1kg to use as your known-sites file.
	wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz

	## Also grab the tabix index for the file
	wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi


	############
	## Run the bam2fq tool to extract reads from the BAM file
	## Adjust the --num-threads argument to reflect the number of cores on your system.
	## With 8 GPUs and 64 vCPUs this should take ~45 minutes.
	############
	time pbrun bam2fq \
	--ref hs37d5.fa \
	--in-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam \
	--out-prefix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq \
	--num-threads 64


	##############
	## Run the fq2bam tool to align reads to GRCh38
	##############
	time pbrun fq2bam \
	--in-fq HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq_1.fastq.gz HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq_1.fastq.gz \
	--ref Homo_sapiens_assembly38.fasta \
	--knownSites Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \
	--out-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
	--out-recal-file HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.BQSR-REPORT.txt