sirselim · January 24, 2025 08:14 · misrak · Dec 13, 2019 · dariogf · Feb 27, 2020
diff --git a/dbNSFP_pipeline_build.sh b/dbNSFP_pipeline_build.sh
 #!/bin/bash
 ## small bash script to download and reformat dbNSFP for pipeline 
 ## Miles Benton
 ## created: 2018-01-13
 ## modified: 2019-08-21

 # Set to dbNSFP version to download and build
 version="4.0a"
 #TODO: add an option to 'scrape' this from the url to always return latest version
 # define thread number for parallel processing where able
 THREADS=6

 # Download dbNSFP database
 wget -O dbNSFP${version}.zip "ftp://dbnsfp:[email protected]/dbNSFP${version}.zip"

 # Uncompress
 unzip dbNSFP${version}.zip

 # grab header
 zcat dbNSFP${version}_variant.chr1.gz | head -n 1 | bgzip > header.gz

 ### this section will produce data for hg38 capable pipelines
 ## hg38 version

 # Create a single file version
 # NOTE: bgzip parameter -@ X represents number of threads
 cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz | zgrep -v '#chr' | bgzip -@ ${THREADS} > dbNSFPv${version}_custom.gz
 #TODO: there must be a 'nicer' way of ordering the input into the cat (to include the X,Y and M chrs without explicitly stating them)

 # add header back into file
 cat header.gz dbNSFPv${version}_custom.gz > dbNSFPv${version}_custombuild.gz

 # Create tabix index
 tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz

 # test annotation
 # java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
 #TODO: provide actual unit test files for testing purposes, i.e. a section of public data with known annotation rates.
 #TODO: the above is currently a placeholder but it had it's intended purpose in terms of identifying incorrect genome build. 

 # clean up
 #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
 #/END hg38
 ###

 ### this section will produce data for hg19 capable pipelines
 ## hg19 version
 # for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
 # this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
 # NOTE: bgzip parameter -@ X represents number of threads

 # create a file with ordered chrosome names
 echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt

 # set awk params as variable for parallel
 awkBody1='$8 != "."'
 awkBody2='BEGIN{FS=OFS="\t"} {t = $2; $2 = $9; $9 = t; x = $1; $1 = $8; $8 = $1; print;}'

 # parallel implementation of the hg19 format and sort
 parallel -j 4 \
 "zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \
 awk '${awkBody1}' | \
 awk '${awkBody2}' | \
 LC_ALL=C sort --parallel=2 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip | \
 bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
 #TODO: need to implement a defined approach to GNU parallel, can't use $THREADS 

 # cat all files back together (should retain sort order)
 cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz

 # Create tabix index
 tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz

 # test hg19 annotation
 # java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf

 # clean up
 #TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
 #/END hg38
 ###
	#!/bin/bash
	## small bash script to download and reformat dbNSFP for pipeline
	## Miles Benton
	## created: 2018-01-13
	## modified: 2019-08-21

	# Set to dbNSFP version to download and build
	version="4.0a"
	#TODO: add an option to 'scrape' this from the url to always return latest version
	# define thread number for parallel processing where able
	THREADS=6

	# Download dbNSFP database
	wget -O dbNSFP${version}.zip "ftp://dbnsfp:[email protected]/dbNSFP${version}.zip"

	# Uncompress
	unzip dbNSFP${version}.zip

	# grab header
	zcat dbNSFP${version}_variant.chr1.gz \| head -n 1 \| bgzip > header.gz

	### this section will produce data for hg38 capable pipelines
	## hg38 version

	# Create a single file version
	# NOTE: bgzip parameter -@ X represents number of threads
	cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz \| zgrep -v '#chr' \| bgzip -@ ${THREADS} > dbNSFPv${version}_custom.gz
	#TODO: there must be a 'nicer' way of ordering the input into the cat (to include the X,Y and M chrs without explicitly stating them)

	# add header back into file
	cat header.gz dbNSFPv${version}_custom.gz > dbNSFPv${version}_custombuild.gz

	# Create tabix index
	tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz

	# test annotation
	# java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf
	#TODO: provide actual unit test files for testing purposes, i.e. a section of public data with known annotation rates.
	#TODO: the above is currently a placeholder but it had it's intended purpose in terms of identifying incorrect genome build.

	# clean up
	#TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
	#/END hg38
	###

	### this section will produce data for hg19 capable pipelines
	## hg19 version
	# for hg19 (coordinate data is located in columns 8 [chr] and 9 [position])
	# this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output
	# NOTE: bgzip parameter -@ X represents number of threads

	# create a file with ordered chrosome names
	echo {1..22} X Y M \| tr ' ' '\n' > chromosomeList.txt

	# set awk params as variable for parallel
	awkBody1='$8 != "."'
	awkBody2='BEGIN{FS=OFS="\t"} {t = $2; $2 = $9; $9 = t; x = $1; $1 = $8; $8 = $1; print;}'

	# parallel implementation of the hg19 format and sort
	parallel -j 4 \
	"zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz \| \
	awk '${awkBody1}' \| \
	awk '${awkBody2}' \| \
	LC_ALL=C sort --parallel=2 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip \| \
	bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt
	#TODO: need to implement a defined approach to GNU parallel, can't use $THREADS

	# cat all files back together (should retain sort order)
	cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz

	# Create tabix index
	tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz

	# test hg19 annotation
	# java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf

	# clean up
	#TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates')
	#/END hg38
	###