-
-
Save dariogf/5e7171586cd108a7d053b2132b20e490 to your computer and use it in GitHub Desktop.
small bash script to download dbNSFP 'database' and wrangle into format for pipeline annotation process
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
## small bash script to download and reformat dbNSFP for pipeline | |
## Miles Benton | |
## created: 2018-01-13 | |
## modified: 2019-08-21 | |
# Set to dbNSFP version to download and build | |
version="4.0a" | |
#TODO: add an option to 'scrape' this from the url to always return latest version | |
# define thread number for parallel processing where able | |
THREADS=6 | |
# Download dbNSFP database | |
wget -O dbNSFP${version}.zip "ftp://dbnsfp:[email protected]/dbNSFP${version}.zip" | |
# Uncompress | |
unzip dbNSFP${version}.zip | |
# grab header | |
zcat dbNSFP${version}_variant.chr1.gz | head -n 1 | bgzip > header.gz | |
### this section will produce data for hg38 capable pipelines | |
## hg38 version | |
# Create a single file version | |
# NOTE: bgzip parameter -@ X represents number of threads | |
cat dbNSFP${version}_variant.chr{1..22}.gz dbNSFP${version}_variant.chrX.gz dbNSFP${version}_variant.chrY.gz dbNSFP${version}_variant.chrM.gz | zgrep -v '#chr' | bgzip -@ ${THREADS} > dbNSFPv${version}_custom.gz | |
#TODO: there must be a 'nicer' way of ordering the input into the cat (to include the X,Y and M chrs without explicitly stating them) | |
# add header back into file | |
cat header.gz dbNSFPv${version}_custom.gz > dbNSFPv${version}_custombuild.gz | |
# Create tabix index | |
tabix -s 1 -b 2 -e 2 dbNSFPv${version}_custombuild.gz | |
# test annotation | |
# java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}_custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf | |
#TODO: provide actual unit test files for testing purposes, i.e. a section of public data with known annotation rates. | |
#TODO: the above is currently a placeholder but it had it's intended purpose in terms of identifying incorrect genome build. | |
# clean up | |
#TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates') | |
#/END hg38 | |
### | |
### this section will produce data for hg19 capable pipelines | |
## hg19 version | |
# for hg19 (coordinate data is located in columns 8 [chr] and 9 [position]) | |
# this takes output from above, filters out any variants with no hg19 coords and then sorts on hg19 chr and position, and then bgzips output | |
# NOTE: bgzip parameter -@ X represents number of threads | |
# create a file with ordered chrosome names | |
echo {1..22} X Y M | tr ' ' '\n' > chromosomeList.txt | |
# set awk params as variable for parallel | |
awkBody1='$8 != "."' | |
awkBody2='BEGIN{FS=OFS="\t"} {t = $2; $2 = $9; $9 = t; x = $1; $1 = $8; $8 = x; print;}' | |
# parallel implementation of the hg19 format and sort | |
parallel -j 4 \ | |
"zgrep -v '#chr' dbNSFP${version}_variant.chr{}.gz | \ | |
awk '${awkBody1}' | \ | |
awk '${awkBody2}' | grep '^{}' | \ | |
LC_ALL=C sort --parallel=2 -n -S 1G -T . -k 1,1 -k 2,2 --compress-program=gzip | \ | |
bgzip -@ 2 > dbNSFPv${version}.chr{}.hg19.custombuild.gz" :::: chromosomeList.txt | |
#TODO: need to implement a defined approach to GNU parallel, can't use $THREADS | |
# cat all files back together (should retain sort order) | |
cat header.gz dbNSFPv${version}.chr{1..22}.hg19.custombuild.gz dbNSFPv${version}.chrX.hg19.custombuild.gz dbNSFPv${version}.chrY.hg19.custombuild.gz dbNSFPv${version}.chrM.hg19.custombuild.gz > dbNSFPv${version}.hg19.custombuild.gz | |
# Create tabix index | |
tabix -s 1 -b 2 -e 2 dbNSFPv${version}.hg19.custombuild.gz | |
# test hg19 annotation | |
# java -jar ~/install/snpEff/SnpSift.jar dbnsfp -v -db /mnt/dbNSFP/hg19/dbNSFPv${version}.hg19.custombuild.gz test/chr1_test.vcf > test/chr1_test_anno.vcf | |
# clean up | |
#TODO: add clean up step to rm all intermediate files after testing confirmed working (i.e. correct annotation 'rates') | |
#/END hg38 | |
### |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment