Skip to content

Instantly share code, notes, and snippets.

@aaronwolen
Created July 15, 2021 20:07
Show Gist options
  • Save aaronwolen/80307425a2f98ff6207b8165d633e49a to your computer and use it in GitHub Desktop.
Save aaronwolen/80307425a2f98ff6207b8165d633e49a to your computer and use it in GitHub Desktop.
Modify ClinVar VCF for ingestion into TileDB-VCF
# Remove a target whose recipe exits non-zero. Several recipes in this file
# build their target incrementally (redirects, appends, downloads); without
# this, a failed run leaves a truncated file that looks up to date next time.
.DELETE_ON_ERROR:

# genome reference: FASTA index (.fai) fetched from S3; the local copy uses the
# URL's base name
fai_url := s3://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai
fai_file := $(notdir $(fai_url))
# original clinvar vcf (downloaded as-is, local name is the URL's base name)
vcf_url := ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20210710.vcf.gz
raw_vcf := $(notdir $(vcf_url))
# modified vcf file: same name as the raw VCF with a "_modified" suffix
# inserted before ".vcf.gz"
mod_vcf := $(patsubst %.vcf.gz,%_modified.vcf.gz,$(raw_vcf))

# Default goal: fetch the reference index and raw VCF, then build the modified VCF.
all: $(fai_file) $(raw_vcf) $(mod_vcf)
# Final output: rewrite the contig lines of the intermediate VCF from
# $(fai_file), then index the result.
# Note: the recipe also writes $@.tbi as a side product. If either step fails,
# the target (and any partial index) is removed so the next run rebuilds both
# instead of treating an unindexed VCF as up to date.
# temp.vcf.gz.tbi is listed as a prerequisite but not passed on the command line.
$(mod_vcf): temp.vcf.gz temp.vcf.gz.tbi $(fai_file)
	@echo "Fix contig lengths in $@"
	@bcftools reheader -f $(fai_file) -o $@ $< || { rm -f $@; exit 1; }
	@bcftools index --tbi --force $@ || { rm -f $@ $@.tbi; exit 1; }
# Build an uncompressed copy of the raw VCF that carries a single dummy sample:
#   1. copy the "##" meta-information lines from the raw header,
#   2. append a FORMAT/GT definition,
#   3. write a #CHROM column header that includes FORMAT and "sample1",
#   4. append every record, dropping those whose line starts with
#      NW_009646201.1, with "GT" and "./." added as two extra columns.
# The raw VCF's .tbi is a prerequisite but is not named on any command line.
temp.vcf: $(raw_vcf) $(addsuffix .tbi, $(raw_vcf))
	@echo "Creating modified VCF: $@"
	@echo "..create new header with FORMAT/GT defined in $@"
	@bcftools view --header-only $< | grep "^##" > $@
	@echo "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" >> $@
	@echo "..add dummy sample column to $@"
	@# printf, not echo: whether echo expands "\t" depends on the shell, and a
	@# literal backslash-t here would corrupt the VCF column header.
	@printf '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample1\n' >> $@
	@bcftools view --no-header $< \
		| grep -v '^NW_009646201.1' \
		| awk 'BEGIN{OFS="\t"}{print $$0, "GT", "./."}' >> $@
# Download the original ClinVar VCF from $(vcf_url).
# --fail makes curl exit non-zero on an HTTP error status instead of saving the
# server's error body as the VCF (no effect on non-HTTP transfers).
# A failed or partial download is removed so it is never mistaken for a
# complete, up-to-date file.
$(raw_vcf):
	@echo "Downloading raw VCF $@"
	@curl --fail -o $@ $(vcf_url) || { rm -f $@; exit 1; }
# Fetch the reference .fai and rewrite its contig names: strip a leading "chr",
# then turn a leading "M" into "MT".
# The download goes to $@.orig and the edited copy to $@.tmp; the target name
# only appears once the edit has succeeded, so a failed run cannot leave an
# unconverted index that looks up to date. This also avoids `sed -i ''`, which
# is BSD-only syntax and fails with GNU sed.
# The two substitutions run in order on each line, which gives the same result
# as two separate passes over the file.
$(fai_file):
	@echo "Downloading $@"
	@aws s3 cp $(fai_url) $@.orig
	@echo "Removing chr prefixes"
	@echo "Renaming mitochondrial chr to MT"
	@sed -e 's|^chr||' -e 's|^M|MT|' $@.orig > $@.tmp && mv -f $@.tmp $@; \
		status=$$?; rm -f $@.orig $@.tmp; exit $$status
# Generic rule: produce <stem>.vcf.gz from <stem>.vcf by having bcftools
# rewrite it with output type "z".
%.vcf.gz: %.vcf
	@echo "Compressing $<"
	@bcftools view --output-type z -o $@ $<
# Generic rule: create the .tbi index for any <stem>.vcf.gz.
# bcftools is given the VCF ($<); the index file it writes is the target ($@).
%.vcf.gz.tbi: %.vcf.gz
	@echo "Creating index: $@"
	@bcftools index -t $<
# Remove the files this Makefile creates: the downloads, the modified VCF,
# their .tbi indexes, and any temp.* leftovers from an interrupted run.
# Only named files are deleted (no "*.tbi" glob), so unrelated indexes in the
# working directory are left alone.
clean:
	$(RM) $(fai_file) $(raw_vcf) $(mod_vcf) \
		$(addsuffix .tbi,$(raw_vcf) $(mod_vcf)) \
		temp.vcf temp.vcf.gz temp.vcf.gz.tbi

# all and clean are commands, not files; declaring both keeps a stray file
# named "all" or "clean" from masking them.
.PHONY: all clean
# Scratch files of the temp.vcf -> temp.vcf.gz -> index chain.
.INTERMEDIATE: temp.vcf temp.vcf.gz temp.vcf.gz.tbi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment