-
-
Save arupgsh/3a14a27ce914052c5921c30e6ab206d2 to your computer and use it in GitHub Desktop.
BCFtools cheat sheet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*bcftools filter | |
*Filter variants per region (in this example, print out only variants mapped to chr1 and chr2) | |
qbcftools filter -r1,2 ALL.chip.omni_broad_sanger_combined.20140818.snps.genotypes.hg38.vcf.gz | |
*printing out info for only 2 samples: | |
bcftools view -s NA20818,NA20819 filename.vcf.gz | |
*printing stats only for variants passing the filter: | |
bcftools view -f PASS filename.vcf.gz | |
*printing variants withoud header: | |
bcftools view -H | |
*printing variants on a particular region: | |
bcftools view -r chr20:1-200000 -s NA20818,NA20819 filename.vcf.gz | |
*print all variants except for the ones falling within region: | |
bcftools view -t ^chr20:1-30000000 ex_bams.samtools.20161231.vcf.gz >out.vcf | |
*view the positions passed in a file (accepted files are .vcf and .bed): | |
bcftools view -R 0002.vcf in.vcf.gz | |
*view the positions passed in a tsv file: | |
bcftools view -R 0002.tsv in.vcf.gz | |
# The format of 0002.tsv: | |
20 79000 80000 | |
20 90000 100000 | |
*now, print out the chr\tpos | |
bcftools query -f '%CHROM\t%POS\n' filename.vcf | |
/ | |
*now, print out the AF INFO field | |
bcftools query -f '%INFO/AF\n' | |
/ | |
#getting a particular annotation from the VCF | |
bcftools query -f '%QUAL\n' 0002.vcf | |
/ | |
#printing chr pos and a particular annotation from a VCF: | |
bcftools query -f '%CHROM\t%POS\t%INFO/DP\n' in.vcf.gz | |
/ | |
#printing out the sets assigned by GATK CombineVariants | |
~/bin/bcftools-1.6/bcftools query -f '%set\n' out_combine.vcf.gz |sort |uniq | |
/ | |
0#printing a list of samples from a VCF: | |
bcftools query -l test.vcf | |
*selecting snps from file: | |
~/bin/bcftools/bcftools view -v snps lc_bams.bcftools.20170319.NA12878.vcf.gz | |
/ | |
*selecting the variants from a VCF (excluding 0|0 genotypes) | |
bcftools view -c1 input.vcf | |
/ | |
*selecting the non-variants from a VCF(AC=0) | |
bcftools view -H -C0 concat.allchrs.sites.vcf.gz | |
/ | |
#filtering: | |
/ | |
#using one of the INFO annotations (IDV) | |
bcftools filter -sFilterName -e'IDV<5' input.vcf | |
/ | |
#OR logical operator: | |
bcftools filter -sFilterName -e'DP>50000 | IDV<9' input.vcf | |
/ | |
#filtering on FORMAT annotation: | |
bcftools filter -sFilterName -e'FORMAT/DP<5' input.vcf | |
/ | |
#filtering on INFO annotation: | |
bcftools filter -sFilterName -e'INFO/DP<5' input.vcf | |
/ | |
#printing out variants that pass the filter: | |
~/bin/bcftools/bcftools view -f.,PASS lc_bams.bcftools.20170411.exc.norm.SNP.filtered.vcf.gz | |
/ | |
#bcftools stats and filtering: | |
~/bin/bcftools/bcftools stats -f "PASS,." file.vcf | |
/ | |
#select only biallelic (excluding multiallelic) snps | |
bcftools view -m2 -M2 -v snps input.vcf.gz | |
/ | |
#select only the multiallelic snps | |
bcftools view -m3 -v snps input.vcf.gz | |
/ | |
#printing the set info in the INFO field: | |
bcftools view -i 'set="freebayes_lcex"' combined.all.chr20.vcf.gz | |
/ | |
#printing all entries having a quality <10 | |
bcftools view -i 'QUAL<10' in.vcf.gz | |
/ | |
#removing FORMAT column from the VCF | |
#it will remove all FORMAT annotations except the GT information | |
bcftools annotate -x FORMAT ifile.vcf.gz | |
/ | |
#removing INFO field from VCF | |
bcftools annotate --remove INFO in.vcf.gz | |
/ | |
#annotating a vcf file using the annotations from a different VCF (in this case we only annotate the INFO/DP) | |
bcftools annotate -c 'INFO/DP' -a annt.vcf.gz in.vcf.gz | |
/ | |
#annotating a vcf file with a tabular file: | |
see page https://github.com/samtools/bcftools/wiki/HOWTOs#annotate-from-bed | |
/ | |
#drop individual genotype information | |
bcftools view -G input.vcf.gz | |
/ | |
#getting stats on the number of REF/ALT swaps and other things: | |
bcftools +fixref file.bcf -- -f ref.fa | |
/ | |
#correcting the REF/ALT swaps: | |
bcftools norm --check-ref ws -f ref.fa in.vcf.gz -o out.vcf.gz -Oz | |
// | |
#changing the sample names in a VCF: | |
#the samplenames.txt file has the following format: | |
#oldsamplename newsamplename | |
bcftools reheader -s samplenames.txt NA12878.giab.SNP.chr20.non_valid.vcf.gz -o NA12878.giab.SNP.chr20.non_valid.reheaded.vcf.gz | |
// | |
#changing the header: | |
bcftools reheader -h newheader.txt ../combined.all.chr20.vcf.gz.snps.biallelic.vcf_chr20.vcf.gz.ensembl.vcf.gz.phased.ligated.NA12878.ucsc.vcf.gz -o combined.all.chr20.vcf.gz.snps.biallelic.vcf_chr20.vcf.gz.ensembl.vcf.gz.phased.ligated.NA12878.ucsc.reheaded.vcf.gz | |
// | |
# in order to use the plugins: | |
export BCFTOOLS_PLUGINS=~/bin/bcftools-1.6/plugins/ | |
// | |
# taqg2tag: | |
# convert PL to GL | |
bcftools +tag2tag in.vcf -- -r --pl-to-gl | |
// | |
# normalizing the multiallelic variants: | |
# with -any I will split the multiallelic variants (SNPs+INDELs) into several records | |
bcftools norm -m -any in.vcf.gz -o out.norm.vcf.gz -Oz | |
For example: | |
chr20 60280 . TTTCCA TTTCCATTCCA,T 744 PASS . | |
Will be converted to: | |
chr20 60280 . TTTCCA TTTCCATTCCA 744 PASS . | |
chr20 60280 . TTTCCA T 744 PASS . | |
// | |
#selecting the missing (uncalled) genotypes: | |
bcftools view -u in.vcf.gz -o missing_genotypes.vcf.gz -Oz | |
// | |
#select a particular genotype (0/1 or 1/1) from a vcf. In this case access sample accessed by index 8: | |
bcftools view -H combined.snps_indels_chr1.filt.vcf.gz.onlyvariants.vcf.gz.ensembl.vcf.gz.85706.vcf.gz -i 'GT[8]="het"' | |
// | |
#select all lines having exactly AC=2 | |
bcftools view -i'AC=2' in.vcf.gz | |
// | |
# if we have a tab in a VCF defined in the header like: | |
##INFO=<ID=GRCH37_38_REF_STRING_MATCH,Number=0,Type=Flag,Description="Indicates reference allele in origin GRCh37 vcf string-matches reference allele in dbsnp GRCh38 vcf"> | |
/ | |
#we can check for the records having this tag by doing: | |
bcftools view -H -i'GRCH37_38_REF_STRING_MATCH=1' ALL.chr7_GRCh38.genotypes.20170504.ensembl.vcf.NA12878.biallelic.nonvariants.nonvalid.snps.vcf.gz |less | |
/ | |
#And the contrary by doing: | |
bcftools view -H -i'GRCH37_38_REF_STRING_MATCH=0' ALL.chr7_GRCh38.genotypes.20170504.ensembl.vcf.NA12878.biallelic.nonvariants.nonvalid.snps.vcf.gz |less | |
/ | |
# Filtering a VCF depending on a certain Allele frequency: | |
bcftools view -i 'INFO/AF > 0.8' z.vcf.gz |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment