Last active
August 10, 2022 14:53
-
-
Save vjcitn/5953ea4531805b61b58c44ad96c010ec to your computer and use it in GitHub Desktop.
use VariantAnnotation::readVcf etc. to get allelic depth count matrices from VCF
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' produce matrices of AD values from VcfFile instance for a given genomic interval | |
#' @param vf instance of VcfFile, should be tabix indexed | |
#' @param rng compatible GRanges instance | |
#' @param genome character(1) obligatory for readVcf | |
#' @return list with matrices allele1 and allele2, similar to the AD matrix, and ref and alt as obtained directly | |
#' @examples | |
#' x = VariantAnnotation::VcfFile("ALL.chrX.BI_Beagle.20100804.genotypes.vcf.gz") | |
#' param = GRanges("X", IRanges(60000, width=10000)) | |
#' m = ad_mats(x, param) | |
#' m$allele1[1:4,1:10] | |
#' m$allele2[1:4,1:10] | |
# | |
# m$allele1[1:4,1:10] | |
# HG00098 HG00100 HG00106 HG00112 HG00114 HG00116 HG00117 HG00118 | |
# X:60009_A/C 1 4 2 NA 1 3 7 NA | |
# X:60010_A/G 1 4 3 NA 3 3 7 NA | |
# X:60015_A/C 1 4 3 NA NA 3 7 NA | |
# X:60021_A/C 0 4 4 1 NA 4 7 NA | |
# HG00119 HG00120 | |
# X:60009_A/C NA 11 | |
# X:60010_A/G NA 12 | |
# X:60015_A/C NA 13 | |
# X:60021_A/C NA 14 | |
# | |
# > m$allele2[1:4,1:10] | |
# HG00098 HG00100 HG00106 HG00112 HG00114 HG00116 HG00117 HG00118 | |
# X:60009_A/C 0 0 1 NA 0 0 0 NA | |
# X:60010_A/G 0 0 0 NA 0 0 0 NA | |
# X:60015_A/C 0 0 0 NA NA 0 0 NA | |
# X:60021_A/C 0 0 0 1 NA 0 0 NA | |
# HG00119 HG00120 | |
# X:60009_A/C NA 0 | |
# X:60010_A/G NA 0 | |
# X:60015_A/C NA 0 | |
# X:60021_A/C NA 1 | |
# | |
#' @export | |
ad_mats = function(vf, rng, genome="hg19") { | |
z = readVcf(vf, genome=genome, param=rng) # import a slice | |
az = alt(z) | |
rz = as.character(ref(z)) # ref has length 1 | |
# | |
# build functions that will extract AD fields | |
# | |
null2na1 = function(x) ifelse(length(x[1])==0,NA,x[1]) | |
null2na2 = function(x) ifelse(length(x[2])==0,NA,x[2]) | |
# | |
# get counts for alleles 1 and 2 -- it is the '0-length' matrix elements that are problematic | |
# | |
ad = geno(z)$AD # should be a matrix with i,j element either a 2-vector of integer or a 0-vector of integer | |
a1 = t(apply(ad,1,sapply,null2na1)) | |
a2 = t(apply(ad,1,sapply,null2na2)) | |
dimnames(a1) = dimnames(a2) = dimnames(ad) | |
list(allele1=a1, allele2=a2, ref=rz, alt=az) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment