Created
November 4, 2013 20:04
-
-
Save brentp/7308421 to your computer and use it in GitHub Desktop.
calculate probability of single rare snps using a binomial test and adjusting observed counts such that 2 members from the same family will contribute a number less that 2, depending on their relatedness. This allows us to use family information, but not to over-count rare-alleles due to their presence in families.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(SNPRelate) | |
library(FDb.UCSC.snp137common.hg19) | |
library(hash) | |
args = commandArgs(TRUE) | |
vcf.fn = args[1] | |
gds.fn = paste0(args[1], ".gds") | |
message(gds.fn) | |
genofile = snpgdsVCF2GDS(vcf.fn, gds.fn, method="biallelic.only", snpfirstdim=TRUE) | |
genofile <- openfn.gds(gds.fn) | |
sample.ids <- read.gdsn(index.gdsn(genofile, "sample.id")) | |
snp.id = read.gdsn(index.gdsn(genofile, "snp.id")) | |
chroms = read.gdsn(index.gdsn(genofile, "snp.chromosome")) | |
posns = read.gdsn(index.gdsn(genofile, "snp.position")) | |
gr = GRanges(seqnames=chroms, ranges=IRanges(posns, posns)) | |
locs = paste(chroms, posns, sep=":") | |
ibd <- snpgdsIBDMoM(genofile, | |
sample.id=sample.ids , snp.id=snp.id, | |
maf=0.02, | |
kinship=TRUE, | |
missing.rate=0.05) | |
snp137Common = features(FDb.UCSC.snp137common.hg19) | |
snp137Common = renameSeqlevels(snp137Common, gsub("^chr", "", seqlevels(snp137Common))) | |
# remove any in snp137Common | |
common = overlapsAny(gr, snp137Common) | |
ibd.robust <- snpgdsIBDKING(genofile, maf=0.001, missing.rate=0.05) | |
ibs = snpgdsIBDSelection(ibd.robust, kinship.cutoff=1/16) | |
# a and b are indicators for the presence of a rare allele. | |
# kin.coef is their kinship coef. population.af is the allele | |
# freq from a database or from controls | |
# can require both members to have the variant. | |
k.eff = function(a, b, kin.coef, population.af, require.both=FALSE){ | |
# require both to carry the allele to do the complex calc. | |
a = as.integer(a != 0) | |
b = as.integer(b != 0) | |
# see AJHG 89 pg 703. | |
if(0 %in% c(a, b)) return(ifelse(require.both, 0, a + b)) | |
# if closer than first cousins (1/16), delta is 1 | |
delta = ifelse(kin.coef >= 1/24, 1, 0) | |
f = population.af | |
# straight from ionita-laza | |
k = log( | |
(4 * f * kin.coef) + | |
(4 * f^2) * | |
(1 - 4 * kin.coef + 4 * delta * kin.coef ^2), | |
base=2*f) | |
if(!(k >= 1)){ stop(paste(k, a, b, kin.coef, population.af)) } | |
stopifnot(k >= 1) | |
k | |
} | |
k.eff.sum = function(ibs, genotypes, require.both=FALSE){ | |
if(sum(genotypes > 0) > 0.1 * length(genotypes)){ return(NA)} | |
if(sum(genotypes > 0) < 3){ return(NA)} | |
s = 0 | |
seen = hash() | |
for(irow in 1:nrow(ibs)){ | |
row = ibs[irow,] | |
seen.count = sum(!is.null(seen[[row$ID1]]), !is.null(seen[[row$ID2]])) | |
# we've already checked both of these individuals | |
if(seen.count == 2){ next } | |
g1 = genotypes[[row$ID1]] | |
g2 = genotypes[[row$ID2]] | |
if(g1 * g2 == 0){ next } | |
seen[[row$ID1]] = TRUE | |
seen[[row$ID2]] = TRUE | |
#k=k.eff(g1, g2, row$kinship, sum(genotypes)/length(genotypes), require.both) | |
# NOTE: using fixed frequency here!! | |
k=k.eff(g1, g2, row$kinship, 0.007, require.both) | |
s = s + k - seen.count | |
} | |
# need at least some indication of a family effect | |
#if(s == 0){ return(0) } | |
# add the singletons | |
for(name in names(genotypes)){ | |
if(is.null(seen[[name]])){ | |
s = s + (genotypes[[name]] > 0) | |
} | |
} | |
s | |
} | |
rgenotypes = snpgdsGetGeno(genofile, snp.id=snp.id, snpfirstdim=TRUE) | |
#rownames(rgenotypes) = snp.id | |
colnames(rgenotypes) = sample.ids | |
rgenotypes = rgenotypes[!common,] | |
ulocs = locs[!common] | |
r2 = rgenotypes | |
r2[!is.element(r2, c(0,1,2))] <- NA | |
af = rowMeans(r2, na.rm=TRUE) / 2 | |
fams = union(ibs$ID1, ibs$ID2) | |
# guess a conservative sample-size as this goes lower we are more likely to see significance | |
sample_size = ncol(rgenotypes) # - length(fams) / 5 | |
message(paste0('sample-size:', sample_size)) | |
write(sprintf("chrom\tstart\tend\tp\tprop_w_alt\tsum(g)\tk.eff.sum\taf"), stdout()) | |
for(isnp in 1:nrow(rgenotypes)){ | |
cpos = strsplit(ulocs[isnp], ":", fixed=TRUE)[[1]] | |
chrom = cpos[1] | |
if(chrom == "23"){ next } | |
position = as.integer(cpos[2]); var.counts = NULL | |
# uses 3 for missing data. | |
bad = sum(c(rgenotypes[isnp,]) == 3) | |
if(bad > 10){ next } | |
if(af[isnp] < 0.5){ | |
var.counts = as.integer(c(rgenotypes[isnp,]) > 0 & c(rgenotypes[isnp,]) != 3) | |
} else { | |
# alleles are switched so 0,1 is rare state | |
var.counts = as.integer(c(rgenotypes[isnp,]) < 2) | |
} | |
# never seen | |
if(sum(var.counts) == 0){ next } | |
#if(sum(var.counts) < 4){ next } | |
# very common | |
if(mean(var.counts) > 0.2){ next } | |
names(var.counts) = sample.ids | |
# TODO: get AF from dbsnp. | |
population.af=sum(var.counts) / length(var.counts) | |
# at most, 1 out of 100 people have the variant. in the general pop. | |
population.freq = 0.02 # this should give conservative values for result. | |
s = k.eff.sum(ibs, var.counts, require.both=TRUE) | |
if(!is.na(s)){ | |
# no family effect | |
#if(s == sum(var.counts)){ next } | |
# how to set # trials. | |
pv = pbinom(s - 1, sample_size, population.freq, lower.tail=FALSE) | |
if(pv > 1e-6){ next } | |
write(sprintf("chr%s\t%i\t%i\t%.4g\t%.3f\t%d\t%.2f\t%.3f", | |
chrom, position - 1, position, | |
pv, sum(var.counts)/length(var.counts), | |
sum(var.counts), s, | |
min(af[isnp], 1-af[isnp])), stdout()) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment