Skip to content

Instantly share code, notes, and snippets.

@brentp
Created November 4, 2013 20:04
Show Gist options
  • Save brentp/7308421 to your computer and use it in GitHub Desktop.
Save brentp/7308421 to your computer and use it in GitHub Desktop.
calculate probability of single rare snps using a binomial test and adjusting observed counts such that 2 members from the same family will contribute a number less that 2, depending on their relatedness. This allows us to use family information, but not to over-count rare-alleles due to their presence in families.
library(SNPRelate)
library(FDb.UCSC.snp137common.hg19)
library(hash)
args = commandArgs(TRUE)
vcf.fn = args[1]
gds.fn = paste0(args[1], ".gds")
message(gds.fn)
genofile = snpgdsVCF2GDS(vcf.fn, gds.fn, method="biallelic.only", snpfirstdim=TRUE)
genofile <- openfn.gds(gds.fn)
sample.ids <- read.gdsn(index.gdsn(genofile, "sample.id"))
snp.id = read.gdsn(index.gdsn(genofile, "snp.id"))
chroms = read.gdsn(index.gdsn(genofile, "snp.chromosome"))
posns = read.gdsn(index.gdsn(genofile, "snp.position"))
gr = GRanges(seqnames=chroms, ranges=IRanges(posns, posns))
locs = paste(chroms, posns, sep=":")
ibd <- snpgdsIBDMoM(genofile,
sample.id=sample.ids , snp.id=snp.id,
maf=0.02,
kinship=TRUE,
missing.rate=0.05)
snp137Common = features(FDb.UCSC.snp137common.hg19)
snp137Common = renameSeqlevels(snp137Common, gsub("^chr", "", seqlevels(snp137Common)))
# remove any in snp137Common
common = overlapsAny(gr, snp137Common)
ibd.robust <- snpgdsIBDKING(genofile, maf=0.001, missing.rate=0.05)
ibs = snpgdsIBDSelection(ibd.robust, kinship.cutoff=1/16)
# a and b are indicators for the presence of a rare allele.
# kin.coef is their kinship coef. population.af is the allele
# freq from a database or from controls
# can require both members to have the variant.
k.eff = function(a, b, kin.coef, population.af, require.both=FALSE){
# require both to carry the allele to do the complex calc.
a = as.integer(a != 0)
b = as.integer(b != 0)
# see AJHG 89 pg 703.
if(0 %in% c(a, b)) return(ifelse(require.both, 0, a + b))
# if closer than first cousins (1/16), delta is 1
delta = ifelse(kin.coef >= 1/24, 1, 0)
f = population.af
# straight from ionita-laza
k = log(
(4 * f * kin.coef) +
(4 * f^2) *
(1 - 4 * kin.coef + 4 * delta * kin.coef ^2),
base=2*f)
if(!(k >= 1)){ stop(paste(k, a, b, kin.coef, population.af)) }
stopifnot(k >= 1)
k
}
k.eff.sum = function(ibs, genotypes, require.both=FALSE){
if(sum(genotypes > 0) > 0.1 * length(genotypes)){ return(NA)}
if(sum(genotypes > 0) < 3){ return(NA)}
s = 0
seen = hash()
for(irow in 1:nrow(ibs)){
row = ibs[irow,]
seen.count = sum(!is.null(seen[[row$ID1]]), !is.null(seen[[row$ID2]]))
# we've already checked both of these individuals
if(seen.count == 2){ next }
g1 = genotypes[[row$ID1]]
g2 = genotypes[[row$ID2]]
if(g1 * g2 == 0){ next }
seen[[row$ID1]] = TRUE
seen[[row$ID2]] = TRUE
#k=k.eff(g1, g2, row$kinship, sum(genotypes)/length(genotypes), require.both)
# NOTE: using fixed frequency here!!
k=k.eff(g1, g2, row$kinship, 0.007, require.both)
s = s + k - seen.count
}
# need at least some indication of a family effect
#if(s == 0){ return(0) }
# add the singletons
for(name in names(genotypes)){
if(is.null(seen[[name]])){
s = s + (genotypes[[name]] > 0)
}
}
s
}
rgenotypes = snpgdsGetGeno(genofile, snp.id=snp.id, snpfirstdim=TRUE)
#rownames(rgenotypes) = snp.id
colnames(rgenotypes) = sample.ids
rgenotypes = rgenotypes[!common,]
ulocs = locs[!common]
r2 = rgenotypes
r2[!is.element(r2, c(0,1,2))] <- NA
af = rowMeans(r2, na.rm=TRUE) / 2
fams = union(ibs$ID1, ibs$ID2)
# guess a conservative sample-size as this goes lower we are more likely to see significance
sample_size = ncol(rgenotypes) # - length(fams) / 5
message(paste0('sample-size:', sample_size))
write(sprintf("chrom\tstart\tend\tp\tprop_w_alt\tsum(g)\tk.eff.sum\taf"), stdout())
for(isnp in 1:nrow(rgenotypes)){
cpos = strsplit(ulocs[isnp], ":", fixed=TRUE)[[1]]
chrom = cpos[1]
if(chrom == "23"){ next }
position = as.integer(cpos[2]); var.counts = NULL
# uses 3 for missing data.
bad = sum(c(rgenotypes[isnp,]) == 3)
if(bad > 10){ next }
if(af[isnp] < 0.5){
var.counts = as.integer(c(rgenotypes[isnp,]) > 0 & c(rgenotypes[isnp,]) != 3)
} else {
# alleles are switched so 0,1 is rare state
var.counts = as.integer(c(rgenotypes[isnp,]) < 2)
}
# never seen
if(sum(var.counts) == 0){ next }
#if(sum(var.counts) < 4){ next }
# very common
if(mean(var.counts) > 0.2){ next }
names(var.counts) = sample.ids
# TODO: get AF from dbsnp.
population.af=sum(var.counts) / length(var.counts)
# at most, 1 out of 100 people have the variant. in the general pop.
population.freq = 0.02 # this should give conservative values for result.
s = k.eff.sum(ibs, var.counts, require.both=TRUE)
if(!is.na(s)){
# no family effect
#if(s == sum(var.counts)){ next }
# how to set # trials.
pv = pbinom(s - 1, sample_size, population.freq, lower.tail=FALSE)
if(pv > 1e-6){ next }
write(sprintf("chr%s\t%i\t%i\t%.4g\t%.3f\t%d\t%.2f\t%.3f",
chrom, position - 1, position,
pv, sum(var.counts)/length(var.counts),
sum(var.counts), s,
min(af[isnp], 1-af[isnp])), stdout())
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment