brentp · November 4, 2013 20:04
diff --git a/kin.R b/kin.R
 library(SNPRelate)
 library(FDb.UCSC.snp137common.hg19)
 library(hash)

 args = commandArgs(TRUE)

 vcf.fn = args[1]


 gds.fn = paste0(args[1], ".gds")
 message(gds.fn)
 genofile = snpgdsVCF2GDS(vcf.fn, gds.fn, method="biallelic.only", snpfirstdim=TRUE)
 genofile <- openfn.gds(gds.fn)

 sample.ids <- read.gdsn(index.gdsn(genofile, "sample.id"))
 snp.id = read.gdsn(index.gdsn(genofile, "snp.id"))
 chroms = read.gdsn(index.gdsn(genofile, "snp.chromosome"))
 posns = read.gdsn(index.gdsn(genofile, "snp.position"))

 gr = GRanges(seqnames=chroms, ranges=IRanges(posns, posns))
 locs = paste(chroms, posns, sep=":")

 ibd <- snpgdsIBDMoM(genofile,
                    sample.id=sample.ids , snp.id=snp.id,
                    maf=0.02,
                    kinship=TRUE,
                    missing.rate=0.05)

 snp137Common = features(FDb.UCSC.snp137common.hg19)
 snp137Common = renameSeqlevels(snp137Common, gsub("^chr", "", seqlevels(snp137Common)))
 # remove any in snp137Common
 common = overlapsAny(gr, snp137Common)


 ibd.robust <- snpgdsIBDKING(genofile, maf=0.001, missing.rate=0.05)
 ibs = snpgdsIBDSelection(ibd.robust, kinship.cutoff=1/16)

 # a and b are indicators for the presence of a rare allele.
 # kin.coef is their kinship coef. population.af is the allele
 # freq from a database or from controls
 # can require both members to have the variant.
 k.eff = function(a, b, kin.coef, population.af, require.both=FALSE){
    # require both to carry the allele to do the complex calc.
    a = as.integer(a != 0)
    b = as.integer(b != 0)
    # see AJHG 89 pg 703.
    if(0 %in% c(a, b)) return(ifelse(require.both, 0, a + b)) 
    
    # if closer than first cousins (1/16), delta is 1
    delta = ifelse(kin.coef >= 1/24, 1, 0)
    f = population.af

    # straight from ionita-laza 
    k = log(
        (4 * f * kin.coef) +
        (4 * f^2) *
        (1 - 4 * kin.coef + 4 * delta * kin.coef ^2),
        base=2*f)

    if(!(k >= 1)){ stop(paste(k, a, b, kin.coef, population.af)) }
    stopifnot(k >= 1)
    k
 }

 k.eff.sum = function(ibs, genotypes, require.both=FALSE){
    if(sum(genotypes > 0) > 0.1 * length(genotypes)){ return(NA)}
    if(sum(genotypes > 0) < 3){ return(NA)}
    s = 0
    seen = hash()
    for(irow in 1:nrow(ibs)){
        row = ibs[irow,]
        seen.count = sum(!is.null(seen[[row$ID1]]), !is.null(seen[[row$ID2]]))
        # we've already checked both of these individuals
        if(seen.count == 2){ next }
        g1 = genotypes[[row$ID1]]
        g2 = genotypes[[row$ID2]]
        if(g1 * g2 == 0){ next }
        seen[[row$ID1]] = TRUE
        seen[[row$ID2]] = TRUE

        #k=k.eff(g1, g2, row$kinship, sum(genotypes)/length(genotypes), require.both)
        # NOTE: using fixed frequency here!!
        k=k.eff(g1, g2, row$kinship, 0.007, require.both)

        s = s + k - seen.count

    }
    # need at least some indication of a family effect
    #if(s == 0){ return(0) }
    # add the singletons
    for(name in names(genotypes)){
        if(is.null(seen[[name]])){
            s = s + (genotypes[[name]] > 0)
        }
    }
    s
 }

 rgenotypes = snpgdsGetGeno(genofile, snp.id=snp.id, snpfirstdim=TRUE)
 #rownames(rgenotypes) = snp.id
 colnames(rgenotypes) = sample.ids
 rgenotypes = rgenotypes[!common,]

 ulocs = locs[!common]

 r2 = rgenotypes
 r2[!is.element(r2, c(0,1,2))] <- NA
 af = rowMeans(r2, na.rm=TRUE) / 2



 fams = union(ibs$ID1, ibs$ID2)


 # guess a conservative sample-size as this goes lower we are more likely to see significance
 sample_size = ncol(rgenotypes) # - length(fams) / 5
 message(paste0('sample-size:', sample_size))

 write(sprintf("chrom\tstart\tend\tp\tprop_w_alt\tsum(g)\tk.eff.sum\taf"), stdout())
 for(isnp in 1:nrow(rgenotypes)){
    cpos = strsplit(ulocs[isnp], ":", fixed=TRUE)[[1]]
    chrom = cpos[1]
    if(chrom == "23"){ next }
    position = as.integer(cpos[2]); var.counts = NULL

    # uses 3 for missing data.
    bad = sum(c(rgenotypes[isnp,]) == 3)
    if(bad > 10){ next }
    if(af[isnp] < 0.5){
        var.counts = as.integer(c(rgenotypes[isnp,]) > 0 & c(rgenotypes[isnp,]) != 3)
    } else {
        # alleles are switched so 0,1 is rare state
        var.counts = as.integer(c(rgenotypes[isnp,]) < 2)
    }
    # never seen
    if(sum(var.counts) == 0){ next }
    #if(sum(var.counts) < 4){ next }
    # very common
    if(mean(var.counts) > 0.2){ next }
    names(var.counts) = sample.ids

    # TODO: get AF from dbsnp.
    population.af=sum(var.counts) / length(var.counts)
    # at most, 1 out of 100 people have the variant. in the general pop.
    population.freq = 0.02 # this should give conservative values for result.

    s = k.eff.sum(ibs, var.counts, require.both=TRUE)
    if(!is.na(s)){
        # no family effect
        #if(s == sum(var.counts)){ next }
        # how to set # trials.
        pv = pbinom(s - 1, sample_size, population.freq, lower.tail=FALSE)
        if(pv > 1e-6){ next }
        write(sprintf("chr%s\t%i\t%i\t%.4g\t%.3f\t%d\t%.2f\t%.3f", 
                     chrom, position - 1, position,
                     pv, sum(var.counts)/length(var.counts),
                      sum(var.counts), s,
                      min(af[isnp], 1-af[isnp])), stdout())
        }
 }
	library(SNPRelate)
	library(FDb.UCSC.snp137common.hg19)
	library(hash)

	args = commandArgs(TRUE)

	vcf.fn = args[1]


	gds.fn = paste0(args[1], ".gds")
	message(gds.fn)
	genofile = snpgdsVCF2GDS(vcf.fn, gds.fn, method="biallelic.only", snpfirstdim=TRUE)
	genofile <- openfn.gds(gds.fn)

	sample.ids <- read.gdsn(index.gdsn(genofile, "sample.id"))
	snp.id = read.gdsn(index.gdsn(genofile, "snp.id"))
	chroms = read.gdsn(index.gdsn(genofile, "snp.chromosome"))
	posns = read.gdsn(index.gdsn(genofile, "snp.position"))

	gr = GRanges(seqnames=chroms, ranges=IRanges(posns, posns))
	locs = paste(chroms, posns, sep=":")

	ibd <- snpgdsIBDMoM(genofile,
	sample.id=sample.ids , snp.id=snp.id,
	maf=0.02,
	kinship=TRUE,
	missing.rate=0.05)

	snp137Common = features(FDb.UCSC.snp137common.hg19)
	snp137Common = renameSeqlevels(snp137Common, gsub("^chr", "", seqlevels(snp137Common)))
	# remove any in snp137Common
	common = overlapsAny(gr, snp137Common)


	ibd.robust <- snpgdsIBDKING(genofile, maf=0.001, missing.rate=0.05)
	ibs = snpgdsIBDSelection(ibd.robust, kinship.cutoff=1/16)

	# a and b are indicators for the presence of a rare allele.
	# kin.coef is their kinship coef. population.af is the allele
	# freq from a database or from controls
	# can require both members to have the variant.
	k.eff = function(a, b, kin.coef, population.af, require.both=FALSE){
	# require both to carry the allele to do the complex calc.
	a = as.integer(a != 0)
	b = as.integer(b != 0)
	# see AJHG 89 pg 703.
	if(0 %in% c(a, b)) return(ifelse(require.both, 0, a + b))

	# if closer than first cousins (1/16), delta is 1
	delta = ifelse(kin.coef >= 1/24, 1, 0)
	f = population.af

	# straight from ionita-laza
	k = log(
	(4 * f * kin.coef) +
	(4 * f^2) *
	(1 - 4 * kin.coef + 4 * delta * kin.coef ^2),
	base=2*f)

	if(!(k >= 1)){ stop(paste(k, a, b, kin.coef, population.af)) }
	stopifnot(k >= 1)
	k
	}

	k.eff.sum = function(ibs, genotypes, require.both=FALSE){
	if(sum(genotypes > 0) > 0.1 * length(genotypes)){ return(NA)}
	if(sum(genotypes > 0) < 3){ return(NA)}
	s = 0
	seen = hash()
	for(irow in 1:nrow(ibs)){
	row = ibs[irow,]
	seen.count = sum(!is.null(seen[[row$ID1]]), !is.null(seen[[row$ID2]]))
	# we've already checked both of these individuals
	if(seen.count == 2){ next }
	g1 = genotypes[[row$ID1]]
	g2 = genotypes[[row$ID2]]
	if(g1 * g2 == 0){ next }
	seen[[row$ID1]] = TRUE
	seen[[row$ID2]] = TRUE

	#k=k.eff(g1, g2, row$kinship, sum(genotypes)/length(genotypes), require.both)
	# NOTE: using fixed frequency here!!
	k=k.eff(g1, g2, row$kinship, 0.007, require.both)

	s = s + k - seen.count

	}
	# need at least some indication of a family effect
	#if(s == 0){ return(0) }
	# add the singletons
	for(name in names(genotypes)){
	if(is.null(seen[[name]])){
	s = s + (genotypes[[name]] > 0)
	}
	}
	s
	}

	rgenotypes = snpgdsGetGeno(genofile, snp.id=snp.id, snpfirstdim=TRUE)
	#rownames(rgenotypes) = snp.id
	colnames(rgenotypes) = sample.ids
	rgenotypes = rgenotypes[!common,]

	ulocs = locs[!common]

	r2 = rgenotypes
	r2[!is.element(r2, c(0,1,2))] <- NA
	af = rowMeans(r2, na.rm=TRUE) / 2



	fams = union(ibs$ID1, ibs$ID2)


	# guess a conservative sample-size as this goes lower we are more likely to see significance
	sample_size = ncol(rgenotypes) # - length(fams) / 5
	message(paste0('sample-size:', sample_size))

	write(sprintf("chrom\tstart\tend\tp\tprop_w_alt\tsum(g)\tk.eff.sum\taf"), stdout())
	for(isnp in 1:nrow(rgenotypes)){
	cpos = strsplit(ulocs[isnp], ":", fixed=TRUE)[[1]]
	chrom = cpos[1]
	if(chrom == "23"){ next }
	position = as.integer(cpos[2]); var.counts = NULL

	# uses 3 for missing data.
	bad = sum(c(rgenotypes[isnp,]) == 3)
	if(bad > 10){ next }
	if(af[isnp] < 0.5){
	var.counts = as.integer(c(rgenotypes[isnp,]) > 0 & c(rgenotypes[isnp,]) != 3)
	} else {
	# alleles are switched so 0,1 is rare state
	var.counts = as.integer(c(rgenotypes[isnp,]) < 2)
	}
	# never seen
	if(sum(var.counts) == 0){ next }
	#if(sum(var.counts) < 4){ next }
	# very common
	if(mean(var.counts) > 0.2){ next }
	names(var.counts) = sample.ids

	# TODO: get AF from dbsnp.
	population.af=sum(var.counts) / length(var.counts)
	# at most, 1 out of 100 people have the variant. in the general pop.
	population.freq = 0.02 # this should give conservative values for result.

	s = k.eff.sum(ibs, var.counts, require.both=TRUE)
	if(!is.na(s)){
	# no family effect
	#if(s == sum(var.counts)){ next }
	# how to set # trials.
	pv = pbinom(s - 1, sample_size, population.freq, lower.tail=FALSE)
	if(pv > 1e-6){ next }
	write(sprintf("chr%s\t%i\t%i\t%.4g\t%.3f\t%d\t%.2f\t%.3f",
	chrom, position - 1, position,
	pv, sum(var.counts)/length(var.counts),
	sum(var.counts), s,
	min(af[isnp], 1-af[isnp])), stdout())
	}
	}