Skip to content

Instantly share code, notes, and snippets.

@sashagusev
Last active September 18, 2024 21:20
Show Gist options
  • Save sashagusev/a217b5ad13208d045a9d7d7c040b7593 to your computer and use it in GitHub Desktop.
Save sashagusev/a217b5ad13208d045a9d7d7c040b7593 to your computer and use it in GitHub Desktop.
set.seed(42)
# Library for GLMM computations
library(PQLseq2)
# Library for mixed model / Wald test
library("GMMAT")
# convenience function to suppress prints
quiet <- function(x) {
sink(tempfile())
on.exit(sink())
invisible(force(x))
}
RUN_LMM = TRUE
# We will simulate two populations with drift
# Simulation for the past 10k years / 400 generations
gens = as.integer(10e3/25)
# Use neutral SNPs for the GRM calculation
M = 1e3
# No. of SNPs to test under selection
M_selected = 50
# 100 individuals in each population
samp_0 = 100
samp_1 = 100
# ---
# Simulate different Nes for different levels of drift
par(mfrow=c(1,3))
for ( Ne in c(1e3,10e3,1e6) ) {
cat("\n\nSimulating Ne:",Ne,'\n')
# First population
p0 = runif(M,0.1,0.9)
# Compute drift variance
drift_var = p0 * (1-p0) * (gens / (2*Ne))
# Second population
p1 = p0 + rnorm(M,0,sqrt(drift_var))
p1[ p1 < 0 ] = 0
p1[ p1 > 1 ] = 1
# Matrices for the null SNPs
snp0 = matrix(NA,nrow=samp_0,ncol=M)
snp1 = matrix(NA,nrow=samp_1,ncol=M)
for ( i in 1:M ) {
# generate samples from the two populations
snp0[,i] = rbinom(samp_0,2,p0[i])
snp1[,i] = rbinom(samp_1,2,p1[i])
}
# center and scale the genotypes
# genos = scale(rbind(snp0,snp1))
# center and scale using frequencies (as in paper)
genos = rbind(snp0,snp1)
mafs = apply(genos,2,mean)/2
for ( i in 1:M ) {
genos[,i] = (genos[,i] - 2*mafs[i])/sqrt(2*mafs[i]*(1-mafs[i]))
}
# compute the GRM
grm = genos %*% t(genos) / M
# add a small ridge to make this matrix non-singular
grm = 0.9 * grm + 0.1 * diag(nrow(grm))
# As a sanity check, compute PCs
eigenvec = prcomp( grm )$rotation
# check correlation with pop label
pop_lbl = c(rep(0,samp_0),rep(1,samp_1))
cat("Correlation w/ PC1",cor(eigenvec[,1],pop_lbl),'\n')
# --- Now generate new SNPs under selection and test them
# Selection coefficients
for ( s in c(0,0.002) ) {
cat("---\ns =" , s , '\n')
p0 = rep(NA,M_selected)
p1 = rep(NA,M_selected)
chisq_waples = rep(NA,M_selected)
z_lm = rep(NA,M_selected)
z_glmm = rep(NA,M_selected)
# test for association with number of generations apart (in this case it is just population label times number of generations)
test_population_time = pop_lbl*gens
# let's add some noise to the dates make it more realistic
test_population_time = test_population_time + rnorm(length(test_population_time),0,sqrt(0.1*var(test_population_time)))
selected_snps = matrix(NA,nrow=samp_0+samp_1,ncol=M_selected)
for ( i in 1:M_selected ) {
# compute target_maf for this s iteratively (there has to be a derivation for this)
p0[i] = runif(1,0.1,0.9)
p_trajectory = rep(NA,gens)
p_trajectory[1] = p0[i]
for ( g in 2:gens ) {
p_trajectory[g] = p_trajectory[g-1] + s*p_trajectory[g-1]*(1-p_trajectory[g-1])/(1+2*s*p_trajectory[g-1])
}
p1[i] = tail(p_trajectory,1)
# add drift
drift_var = p0[i] * (1-p0[i]) * (gens / (2*Ne))
# Second population
p1[i] = p1[i] + rnorm(1,0,sqrt(drift_var))
if(p1[i] > 1) p1[i] = 1
if(p1[i] < 0) p1[i] = 0
# Sample the variants
cur_snp0 = rbinom(samp_0,2,p0[i])
cur_snp1 = rbinom(samp_1,2,p1[i])
test_snp_alleles = matrix(c(cur_snp0,cur_snp1),nrow=1)
selected_snps[,i] = test_snp_alleles
rownames(test_snp_alleles) = "snp1"
test_total_alleles = matrix(rep(2,length(test_snp_alleles)),nrow=1)
# test with a standard LM (should be inflated by drift)
reg = summary(lm( c(test_population_time) ~ c(test_snp_alleles) ))
z_lm[i] = reg$coef[2,3]
# test with a GLMM
# hide outputs
reg = quiet(pqlseq2( Y=test_snp_alleles , x=test_population_time , K=grm , lib_size = test_total_alleles , filter=F , verbose=F , model="BMM" , check_K=T , fix_h2eq1=T ))
# Same run but with the original pqlseq
# reg = quiet(pqlseq( RawCountDataSet=test_snp_alleles , Phenotypes=test_population_time , RelatednessMatrix=grm , LibSize = test_total_alleles , filtering=F , fit.model="BMM" ))
if ( is.na(reg$h2) || reg$h2 == 0 || reg$converged == FALSE ) {
z_glmm[i] = NA
} else {
z_glmm[i] = reg$beta / reg$se_beta
}
# compute the standard Waples statistic (assuming we know Ne and starting frequency perfectly)
cur_p0 = mean(cur_snp0/2)
cur_p1 = mean(cur_snp1/2)
waples_drift_var = p0[i] * (1-p0[i]) * ( 1/(2*samp_1) + (gens / (2*Ne))*(1-1/(2*samp_1)) )
chisq_waples[i] = (cur_p1 - cur_p0)^2 / ( waples_drift_var )
}
if ( s == 0 ) {
plot(p0,p1,xlim=c(0,1),ylim=c(0,1),main=paste("Ne = ",Ne,sep=''),las=1)
abline(0,1,lty=3)
} else {
points(p0,p1,col="red")
}
if ( RUN_LMM ) {
# Prepare data for GMMAT/LMM analysis, note a temp file has to be saved here for the method to load it
rownames(selected_snps) = paste("sample_",1:nrow(selected_snps),sep='')
colnames(selected_snps) = paste("snp_",1:ncol(selected_snps),sep='')
write.table( t(selected_snps) , row.names=T, col.names=F, sep='\t' , quote=F , file="tmp.geno.txt")
gmmat_df = data.frame("disease" = test_population_time , "id" = rownames(selected_snps) )
rownames(grm) = rownames(selected_snps)
colnames(grm) = rownames(selected_snps)
# Run GMMAT analysis
reg = glmm.wald( fixed = disease ~ 1 , data = gmmat_df , id = "id" , kins = grm , family = gaussian(link = "identity") , infile = "tmp.geno.txt" , is.dosage=TRUE, snps=colnames(selected_snps) , verbose=FALSE )
z_lmm = reg$BETA / reg$SE
} else {
z_lmm = NA
}
cat("GLMM NaN Fail",mean(is.na(z_glmm)),'\n')
cat("Power Waples",mean(chisq_waples > 1.96^2),'\n')
cat("Power LM",mean(z_lm^2 > 1.96^2),'\n')
cat("Power GLMM",mean(z_glmm^2 > 1.96^2,na.rm=T),'\n')
cat("Power GLMM (no NAs)",sum(z_glmm^2 > 1.96,na.rm=T)/length(z_glmm),'\n')
cat("Power LMM",mean(z_lmm^2 > 1.96^2),'\n')
}
}
# Expected Output
#
#
# Simulating Ne: 1000
# Correlation w/ PC1 0.9959432
# ---
# s = 0
# GLMM NaN Fail 0.38
# Power Waples 0.02
# Power LM 0.58
# Power GLMM 0.5483871
# Power GLMM (no NAs) 0.4
# Power LMM 0.04
# ---
# s = 0.002
# GLMM NaN Fail 0.36
# Power Waples 0.12
# Power LM 0.84
# Power GLMM 0.71875
# Power GLMM (no NAs) 0.5
# Power LMM 0.12
#
#
# Simulating Ne: 10000
# Correlation w/ PC1 0.929122
# ---
# s = 0
# GLMM NaN Fail 0.56
# Power Waples 0.08
# Power LM 0.28
# Power GLMM 0.2272727
# Power GLMM (no NAs) 0.2
# Power LMM 0.08
# ---
# s = 0.002
# GLMM NaN Fail 0.52
# Power Waples 0.6
# Power LM 0.78
# Power GLMM 0.7916667
# Power GLMM (no NAs) 0.4
# Power LMM 0.54
#
#
# Simulating Ne: 1e+06
# Correlation w/ PC1 -0.02720476
# ---
# s = 0
# GLMM NaN Fail 0.5
# Power Waples 0.1
# Power LM 0.06
# Power GLMM 0.04
# Power GLMM (no NAs) 0.04
# Power LMM 0.06
# ---
# s = 0.002
# GLMM NaN Fail 0.4
# Power Waples 0.96
# Power LM 0.88
# Power GLMM 0.8333333
# Power GLMM (no NAs) 0.56
# Power LMM 0.88
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment