Last active
September 18, 2024 21:20
-
-
Save sashagusev/a217b5ad13208d045a9d7d7c040b7593 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set.seed(42) | |
# Library for GLMM computations | |
library(PQLseq2) | |
# Library for mixed model / Wald test | |
library("GMMAT") | |
# convenience function to suppress prints | |
quiet <- function(x) { | |
sink(tempfile()) | |
on.exit(sink()) | |
invisible(force(x)) | |
} | |
RUN_LMM = TRUE | |
# We will simulate two populations with drift | |
# Simulation for the past 10k years / 400 generations | |
gens = as.integer(10e3/25) | |
# Use neutral SNPs for the GRM calculation | |
M = 1e3 | |
# No. of SNPs to test under selection | |
M_selected = 50 | |
# 100 individuals in each population | |
samp_0 = 100 | |
samp_1 = 100 | |
# --- | |
# Simulate different Nes for different levels of drift | |
par(mfrow=c(1,3)) | |
for ( Ne in c(1e3,10e3,1e6) ) { | |
cat("\n\nSimulating Ne:",Ne,'\n') | |
# First population | |
p0 = runif(M,0.1,0.9) | |
# Compute drift variance | |
drift_var = p0 * (1-p0) * (gens / (2*Ne)) | |
# Second population | |
p1 = p0 + rnorm(M,0,sqrt(drift_var)) | |
p1[ p1 < 0 ] = 0 | |
p1[ p1 > 1 ] = 1 | |
# Matrices for the null SNPs | |
snp0 = matrix(NA,nrow=samp_0,ncol=M) | |
snp1 = matrix(NA,nrow=samp_1,ncol=M) | |
for ( i in 1:M ) { | |
# generate samples from the two populations | |
snp0[,i] = rbinom(samp_0,2,p0[i]) | |
snp1[,i] = rbinom(samp_1,2,p1[i]) | |
} | |
# center and scale the genotypes | |
# genos = scale(rbind(snp0,snp1)) | |
# center and scale using frequencies (as in paper) | |
genos = rbind(snp0,snp1) | |
mafs = apply(genos,2,mean)/2 | |
for ( i in 1:M ) { | |
genos[,i] = (genos[,i] - 2*mafs[i])/sqrt(2*mafs[i]*(1-mafs[i])) | |
} | |
# compute the GRM | |
grm = genos %*% t(genos) / M | |
# add a small ridge to make this matrix non-singular | |
grm = 0.9 * grm + 0.1 * diag(nrow(grm)) | |
# As a sanity check, compute PCs | |
eigenvec = prcomp( grm )$rotation | |
# check correlation with pop label | |
pop_lbl = c(rep(0,samp_0),rep(1,samp_1)) | |
cat("Correlation w/ PC1",cor(eigenvec[,1],pop_lbl),'\n') | |
# --- Now generate new SNPs under selection and test them | |
# Selection coefficients | |
for ( s in c(0,0.002) ) { | |
cat("---\ns =" , s , '\n') | |
p0 = rep(NA,M_selected) | |
p1 = rep(NA,M_selected) | |
chisq_waples = rep(NA,M_selected) | |
z_lm = rep(NA,M_selected) | |
z_glmm = rep(NA,M_selected) | |
# test for association with number of generations apart (in this case it is just population label times number of generations) | |
test_population_time = pop_lbl*gens | |
# let's add some noise to the dates make it more realistic | |
test_population_time = test_population_time + rnorm(length(test_population_time),0,sqrt(0.1*var(test_population_time))) | |
selected_snps = matrix(NA,nrow=samp_0+samp_1,ncol=M_selected) | |
for ( i in 1:M_selected ) { | |
# compute target_maf for this s iteratively (there has to be a derivation for this) | |
p0[i] = runif(1,0.1,0.9) | |
p_trajectory = rep(NA,gens) | |
p_trajectory[1] = p0[i] | |
for ( g in 2:gens ) { | |
p_trajectory[g] = p_trajectory[g-1] + s*p_trajectory[g-1]*(1-p_trajectory[g-1])/(1+2*s*p_trajectory[g-1]) | |
} | |
p1[i] = tail(p_trajectory,1) | |
# add drift | |
drift_var = p0[i] * (1-p0[i]) * (gens / (2*Ne)) | |
# Second population | |
p1[i] = p1[i] + rnorm(1,0,sqrt(drift_var)) | |
if(p1[i] > 1) p1[i] = 1 | |
if(p1[i] < 0) p1[i] = 0 | |
# Sample the variants | |
cur_snp0 = rbinom(samp_0,2,p0[i]) | |
cur_snp1 = rbinom(samp_1,2,p1[i]) | |
test_snp_alleles = matrix(c(cur_snp0,cur_snp1),nrow=1) | |
selected_snps[,i] = test_snp_alleles | |
rownames(test_snp_alleles) = "snp1" | |
test_total_alleles = matrix(rep(2,length(test_snp_alleles)),nrow=1) | |
# test with a standard LM (should be inflated by drift) | |
reg = summary(lm( c(test_population_time) ~ c(test_snp_alleles) )) | |
z_lm[i] = reg$coef[2,3] | |
# test with a GLMM | |
# hide outputs | |
reg = quiet(pqlseq2( Y=test_snp_alleles , x=test_population_time , K=grm , lib_size = test_total_alleles , filter=F , verbose=F , model="BMM" , check_K=T , fix_h2eq1=T )) | |
# Same run but with the original pqlseq | |
# reg = quiet(pqlseq( RawCountDataSet=test_snp_alleles , Phenotypes=test_population_time , RelatednessMatrix=grm , LibSize = test_total_alleles , filtering=F , fit.model="BMM" )) | |
if ( is.na(reg$h2) || reg$h2 == 0 || reg$converged == FALSE ) { | |
z_glmm[i] = NA | |
} else { | |
z_glmm[i] = reg$beta / reg$se_beta | |
} | |
# compute the standard Waples statistic (assuming we know Ne and starting frequency perfectly) | |
cur_p0 = mean(cur_snp0/2) | |
cur_p1 = mean(cur_snp1/2) | |
waples_drift_var = p0[i] * (1-p0[i]) * ( 1/(2*samp_1) + (gens / (2*Ne))*(1-1/(2*samp_1)) ) | |
chisq_waples[i] = (cur_p1 - cur_p0)^2 / ( waples_drift_var ) | |
} | |
if ( s == 0 ) { | |
plot(p0,p1,xlim=c(0,1),ylim=c(0,1),main=paste("Ne = ",Ne,sep=''),las=1) | |
abline(0,1,lty=3) | |
} else { | |
points(p0,p1,col="red") | |
} | |
if ( RUN_LMM ) { | |
# Prepare data for GMMAT/LMM analysis, note a temp file has to be saved here for the method to load it | |
rownames(selected_snps) = paste("sample_",1:nrow(selected_snps),sep='') | |
colnames(selected_snps) = paste("snp_",1:ncol(selected_snps),sep='') | |
write.table( t(selected_snps) , row.names=T, col.names=F, sep='\t' , quote=F , file="tmp.geno.txt") | |
gmmat_df = data.frame("disease" = test_population_time , "id" = rownames(selected_snps) ) | |
rownames(grm) = rownames(selected_snps) | |
colnames(grm) = rownames(selected_snps) | |
# Run GMMAT analysis | |
reg = glmm.wald( fixed = disease ~ 1 , data = gmmat_df , id = "id" , kins = grm , family = gaussian(link = "identity") , infile = "tmp.geno.txt" , is.dosage=TRUE, snps=colnames(selected_snps) , verbose=FALSE ) | |
z_lmm = reg$BETA / reg$SE | |
} else { | |
z_lmm = NA | |
} | |
cat("GLMM NaN Fail",mean(is.na(z_glmm)),'\n') | |
cat("Power Waples",mean(chisq_waples > 1.96^2),'\n') | |
cat("Power LM",mean(z_lm^2 > 1.96^2),'\n') | |
cat("Power GLMM",mean(z_glmm^2 > 1.96^2,na.rm=T),'\n') | |
cat("Power GLMM (no NAs)",sum(z_glmm^2 > 1.96,na.rm=T)/length(z_glmm),'\n') | |
cat("Power LMM",mean(z_lmm^2 > 1.96^2),'\n') | |
} | |
} | |
# Expected Output | |
# | |
# | |
# Simulating Ne: 1000 | |
# Correlation w/ PC1 0.9959432 | |
# --- | |
# s = 0 | |
# GLMM NaN Fail 0.38 | |
# Power Waples 0.02 | |
# Power LM 0.58 | |
# Power GLMM 0.5483871 | |
# Power GLMM (no NAs) 0.4 | |
# Power LMM 0.04 | |
# --- | |
# s = 0.002 | |
# GLMM NaN Fail 0.36 | |
# Power Waples 0.12 | |
# Power LM 0.84 | |
# Power GLMM 0.71875 | |
# Power GLMM (no NAs) 0.5 | |
# Power LMM 0.12 | |
# | |
# | |
# Simulating Ne: 10000 | |
# Correlation w/ PC1 0.929122 | |
# --- | |
# s = 0 | |
# GLMM NaN Fail 0.56 | |
# Power Waples 0.08 | |
# Power LM 0.28 | |
# Power GLMM 0.2272727 | |
# Power GLMM (no NAs) 0.2 | |
# Power LMM 0.08 | |
# --- | |
# s = 0.002 | |
# GLMM NaN Fail 0.52 | |
# Power Waples 0.6 | |
# Power LM 0.78 | |
# Power GLMM 0.7916667 | |
# Power GLMM (no NAs) 0.4 | |
# Power LMM 0.54 | |
# | |
# | |
# Simulating Ne: 1e+06 | |
# Correlation w/ PC1 -0.02720476 | |
# --- | |
# s = 0 | |
# GLMM NaN Fail 0.5 | |
# Power Waples 0.1 | |
# Power LM 0.06 | |
# Power GLMM 0.04 | |
# Power GLMM (no NAs) 0.04 | |
# Power LMM 0.06 | |
# --- | |
# s = 0.002 | |
# GLMM NaN Fail 0.4 | |
# Power Waples 0.96 | |
# Power LM 0.88 | |
# Power GLMM 0.8333333 | |
# Power GLMM (no NAs) 0.56 | |
# Power LMM 0.88 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment