Created
September 20, 2024 18:45
-
-
Save sashagusev/06764448e14b6f6246c3f9364e7dbdb3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set.seed(1) | |
# Library for GLMM computations | |
library(PQLseq2) | |
# Library for mixed model / Wald test | |
library("GMMAT") | |
# convenience function to suppress prints | |
quiet <- function(x) { | |
sink(tempfile()) | |
on.exit(sink()) | |
invisible(force(x)) | |
} | |
RUN_LMM = TRUE | |
RUN_LMM2 = TRUE | |
RUN_GLMM = TRUE | |
# Simulation for the past 10k years / 400 generations | |
gens = as.integer(10e3/25) | |
# No. of neutral SNPs for the GRM calculation | |
M = 1e3 | |
# No. of SNPs to test under selection | |
M_selected = 200 | |
# No. of individuals to sample | |
samp_n = 200 | |
# --- | |
# Simulate different Nes for different levels of drift | |
for ( Ne in c(1e3,10e3,1e9) ) { | |
# Set the selection parameter (0=neutral) | |
for ( s in c(0,0.002) ) { | |
cat("\nSimulating Ne:",Ne,"and s:",s,'\n') | |
# Neutral variants | |
neutral_mafs = matrix(NA,nrow=gens,ncol=M) | |
neutral_mafs[1,] = runif(M,0.1,0.9) | |
for ( g in 2:gens ) { | |
keep = neutral_mafs[g-1,] > 0 & neutral_mafs[g-1,] < 1 | |
# Add drift | |
neutral_mafs[g,keep] = neutral_mafs[g-1,keep] + rnorm(sum(keep),0,sqrt( neutral_mafs[g-1,keep] * (1-neutral_mafs[g-1,keep]) / (2*Ne) )) | |
neutral_mafs[ g:gens , which(neutral_mafs[g,] < 0) ] = 0 | |
neutral_mafs[ g:gens , which(neutral_mafs[g,] > 1) ] = 1 | |
} | |
neutral_mafs[ neutral_mafs < 0 ] = 0 | |
neutral_mafs[ neutral_mafs > 1 ] = 1 | |
# Selected variants | |
selected_mafs = matrix(NA,nrow=gens,ncol=M_selected) | |
selected_mafs[1,] = runif(M_selected,0.1,0.9) | |
for ( g in 2:gens ) { | |
keep = selected_mafs[g-1,] > 0 & selected_mafs[g-1,] < 1 | |
# Add drift | |
prev_maf = selected_mafs[g-1,keep] | |
selected_mafs[g,keep] = prev_maf + s*prev_maf*(1-prev_maf)/(1+2*s*prev_maf) + rnorm(sum(keep),0,sqrt( prev_maf * (1-prev_maf) / (2*Ne) )) | |
selected_mafs[ g:gens , which(selected_mafs[g,] < 0) ] = 0 | |
selected_mafs[ g:gens , which(selected_mafs[g,] > 1) ] = 1 | |
} | |
selected_mafs[ selected_mafs < 0 ] = 0 | |
selected_mafs[ selected_mafs > 1 ] = 1 | |
# set the sampling time (when each aDNA sample was collected) | |
sample_time = sample(1:gens,samp_n) | |
# generate neutral variants | |
# Matrix for the null SNPs | |
snp_neutral = matrix(NA,nrow=samp_n,ncol=M) | |
for ( i in 1:M ) { | |
# for each individual, take the allele frequeny from *their* sampling time | |
snp_neutral[,i] = rbinom(samp_n,2,neutral_mafs[sample_time,i]) | |
} | |
# generate variants under selection | |
snp_selected = matrix(NA,nrow=samp_n,ncol=M_selected) | |
for ( i in 1:M_selected ) { | |
# for each individual, take the allele frequeny from *their* sampling time | |
snp_selected[,i] = rbinom(samp_n,2,selected_mafs[sample_time,i]) | |
} | |
# --- make the neutral snp GRM | |
# center and scale the genotypes | |
# genos = scale(rbind(snp0,snp1)) | |
# center and scale using frequencies | |
genos = snp_neutral | |
mafs = apply(genos,2,mean)/2 | |
for ( i in 1:M ) { | |
genos[,i] = (genos[,i] - 2*mafs[i])/sqrt(2*mafs[i]*(1-mafs[i])) | |
} | |
# compute the GRM | |
grm = genos %*% t(genos) / M | |
# add a small ridge to make this matrix non-singular | |
grm = 0.9 * grm + 0.1 * diag(nrow(grm)) | |
# As a sanity check, compute PCs | |
eigenvec = prcomp( grm )$rotation | |
# check correlation with sampling time | |
cat("Correlation w/ PC1",cor(eigenvec[,1],sample_time),'\n') | |
# Selection scan | |
test_population_time = sample_time + rnorm(length(sample_time),0,sqrt(0.1*var(sample_time))) | |
z_lm = rep(NA,M_selected) | |
z_glmm = rep(NA,M_selected) | |
# --- Now test the SNPs under selection | |
# Selection coefficients | |
for ( i in 1:M_selected ) { | |
test_snp_alleles = matrix(snp_selected[,i],nrow=1) | |
rownames(test_snp_alleles) = "snp1" | |
test_total_alleles = matrix(rep(2,length(test_snp_alleles)),nrow=1) | |
# test with a standard LM (should be inflated by drift) | |
reg = summary(lm( test_population_time ~ snp_selected[,i] )) | |
z_lm[i] = reg$coef[2,3] | |
if ( RUN_GLMM ) { | |
# test with a GLMM | |
# hide outputs | |
reg = quiet(pqlseq2( Y=test_snp_alleles , x=test_population_time , K=grm , lib_size = test_total_alleles , filter=F , verbose=F , model="BMM" , check_K=T , fix_h2eq1=T )) | |
# Same run but with the original pqlseq | |
# reg = quiet(pqlseq( RawCountDataSet=test_snp_alleles , Phenotypes=test_population_time , RelatednessMatrix=grm , LibSize = test_total_alleles , filtering=F , fit.model="BMM" )) | |
if ( is.na(reg$h2) || reg$h2 == 0 || reg$converged == FALSE ) { | |
z_glmm[i] = NA | |
} else { | |
z_glmm[i] = reg$beta / reg$se_beta | |
} | |
} | |
} | |
if ( RUN_LMM ) { | |
# Prepare data for GMMAT/LMM analysis, note a temp file has to be saved here for the method to load it | |
rownames(snp_selected) = paste("sample_",1:nrow(snp_selected),sep='') | |
colnames(snp_selected) = paste("snp_",1:ncol(snp_selected),sep='') | |
write.table( t(snp_selected) , row.names=T, col.names=F, sep='\t' , quote=F , file="tmp.geno.txt") | |
gmmat_df = data.frame("disease" = test_population_time , "id" = rownames(snp_selected) ) | |
rownames(grm) = rownames(snp_selected) | |
colnames(grm) = rownames(snp_selected) | |
# Run GMMAT analysis | |
reg = glmm.wald( fixed = disease ~ 1 , data = gmmat_df , id = "id" , kins = grm , family = gaussian(link = "identity") , infile = "tmp.geno.txt" , is.dosage=TRUE, snps=colnames(snp_selected) , verbose=FALSE ) | |
z_lmm = reg$BETA / reg$SE | |
} else { | |
z_lmm = NA | |
} | |
if ( RUN_LMM2 ) { | |
# Prepare data for GMMAT/LMM analysis, note a temp file has to be saved here for the method to load it | |
z_lmm2 = rep(NA,ncol(snp_selected)) | |
rownames(snp_selected) = paste("sample_",1:nrow(snp_selected),sep='') | |
colnames(snp_selected) = paste("snp_",1:ncol(snp_selected),sep='') | |
rownames(grm) = rownames(snp_selected) | |
colnames(grm) = rownames(snp_selected) | |
time_pseudo_snp = 2*(test_population_time - min(test_population_time)) / (max(test_population_time) - min(test_population_time)) | |
write.table( t(time_pseudo_snp) , row.names="snp", col.names=F, sep='\t' , quote=F , file="tmp.geno.txt") | |
for ( i in 1:ncol(snp_selected) ) { | |
gmmat_df = data.frame("disease" = scale(snp_selected[,i]) , "id" = rownames(snp_selected) ) | |
# Run GMMAT analysis | |
reg = glmm.wald( fixed = disease ~ 1 , data = gmmat_df , id = "id" , kins = grm , is.dosage=TRUE , family = gaussian(link = "identity") , infile = "tmp.geno.txt" , snps="snp" , verbose=FALSE ) | |
z_lmm2[i] = reg$BETA / reg$SE | |
} | |
} else { | |
z_lmm2 = NA | |
} | |
cat("GLMM NaN Fail",mean(is.na(z_glmm)),'\n') | |
cat("Power LM",mean(z_lm^2 > 1.96^2),'\n') | |
cat("Power GLMM (geno ~ time)",mean(z_glmm^2 > 1.96^2,na.rm=T),'\n') | |
cat("Power LMM (geno ~ time)",mean(z_lmm2^2 > 1.96^2),'\n') | |
cat("Power LMM (time ~ geno)",mean(z_lmm^2 > 1.96^2),'\n') | |
cat( paste(mean(z_lm^2 > 1.96^2) , mean(z_glmm^2 > 1.96^2,na.rm=T) , mean(z_lmm2^2 > 1.96^2) , mean(z_lmm^2 > 1.96^2) ,sep=',') , '\n' ) | |
} | |
} | |
# Expected Output | |
# Simulating Ne: 1000 and s: 0 | |
# Correlation w/ PC1 -0.9767579 | |
# GLMM NaN Fail 0.29 | |
# Power LM 0.53 | |
# Power GLMM (geno ~ time) 0.4788732 | |
# Power LMM (geno ~ time) 0.34 | |
# Power LMM (time ~ geno) 0.115 | |
# 0.53,0.47887323943662,0.34,0.115 | |
# | |
# Simulating Ne: 1000 and s: 0.002 | |
# Correlation w/ PC1 0.9802674 | |
# GLMM NaN Fail 0.365 | |
# Power LM 0.68 | |
# Power GLMM (geno ~ time) 0.5433071 | |
# Power LMM (geno ~ time) 0.48 | |
# Power LMM (time ~ geno) 0.135 | |
# 0.68,0.543307086614173,0.48,0.135 | |
# | |
# Simulating Ne: 10000 and s: 0 | |
# Correlation w/ PC1 -0.7918637 | |
# GLMM NaN Fail 0.53 | |
# Power LM 0.155 | |
# Power GLMM (geno ~ time) 0.1595745 | |
# Power LMM (geno ~ time) 0.145 | |
# Power LMM (time ~ geno) 0.06 | |
# 0.155,0.159574468085106,0.145,0.06 | |
# | |
# Simulating Ne: 10000 and s: 0.002 | |
# Correlation w/ PC1 0.8033265 | |
# GLMM NaN Fail 0.455 | |
# Power LM 0.525 | |
# Power GLMM (geno ~ time) 0.440367 | |
# Power LMM (geno ~ time) 0.5 | |
# Power LMM (time ~ geno) 0.305 | |
# 0.525,0.440366972477064,0.5,0.305 | |
# | |
# Simulating Ne: 1e+09 and s: 0 | |
# Correlation w/ PC1 0.0171917 | |
# GLMM NaN Fail 0.435 | |
# Power LM 0.045 | |
# Power GLMM (geno ~ time) 0.02654867 | |
# Power LMM (geno ~ time) 0.04 | |
# Power LMM (time ~ geno) 0.045 | |
# 0.045,0.0265486725663717,0.04,0.045 | |
# | |
# Simulating Ne: 1e+09 and s: 0.002 | |
# Correlation w/ PC1 -0.0787399 | |
# GLMM NaN Fail 0.505 | |
# Power LM 0.43 | |
# Power GLMM (geno ~ time) 0.4141414 | |
# Power LMM (geno ~ time) 0.43 | |
# Power LMM (time ~ geno) 0.43 | |
# 0.43,0.414141414141414,0.43,0.43 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment