sashagusev · September 17, 2024 03:42
diff --git a/ancientDNA_selection_sim.R b/ancientDNA_selection_sim.R
 set.seed(42)

 # Library for GLMM computations
 library(PQLseq2)

 quiet <- function(x) { 
  sink(tempfile()) 
  on.exit(sink()) 
  invisible(force(x)) 
 } 

 # We will simulate two populations with drift
 # Simulation for the past 10k years
 gens = as.integer(10e3/25)
 # Use 500 neutral SNPs for the GRM calculation
 M = 500
 # No. of SNPs to test under selection
 M_selected = 100
 # 100 individuals in each population
 samp_0 = 100
 samp_1 = 100
 # 50% MAF in the starting population
 pstart = 0.5
 # ---
 # Simulate Ne=10k and very large
 for ( Ne in c(10e3,1e6) ) {
  cat("\n\nSimulating Ne:",Ne,'\n')
  
  # Compute drift variance
  drift_var = pstart * (1-pstart) * (gens / (2*Ne))
  # First population
  p0 = rep(pstart,M)
  # Second population
  p1 = p0 + rnorm(M,0,sqrt(drift_var))
  p1[ p1 < 0 ] = 0
  p1[ p1 > 1 ] = 1
  # Matrices for the null SNPs
  snp0 = matrix(NA,nrow=samp_0,ncol=M)
  snp1 = matrix(NA,nrow=samp_1,ncol=M)
  
  for ( i in 1:M ) {
    # generate samples from the two populations
    snp0[,i] = rbinom(samp_0,2,p0[i])
    snp1[,i] = rbinom(samp_1,2,p1[i])
  }
  
  # compute the GRM
  genos = scale(rbind(snp0,snp1))
  grm = genos %*% t(genos) / (M-1)
  # compute PC1
  pc1 = prcomp(t(genos) )$rotation[,1]
  # check correlation with pop label
  pop_lbl = c(rep(0,samp_0),rep(1,samp_1))
  cat("Correlation w/ PC1",cor(pc1,pop_lbl),'\n')
  
  # --- Now generate new SNPs under selection and test them
  # Selection coefficients
  for ( s in c(0,0.001,0.002) ) {
    # compute target_maf for this s iteratively (there has to be a derivation for this)
    p_trajectory = rep(NA,gens)
    p_trajectory[1] = pstart
    for ( g in 2:gens ) {
      p_trajectory[g] = p_trajectory[g-1] + s*p_trajectory[g-1]*(1-p_trajectory[g-1])/(1+2*s*p_trajectory[g-1])
    }
    ##plot(1:gens,p_trajectory,type="l",ylim=c(0,0.5),las=1)
    target_maf = tail(p_trajectory,1)
    cat("---\ns =" , s , ", starting MAF =", pstart , ", ending MAF =" , target_maf , '\n')
    
    p0 = rep(0.5,M_selected)
    p1 = target_maf + rnorm(M,0,sqrt(drift_var))
    p1[ p1 < 0 ] = 0
    p1[ p1 > 1 ] = 1
    
    chisq_waples = rep(NA,M_selected)
    z_lm = rep(NA,M_selected)
    z_lm_pc = rep(NA,M_selected)
    z_glm = rep(NA,M_selected)
    z_glm_pc = rep(NA,M_selected)    
    z_glmm = rep(NA,M_selected)
    z_glmm_pc = rep(NA,M_selected)
    
    for ( i in 1:M_selected ) {
      # test with GLMM model
      cur_snp0 = rbinom(samp_0,2,p0[i])
      cur_snp1 = rbinom(samp_1,2,p1[i])
      test_y = matrix(c(cur_snp0,cur_snp1),nrow=1)
      rownames(test_y) = "snp1"
      test_total = matrix(rep(2,length(test_y)),nrow=1)
      # test for association with number of generations apart
      test_x = pop_lbl*gens
      
      # test with standard regression ( should be inflated due to drift )
      reg = summary(lm(c(test_y) ~ test_x))
      z_lm[i] = reg$coef[2,3]
      # test standard regression with pc covariate
      reg = summary(lm(c(test_y) ~ test_x + pc1))
      z_lm_pc[i] = reg$coef[2,3]

      # test with standard glm ( should be inflated due to drift )
      reg = summary(glm( cbind(c(test_y),2-c(test_y)) ~ test_x,family="binomial"))
      z_glm[i] = reg$coef[2,3]
      # test with standard glm with pc covariate
      reg = summary(glm( cbind(c(test_y),2-c(test_y)) ~ test_x + pc1 ,family="binomial"))
      z_glm_pc[i] = reg$coef[2,3]
      
      # test with a GLMM
      # hide outputs
      reg = quiet(pqlseq2( test_y , test_x , grm , lib_size = test_total , filter=F , verbose=F , model="BMM" ))
      z_glmm[i] = reg$beta / reg$se_beta
      
      # compute the Waples statistic (assuming we know Ne)
      cur_p0 = mean(cur_snp0/2)
      cur_p1 = mean(cur_snp1/2)
      cur_drift_var = cur_p1 * (1-cur_p1) * ( 1/(2*samp_0) + 1/(2*samp_1) + (gens / (2*Ne))*(1-1/(2*samp_1)) )
      chisq_waples[i] = (cur_p1 - cur_p0)^2 / ( cur_drift_var )
    }
    
    cat("Power Waples",mean(chisq_waples > 1.96^2),'\n')
    cat("Power LM",mean(z_lm^2 > 1.96^2),'\n')
    cat("Power LM with PCs",mean(z_lm_pc^2 > 1.96^2),'\n')
    cat("Power GLM",mean(z_glm^2 > 1.96^2),'\n')
    cat("Power GLM with PCs",mean(z_glm_pc^2 > 1.96^2),'\n')    
    cat("Power GLLM (Akbari etal. method)",mean(z_glmm^2 > 1.96^2),'\n')
  }
 }

 # Expected Output:
 # Simulating Ne: 10000 
 # Correlation w/ PC1 -0.8192212 
 # ---
 #   s = 0 , starting MAF = 0.5 , ending MAF = 0.5 
 # Power Waples 0.05 
 # Power LM 0.24 
 # Power LM with PCs 0.15 
 # Power GLM 0.24 
 # Power GLM with PCs 0.14 
 # Power GLLM (Akbari etal. method) 0.22 
 # ---
 #   s = 0.001 , starting MAF = 0.5 , ending MAF = 0.5983469 
 # Power Waples 0.22 
 # Power LM 0.52 
 # Power LM with PCs 0.26 
 # Power GLM 0.52 
 # Power GLM with PCs 0.23 
 # Power GLLM (Akbari etal. method) 0.51 
 # ---
 #   s = 0.002 , starting MAF = 0.5 , ending MAF = 0.6891725 
 # Power Waples 0.64 
 # Power LM 0.87 
 # Power LM with PCs 0.56 
 # Power GLM 0.87 
 # Power GLM with PCs 0.55 
 # Power GLLM (Akbari etal. method) 0.84 
 # 
 # 
 # Simulating Ne: 1e+06 
 # Correlation w/ PC1 0.02093607 
 # ---
 #   s = 0 , starting MAF = 0.5 , ending MAF = 0.5 
 # Power Waples 0.05 
 # Power LM 0.05 
 # Power LM with PCs 0.05 
 # Power GLM 0.05 
 # Power GLM with PCs 0.05 
 # Power GLLM (Akbari etal. method) 0.05 
 # ---
 #   s = 0.001 , starting MAF = 0.5 , ending MAF = 0.5983469 
 # Power Waples 0.64 
 # Power LM 0.66 
 # Power LM with PCs 0.66 
 # Power GLM 0.64 
 # Power GLM with PCs 0.64 
 # Power GLLM (Akbari etal. method) 0.61 
 # ---
 #   s = 0.002 , starting MAF = 0.5 , ending MAF = 0.6891725 
 # Power Waples 0.95 
 # Power LM 0.95 
 # Power LM with PCs 0.95 
 # Power GLM 0.94 
 # Power GLM with PCs 0.94 
 # Power GLLM (Akbari etal. method) 0.94
	set.seed(42)

	# Library for GLMM computations
	library(PQLseq2)

	quiet <- function(x) {
	sink(tempfile())
	on.exit(sink())
	invisible(force(x))
	}

	# We will simulate two populations with drift
	# Simulation for the past 10k years
	gens = as.integer(10e3/25)
	# Use 500 neutral SNPs for the GRM calculation
	M = 500
	# No. of SNPs to test under selection
	M_selected = 100
	# 100 individuals in each population
	samp_0 = 100
	samp_1 = 100
	# 50% MAF in the starting population
	pstart = 0.5
	# ---
	# Simulate Ne=10k and very large
	for ( Ne in c(10e3,1e6) ) {
	cat("\n\nSimulating Ne:",Ne,'\n')

	# Compute drift variance
	drift_var = pstart * (1-pstart) * (gens / (2*Ne))
	# First population
	p0 = rep(pstart,M)
	# Second population
	p1 = p0 + rnorm(M,0,sqrt(drift_var))
	p1[ p1 < 0 ] = 0
	p1[ p1 > 1 ] = 1
	# Matrices for the null SNPs
	snp0 = matrix(NA,nrow=samp_0,ncol=M)
	snp1 = matrix(NA,nrow=samp_1,ncol=M)

	for ( i in 1:M ) {
	# generate samples from the two populations
	snp0[,i] = rbinom(samp_0,2,p0[i])
	snp1[,i] = rbinom(samp_1,2,p1[i])
	}

	# compute the GRM
	genos = scale(rbind(snp0,snp1))
	grm = genos %*% t(genos) / (M-1)
	# compute PC1
	pc1 = prcomp(t(genos) )$rotation[,1]
	# check correlation with pop label
	pop_lbl = c(rep(0,samp_0),rep(1,samp_1))
	cat("Correlation w/ PC1",cor(pc1,pop_lbl),'\n')

	# --- Now generate new SNPs under selection and test them
	# Selection coefficients
	for ( s in c(0,0.001,0.002) ) {
	# compute target_maf for this s iteratively (there has to be a derivation for this)
	p_trajectory = rep(NA,gens)
	p_trajectory[1] = pstart
	for ( g in 2:gens ) {
	p_trajectory[g] = p_trajectory[g-1] + sp_trajectory[g-1](1-p_trajectory[g-1])/(1+2sp_trajectory[g-1])
	}
	##plot(1:gens,p_trajectory,type="l",ylim=c(0,0.5),las=1)
	target_maf = tail(p_trajectory,1)
	cat("---\ns =" , s , ", starting MAF =", pstart , ", ending MAF =" , target_maf , '\n')

	p0 = rep(0.5,M_selected)
	p1 = target_maf + rnorm(M,0,sqrt(drift_var))
	p1[ p1 < 0 ] = 0
	p1[ p1 > 1 ] = 1

	chisq_waples = rep(NA,M_selected)
	z_lm = rep(NA,M_selected)
	z_lm_pc = rep(NA,M_selected)
	z_glm = rep(NA,M_selected)
	z_glm_pc = rep(NA,M_selected)
	z_glmm = rep(NA,M_selected)
	z_glmm_pc = rep(NA,M_selected)

	for ( i in 1:M_selected ) {
	# test with GLMM model
	cur_snp0 = rbinom(samp_0,2,p0[i])
	cur_snp1 = rbinom(samp_1,2,p1[i])
	test_y = matrix(c(cur_snp0,cur_snp1),nrow=1)
	rownames(test_y) = "snp1"
	test_total = matrix(rep(2,length(test_y)),nrow=1)
	# test for association with number of generations apart
	test_x = pop_lbl*gens

	# test with standard regression ( should be inflated due to drift )
	reg = summary(lm(c(test_y) ~ test_x))
	z_lm[i] = reg$coef[2,3]
	# test standard regression with pc covariate
	reg = summary(lm(c(test_y) ~ test_x + pc1))
	z_lm_pc[i] = reg$coef[2,3]

	# test with standard glm ( should be inflated due to drift )
	reg = summary(glm( cbind(c(test_y),2-c(test_y)) ~ test_x,family="binomial"))
	z_glm[i] = reg$coef[2,3]
	# test with standard glm with pc covariate
	reg = summary(glm( cbind(c(test_y),2-c(test_y)) ~ test_x + pc1 ,family="binomial"))
	z_glm_pc[i] = reg$coef[2,3]

	# test with a GLMM
	# hide outputs
	reg = quiet(pqlseq2( test_y , test_x , grm , lib_size = test_total , filter=F , verbose=F , model="BMM" ))
	z_glmm[i] = reg$beta / reg$se_beta

	# compute the Waples statistic (assuming we know Ne)
	cur_p0 = mean(cur_snp0/2)
	cur_p1 = mean(cur_snp1/2)
	cur_drift_var = cur_p1 * (1-cur_p1) * ( 1/(2samp_0) + 1/(2samp_1) + (gens / (2Ne))(1-1/(2*samp_1)) )
	chisq_waples[i] = (cur_p1 - cur_p0)^2 / ( cur_drift_var )
	}

	cat("Power Waples",mean(chisq_waples > 1.96^2),'\n')
	cat("Power LM",mean(z_lm^2 > 1.96^2),'\n')
	cat("Power LM with PCs",mean(z_lm_pc^2 > 1.96^2),'\n')
	cat("Power GLM",mean(z_glm^2 > 1.96^2),'\n')
	cat("Power GLM with PCs",mean(z_glm_pc^2 > 1.96^2),'\n')
	cat("Power GLLM (Akbari etal. method)",mean(z_glmm^2 > 1.96^2),'\n')
	}
	}

	# Expected Output:
	# Simulating Ne: 10000
	# Correlation w/ PC1 -0.8192212
	# ---
	# s = 0 , starting MAF = 0.5 , ending MAF = 0.5
	# Power Waples 0.05
	# Power LM 0.24
	# Power LM with PCs 0.15
	# Power GLM 0.24
	# Power GLM with PCs 0.14
	# Power GLLM (Akbari etal. method) 0.22
	# ---
	# s = 0.001 , starting MAF = 0.5 , ending MAF = 0.5983469
	# Power Waples 0.22
	# Power LM 0.52
	# Power LM with PCs 0.26
	# Power GLM 0.52
	# Power GLM with PCs 0.23
	# Power GLLM (Akbari etal. method) 0.51
	# ---
	# s = 0.002 , starting MAF = 0.5 , ending MAF = 0.6891725
	# Power Waples 0.64
	# Power LM 0.87
	# Power LM with PCs 0.56
	# Power GLM 0.87
	# Power GLM with PCs 0.55
	# Power GLLM (Akbari etal. method) 0.84
	#
	#
	# Simulating Ne: 1e+06
	# Correlation w/ PC1 0.02093607
	# ---
	# s = 0 , starting MAF = 0.5 , ending MAF = 0.5
	# Power Waples 0.05
	# Power LM 0.05
	# Power LM with PCs 0.05
	# Power GLM 0.05
	# Power GLM with PCs 0.05
	# Power GLLM (Akbari etal. method) 0.05
	# ---
	# s = 0.001 , starting MAF = 0.5 , ending MAF = 0.5983469
	# Power Waples 0.64
	# Power LM 0.66
	# Power LM with PCs 0.66
	# Power GLM 0.64
	# Power GLM with PCs 0.64
	# Power GLLM (Akbari etal. method) 0.61
	# ---
	# s = 0.002 , starting MAF = 0.5 , ending MAF = 0.6891725
	# Power Waples 0.95
	# Power LM 0.95
	# Power LM with PCs 0.95
	# Power GLM 0.94
	# Power GLM with PCs 0.94
	# Power GLLM (Akbari etal. method) 0.94