sashagusev · September 18, 2024 21:20
diff --git a/ancient_dna_selection_with_lmm.R b/ancient_dna_selection_with_lmm.R
 set.seed(42)

 # Library for GLMM computations
 library(PQLseq2)
 # Library for mixed model / Wald test
 library("GMMAT")

 # convenience function to suppress prints
 quiet <- function(x) { 
  sink(tempfile()) 
  on.exit(sink()) 
  invisible(force(x)) 
 } 

 RUN_LMM = TRUE

 # We will simulate two populations with drift
 # Simulation for the past 10k years / 400 generations
 gens = as.integer(10e3/25)
 # Use neutral SNPs for the GRM calculation
 M = 1e3
 # No. of SNPs to test under selection
 M_selected = 50
 # 100 individuals in each population
 samp_0 = 100
 samp_1 = 100
 # ---
 # Simulate different Nes for different levels of drift
 par(mfrow=c(1,3))

 for ( Ne in c(1e3,10e3,1e6) ) {
  cat("\n\nSimulating Ne:",Ne,'\n')
  
  # First population
  p0 = runif(M,0.1,0.9)
  # Compute drift variance
  drift_var = p0 * (1-p0) * (gens / (2*Ne))
  # Second population
  p1 = p0 + rnorm(M,0,sqrt(drift_var))
  p1[ p1 < 0 ] = 0
  p1[ p1 > 1 ] = 1
  # Matrices for the null SNPs
  snp0 = matrix(NA,nrow=samp_0,ncol=M)
  snp1 = matrix(NA,nrow=samp_1,ncol=M)
  
  for ( i in 1:M ) {
    # generate samples from the two populations
    snp0[,i] = rbinom(samp_0,2,p0[i])
    snp1[,i] = rbinom(samp_1,2,p1[i])
  }
  
  # center and scale the genotypes
  # genos = scale(rbind(snp0,snp1))
  # center and scale using frequencies (as in paper)
  genos = rbind(snp0,snp1)
  mafs = apply(genos,2,mean)/2
  for ( i in 1:M ) {
    genos[,i] = (genos[,i] - 2*mafs[i])/sqrt(2*mafs[i]*(1-mafs[i]))
  }
  # compute the GRM
  grm = genos %*% t(genos) / M
  # add a small ridge to make this matrix non-singular
  grm = 0.9 * grm + 0.1 * diag(nrow(grm))
  # As a sanity check, compute PCs
  eigenvec = prcomp( grm )$rotation
  
  # check correlation with pop label
  pop_lbl = c(rep(0,samp_0),rep(1,samp_1))
  cat("Correlation w/ PC1",cor(eigenvec[,1],pop_lbl),'\n')
  
  # --- Now generate new SNPs under selection and test them
  # Selection coefficients
  for ( s in c(0,0.002) ) {
    cat("---\ns =" , s , '\n')
    
    p0 = rep(NA,M_selected)
    p1 = rep(NA,M_selected)
    
    chisq_waples = rep(NA,M_selected)
    z_lm = rep(NA,M_selected)
    z_glmm = rep(NA,M_selected)

    # test for association with number of generations apart (in this case it is just population label times number of generations)
    test_population_time = pop_lbl*gens
    # let's add some noise to the dates make it more realistic
    test_population_time = test_population_time + rnorm(length(test_population_time),0,sqrt(0.1*var(test_population_time)))
    selected_snps = matrix(NA,nrow=samp_0+samp_1,ncol=M_selected)
    
    for ( i in 1:M_selected ) {
      
      # compute target_maf for this s iteratively (there has to be a derivation for this)
      p0[i] = runif(1,0.1,0.9)
      p_trajectory = rep(NA,gens)
      p_trajectory[1] = p0[i]
      for ( g in 2:gens ) {
        p_trajectory[g] = p_trajectory[g-1] + s*p_trajectory[g-1]*(1-p_trajectory[g-1])/(1+2*s*p_trajectory[g-1])
      }
      p1[i] = tail(p_trajectory,1)
      # add drift
      drift_var = p0[i] * (1-p0[i]) * (gens / (2*Ne))
      # Second population
      p1[i] = p1[i] + rnorm(1,0,sqrt(drift_var))
      if(p1[i] > 1) p1[i] = 1
      if(p1[i] < 0) p1[i] = 0
      
      # Sample the variants
      cur_snp0 = rbinom(samp_0,2,p0[i])
      cur_snp1 = rbinom(samp_1,2,p1[i])
      test_snp_alleles = matrix(c(cur_snp0,cur_snp1),nrow=1)
      selected_snps[,i] = test_snp_alleles
      rownames(test_snp_alleles) = "snp1"
      test_total_alleles = matrix(rep(2,length(test_snp_alleles)),nrow=1)
      
      # test with a standard LM (should be inflated by drift)
      reg = summary(lm( c(test_population_time) ~ c(test_snp_alleles) ))
      z_lm[i] = reg$coef[2,3]
      
      # test with a GLMM
      # hide outputs
      reg = quiet(pqlseq2( Y=test_snp_alleles , x=test_population_time , K=grm , lib_size = test_total_alleles , filter=F , verbose=F , model="BMM" , check_K=T , fix_h2eq1=T ))
      # Same run but with the original pqlseq
      # reg = quiet(pqlseq( RawCountDataSet=test_snp_alleles , Phenotypes=test_population_time , RelatednessMatrix=grm , LibSize = test_total_alleles , filtering=F , fit.model="BMM" ))
      if ( is.na(reg$h2) || reg$h2 == 0 || reg$converged == FALSE ) {
        z_glmm[i] = NA
      } else {
        z_glmm[i] = reg$beta / reg$se_beta
      }

      # compute the standard Waples statistic (assuming we know Ne and starting frequency perfectly)
      cur_p0 = mean(cur_snp0/2)
      cur_p1 = mean(cur_snp1/2)
      waples_drift_var = p0[i] * (1-p0[i]) * ( 1/(2*samp_1) + (gens / (2*Ne))*(1-1/(2*samp_1)) )
      chisq_waples[i] = (cur_p1 - cur_p0)^2 / ( waples_drift_var )
    }
    
    if ( s == 0 ) {
      plot(p0,p1,xlim=c(0,1),ylim=c(0,1),main=paste("Ne = ",Ne,sep=''),las=1)
      abline(0,1,lty=3)
    } else {
      points(p0,p1,col="red")
    }
    
    if ( RUN_LMM ) {
      # Prepare data for GMMAT/LMM analysis, note a temp file has to be saved here for the method to load it
      rownames(selected_snps) = paste("sample_",1:nrow(selected_snps),sep='')
      colnames(selected_snps) = paste("snp_",1:ncol(selected_snps),sep='')
      write.table( t(selected_snps) , row.names=T, col.names=F, sep='\t' , quote=F , file="tmp.geno.txt")
      gmmat_df = data.frame("disease" = test_population_time , "id" = rownames(selected_snps) )
      rownames(grm) = rownames(selected_snps)
      colnames(grm) = rownames(selected_snps)
      # Run GMMAT analysis
      reg = glmm.wald( fixed = disease ~ 1 , data = gmmat_df , id = "id" , kins = grm , family = gaussian(link = "identity") , infile = "tmp.geno.txt" , is.dosage=TRUE, snps=colnames(selected_snps) , verbose=FALSE )
      z_lmm = reg$BETA / reg$SE
    } else {
      z_lmm = NA
    }
    
    cat("GLMM NaN Fail",mean(is.na(z_glmm)),'\n')
    cat("Power Waples",mean(chisq_waples > 1.96^2),'\n')
    cat("Power LM",mean(z_lm^2 > 1.96^2),'\n')
    cat("Power GLMM",mean(z_glmm^2 > 1.96^2,na.rm=T),'\n')
    cat("Power GLMM (no NAs)",sum(z_glmm^2 > 1.96,na.rm=T)/length(z_glmm),'\n')
    cat("Power LMM",mean(z_lmm^2 > 1.96^2),'\n')
  }
 }

 # Expected Output
 # 
 #
 # Simulating Ne: 1000 
 # Correlation w/ PC1 0.9959432 
 # ---
 #   s = 0 
 # GLMM NaN Fail 0.38 
 # Power Waples 0.02 
 # Power LM 0.58 
 # Power GLMM 0.5483871 
 # Power GLMM (no NAs) 0.4 
 # Power LMM 0.04 
 # ---
 #   s = 0.002 
 # GLMM NaN Fail 0.36 
 # Power Waples 0.12 
 # Power LM 0.84 
 # Power GLMM 0.71875 
 # Power GLMM (no NAs) 0.5 
 # Power LMM 0.12 
 # 
 # 
 # Simulating Ne: 10000 
 # Correlation w/ PC1 0.929122 
 # ---
 #   s = 0 
 # GLMM NaN Fail 0.56 
 # Power Waples 0.08 
 # Power LM 0.28 
 # Power GLMM 0.2272727 
 # Power GLMM (no NAs) 0.2 
 # Power LMM 0.08 
 # ---
 #   s = 0.002 
 # GLMM NaN Fail 0.52 
 # Power Waples 0.6 
 # Power LM 0.78 
 # Power GLMM 0.7916667 
 # Power GLMM (no NAs) 0.4 
 # Power LMM 0.54 
 # 
 # 
 # Simulating Ne: 1e+06 
 # Correlation w/ PC1 -0.02720476 
 # ---
 #   s = 0 
 # GLMM NaN Fail 0.5 
 # Power Waples 0.1 
 # Power LM 0.06 
 # Power GLMM 0.04 
 # Power GLMM (no NAs) 0.04 
 # Power LMM 0.06 
 # ---
 #   s = 0.002 
 # GLMM NaN Fail 0.4 
 # Power Waples 0.96 
 # Power LM 0.88 
 # Power GLMM 0.8333333 
 # Power GLMM (no NAs) 0.56 
 # Power LMM 0.88
	set.seed(42)

	# Library for GLMM computations
	library(PQLseq2)
	# Library for mixed model / Wald test
	library("GMMAT")

	# convenience function to suppress prints
	quiet <- function(x) {
	sink(tempfile())
	on.exit(sink())
	invisible(force(x))
	}

	RUN_LMM = TRUE

	# We will simulate two populations with drift
	# Simulation for the past 10k years / 400 generations
	gens = as.integer(10e3/25)
	# Use neutral SNPs for the GRM calculation
	M = 1e3
	# No. of SNPs to test under selection
	M_selected = 50
	# 100 individuals in each population
	samp_0 = 100
	samp_1 = 100
	# ---
	# Simulate different Nes for different levels of drift
	par(mfrow=c(1,3))

	for ( Ne in c(1e3,10e3,1e6) ) {
	cat("\n\nSimulating Ne:",Ne,'\n')

	# First population
	p0 = runif(M,0.1,0.9)
	# Compute drift variance
	drift_var = p0 * (1-p0) * (gens / (2*Ne))
	# Second population
	p1 = p0 + rnorm(M,0,sqrt(drift_var))
	p1[ p1 < 0 ] = 0
	p1[ p1 > 1 ] = 1
	# Matrices for the null SNPs
	snp0 = matrix(NA,nrow=samp_0,ncol=M)
	snp1 = matrix(NA,nrow=samp_1,ncol=M)

	for ( i in 1:M ) {
	# generate samples from the two populations
	snp0[,i] = rbinom(samp_0,2,p0[i])
	snp1[,i] = rbinom(samp_1,2,p1[i])
	}

	# center and scale the genotypes
	# genos = scale(rbind(snp0,snp1))
	# center and scale using frequencies (as in paper)
	genos = rbind(snp0,snp1)
	mafs = apply(genos,2,mean)/2
	for ( i in 1:M ) {
	genos[,i] = (genos[,i] - 2mafs[i])/sqrt(2mafs[i]*(1-mafs[i]))
	}
	# compute the GRM
	grm = genos %*% t(genos) / M
	# add a small ridge to make this matrix non-singular
	grm = 0.9 * grm + 0.1 * diag(nrow(grm))
	# As a sanity check, compute PCs
	eigenvec = prcomp( grm )$rotation

	# check correlation with pop label
	pop_lbl = c(rep(0,samp_0),rep(1,samp_1))
	cat("Correlation w/ PC1",cor(eigenvec[,1],pop_lbl),'\n')

	# --- Now generate new SNPs under selection and test them
	# Selection coefficients
	for ( s in c(0,0.002) ) {
	cat("---\ns =" , s , '\n')

	p0 = rep(NA,M_selected)
	p1 = rep(NA,M_selected)

	chisq_waples = rep(NA,M_selected)
	z_lm = rep(NA,M_selected)
	z_glmm = rep(NA,M_selected)

	# test for association with number of generations apart (in this case it is just population label times number of generations)
	test_population_time = pop_lbl*gens
	# let's add some noise to the dates make it more realistic
	test_population_time = test_population_time + rnorm(length(test_population_time),0,sqrt(0.1*var(test_population_time)))
	selected_snps = matrix(NA,nrow=samp_0+samp_1,ncol=M_selected)

	for ( i in 1:M_selected ) {

	# compute target_maf for this s iteratively (there has to be a derivation for this)
	p0[i] = runif(1,0.1,0.9)
	p_trajectory = rep(NA,gens)
	p_trajectory[1] = p0[i]
	for ( g in 2:gens ) {
	p_trajectory[g] = p_trajectory[g-1] + sp_trajectory[g-1](1-p_trajectory[g-1])/(1+2sp_trajectory[g-1])
	}
	p1[i] = tail(p_trajectory,1)
	# add drift
	drift_var = p0[i] * (1-p0[i]) * (gens / (2*Ne))
	# Second population
	p1[i] = p1[i] + rnorm(1,0,sqrt(drift_var))
	if(p1[i] > 1) p1[i] = 1
	if(p1[i] < 0) p1[i] = 0

	# Sample the variants
	cur_snp0 = rbinom(samp_0,2,p0[i])
	cur_snp1 = rbinom(samp_1,2,p1[i])
	test_snp_alleles = matrix(c(cur_snp0,cur_snp1),nrow=1)
	selected_snps[,i] = test_snp_alleles
	rownames(test_snp_alleles) = "snp1"
	test_total_alleles = matrix(rep(2,length(test_snp_alleles)),nrow=1)

	# test with a standard LM (should be inflated by drift)
	reg = summary(lm( c(test_population_time) ~ c(test_snp_alleles) ))
	z_lm[i] = reg$coef[2,3]

	# test with a GLMM
	# hide outputs
	reg = quiet(pqlseq2( Y=test_snp_alleles , x=test_population_time , K=grm , lib_size = test_total_alleles , filter=F , verbose=F , model="BMM" , check_K=T , fix_h2eq1=T ))
	# Same run but with the original pqlseq
	# reg = quiet(pqlseq( RawCountDataSet=test_snp_alleles , Phenotypes=test_population_time , RelatednessMatrix=grm , LibSize = test_total_alleles , filtering=F , fit.model="BMM" ))
	if ( is.na(reg$h2) \|\| reg$h2 == 0 \|\| reg$converged == FALSE ) {
	z_glmm[i] = NA
	} else {
	z_glmm[i] = reg$beta / reg$se_beta
	}

	# compute the standard Waples statistic (assuming we know Ne and starting frequency perfectly)
	cur_p0 = mean(cur_snp0/2)
	cur_p1 = mean(cur_snp1/2)
	waples_drift_var = p0[i] * (1-p0[i]) * ( 1/(2samp_1) + (gens / (2Ne))(1-1/(2samp_1)) )
	chisq_waples[i] = (cur_p1 - cur_p0)^2 / ( waples_drift_var )
	}

	if ( s == 0 ) {
	plot(p0,p1,xlim=c(0,1),ylim=c(0,1),main=paste("Ne = ",Ne,sep=''),las=1)
	abline(0,1,lty=3)
	} else {
	points(p0,p1,col="red")
	}

	if ( RUN_LMM ) {
	# Prepare data for GMMAT/LMM analysis, note a temp file has to be saved here for the method to load it
	rownames(selected_snps) = paste("sample_",1:nrow(selected_snps),sep='')
	colnames(selected_snps) = paste("snp_",1:ncol(selected_snps),sep='')
	write.table( t(selected_snps) , row.names=T, col.names=F, sep='\t' , quote=F , file="tmp.geno.txt")
	gmmat_df = data.frame("disease" = test_population_time , "id" = rownames(selected_snps) )
	rownames(grm) = rownames(selected_snps)
	colnames(grm) = rownames(selected_snps)
	# Run GMMAT analysis
	reg = glmm.wald( fixed = disease ~ 1 , data = gmmat_df , id = "id" , kins = grm , family = gaussian(link = "identity") , infile = "tmp.geno.txt" , is.dosage=TRUE, snps=colnames(selected_snps) , verbose=FALSE )
	z_lmm = reg$BETA / reg$SE
	} else {
	z_lmm = NA
	}

	cat("GLMM NaN Fail",mean(is.na(z_glmm)),'\n')
	cat("Power Waples",mean(chisq_waples > 1.96^2),'\n')
	cat("Power LM",mean(z_lm^2 > 1.96^2),'\n')
	cat("Power GLMM",mean(z_glmm^2 > 1.96^2,na.rm=T),'\n')
	cat("Power GLMM (no NAs)",sum(z_glmm^2 > 1.96,na.rm=T)/length(z_glmm),'\n')
	cat("Power LMM",mean(z_lmm^2 > 1.96^2),'\n')
	}
	}

	# Expected Output
	#
	#
	# Simulating Ne: 1000
	# Correlation w/ PC1 0.9959432
	# ---
	# s = 0
	# GLMM NaN Fail 0.38
	# Power Waples 0.02
	# Power LM 0.58
	# Power GLMM 0.5483871
	# Power GLMM (no NAs) 0.4
	# Power LMM 0.04
	# ---
	# s = 0.002
	# GLMM NaN Fail 0.36
	# Power Waples 0.12
	# Power LM 0.84
	# Power GLMM 0.71875
	# Power GLMM (no NAs) 0.5
	# Power LMM 0.12
	#
	#
	# Simulating Ne: 10000
	# Correlation w/ PC1 0.929122
	# ---
	# s = 0
	# GLMM NaN Fail 0.56
	# Power Waples 0.08
	# Power LM 0.28
	# Power GLMM 0.2272727
	# Power GLMM (no NAs) 0.2
	# Power LMM 0.08
	# ---
	# s = 0.002
	# GLMM NaN Fail 0.52
	# Power Waples 0.6
	# Power LM 0.78
	# Power GLMM 0.7916667
	# Power GLMM (no NAs) 0.4
	# Power LMM 0.54
	#
	#
	# Simulating Ne: 1e+06
	# Correlation w/ PC1 -0.02720476
	# ---
	# s = 0
	# GLMM NaN Fail 0.5
	# Power Waples 0.1
	# Power LM 0.06
	# Power GLMM 0.04
	# Power GLMM (no NAs) 0.04
	# Power LMM 0.06
	# ---
	# s = 0.002
	# GLMM NaN Fail 0.4
	# Power Waples 0.96
	# Power LM 0.88
	# Power GLMM 0.8333333
	# Power GLMM (no NAs) 0.56
	# Power LMM 0.88