sashagusev · September 20, 2024 18:45
diff --git a/ancient_dna_selection_continuum.R b/ancient_dna_selection_continuum.R
 set.seed(1)

 # Library for GLMM computations
 library(PQLseq2)
 # Library for mixed model / Wald test
 library("GMMAT")

 # convenience function to suppress prints
 quiet <- function(x) { 
  sink(tempfile()) 
  on.exit(sink()) 
  invisible(force(x)) 
 } 

 RUN_LMM = TRUE
 RUN_LMM2 = TRUE
 RUN_GLMM = TRUE

 # Simulation for the past 10k years / 400 generations
 gens = as.integer(10e3/25)
 # No. of neutral SNPs for the GRM calculation
 M = 1e3
 # No. of SNPs to test under selection
 M_selected = 200
 # No. of individuals to sample
 samp_n = 200

 # ---
 # Simulate different Nes for different levels of drift
 for ( Ne in c(1e3,10e3,1e9) ) {
  # Set the selection parameter (0=neutral)
  for ( s in c(0,0.002) ) {
    cat("\nSimulating Ne:",Ne,"and s:",s,'\n')
    
    # Neutral variants
    neutral_mafs = matrix(NA,nrow=gens,ncol=M)
    neutral_mafs[1,] = runif(M,0.1,0.9)
    for ( g in 2:gens ) {
      keep = neutral_mafs[g-1,] > 0 & neutral_mafs[g-1,] < 1
      # Add drift
      neutral_mafs[g,keep] = neutral_mafs[g-1,keep] + rnorm(sum(keep),0,sqrt( neutral_mafs[g-1,keep] * (1-neutral_mafs[g-1,keep]) / (2*Ne) ))
      neutral_mafs[ g:gens , which(neutral_mafs[g,] < 0) ] = 0
      neutral_mafs[ g:gens , which(neutral_mafs[g,] > 1) ] = 1
    }
    neutral_mafs[ neutral_mafs < 0 ] = 0
    neutral_mafs[ neutral_mafs > 1 ] = 1
    
    # Selected variants
    selected_mafs = matrix(NA,nrow=gens,ncol=M_selected)
    selected_mafs[1,] = runif(M_selected,0.1,0.9)
    for ( g in 2:gens ) {
      keep = selected_mafs[g-1,] > 0 & selected_mafs[g-1,] < 1
      # Add drift
      prev_maf = selected_mafs[g-1,keep]
      selected_mafs[g,keep] = prev_maf + s*prev_maf*(1-prev_maf)/(1+2*s*prev_maf) + rnorm(sum(keep),0,sqrt( prev_maf * (1-prev_maf) / (2*Ne) ))
      selected_mafs[ g:gens , which(selected_mafs[g,] < 0) ] = 0
      selected_mafs[ g:gens , which(selected_mafs[g,] > 1) ] = 1
    }
    selected_mafs[ selected_mafs < 0 ] = 0
    selected_mafs[ selected_mafs > 1 ] = 1  
    
    # set the sampling time (when each aDNA sample was collected)
    sample_time = sample(1:gens,samp_n)
    
    # generate neutral variants
    # Matrix for the null SNPs
    snp_neutral = matrix(NA,nrow=samp_n,ncol=M)
    for ( i in 1:M ) {
      # for each individual, take the allele frequeny from *their* sampling time
      snp_neutral[,i] = rbinom(samp_n,2,neutral_mafs[sample_time,i])
    }
    
    # generate variants under selection
    snp_selected = matrix(NA,nrow=samp_n,ncol=M_selected)
    for ( i in 1:M_selected ) {
      # for each individual, take the allele frequeny from *their* sampling time
      snp_selected[,i] = rbinom(samp_n,2,selected_mafs[sample_time,i])
    }
    
    # --- make the neutral snp GRM
    # center and scale the genotypes
    # genos = scale(rbind(snp0,snp1))
    # center and scale using frequencies
    genos = snp_neutral
    mafs = apply(genos,2,mean)/2
    for ( i in 1:M ) {
      genos[,i] = (genos[,i] - 2*mafs[i])/sqrt(2*mafs[i]*(1-mafs[i]))
    }
    # compute the GRM
    grm = genos %*% t(genos) / M
    # add a small ridge to make this matrix non-singular
    grm = 0.9 * grm + 0.1 * diag(nrow(grm))
    # As a sanity check, compute PCs
    eigenvec = prcomp( grm )$rotation
    
    # check correlation with sampling time
    cat("Correlation w/ PC1",cor(eigenvec[,1],sample_time),'\n')
    
    # Selection scan
    test_population_time = sample_time + rnorm(length(sample_time),0,sqrt(0.1*var(sample_time)))
    
    z_lm = rep(NA,M_selected)
    z_glmm = rep(NA,M_selected)
    
    # --- Now test the SNPs under selection
    # Selection coefficients
    for ( i in 1:M_selected ) {
      test_snp_alleles = matrix(snp_selected[,i],nrow=1)
      rownames(test_snp_alleles) = "snp1"
      test_total_alleles = matrix(rep(2,length(test_snp_alleles)),nrow=1)
      
      # test with a standard LM (should be inflated by drift)
      reg = summary(lm( test_population_time ~ snp_selected[,i] ))
      z_lm[i] = reg$coef[2,3]
      
      if ( RUN_GLMM ) {
        # test with a GLMM
        # hide outputs
        reg = quiet(pqlseq2( Y=test_snp_alleles , x=test_population_time , K=grm , lib_size = test_total_alleles , filter=F , verbose=F , model="BMM" , check_K=T , fix_h2eq1=T ))
        # Same run but with the original pqlseq
        # reg = quiet(pqlseq( RawCountDataSet=test_snp_alleles , Phenotypes=test_population_time , RelatednessMatrix=grm , LibSize = test_total_alleles , filtering=F , fit.model="BMM" ))
        if ( is.na(reg$h2) || reg$h2 == 0 || reg$converged == FALSE ) {
          z_glmm[i] = NA
        } else {
          z_glmm[i] = reg$beta / reg$se_beta
        }
      }
    }
  
    if ( RUN_LMM ) {
      # Prepare data for GMMAT/LMM analysis, note a temp file has to be saved here for the method to load it
      rownames(snp_selected) = paste("sample_",1:nrow(snp_selected),sep='')
      colnames(snp_selected) = paste("snp_",1:ncol(snp_selected),sep='')
      write.table( t(snp_selected) , row.names=T, col.names=F, sep='\t' , quote=F , file="tmp.geno.txt")
      gmmat_df = data.frame("disease" = test_population_time , "id" = rownames(snp_selected) )
      rownames(grm) = rownames(snp_selected)
      colnames(grm) = rownames(snp_selected)
      # Run GMMAT analysis
      reg = glmm.wald( fixed = disease ~ 1 , data = gmmat_df , id = "id" , kins = grm , family = gaussian(link = "identity") , infile = "tmp.geno.txt" , is.dosage=TRUE, snps=colnames(snp_selected) , verbose=FALSE )
      z_lmm = reg$BETA / reg$SE
    } else {
      z_lmm = NA
    }
    
    if ( RUN_LMM2 ) {
      # Prepare data for GMMAT/LMM analysis, note a temp file has to be saved here for the method to load it
      z_lmm2 = rep(NA,ncol(snp_selected))
      rownames(snp_selected) = paste("sample_",1:nrow(snp_selected),sep='')
      colnames(snp_selected) = paste("snp_",1:ncol(snp_selected),sep='')
      rownames(grm) = rownames(snp_selected)
      colnames(grm) = rownames(snp_selected)
      
      time_pseudo_snp = 2*(test_population_time - min(test_population_time)) / (max(test_population_time) - min(test_population_time))
      write.table( t(time_pseudo_snp) , row.names="snp", col.names=F, sep='\t' , quote=F , file="tmp.geno.txt")
      for ( i in 1:ncol(snp_selected) ) {
        gmmat_df = data.frame("disease" = scale(snp_selected[,i]) , "id" = rownames(snp_selected) )
        # Run GMMAT analysis
        reg = glmm.wald( fixed = disease ~ 1 , data = gmmat_df , id = "id" , kins = grm , is.dosage=TRUE , family = gaussian(link = "identity") , infile = "tmp.geno.txt" , snps="snp" , verbose=FALSE )
        z_lmm2[i] = reg$BETA / reg$SE
      }
    } else {
      z_lmm2 = NA
    }
    
    cat("GLMM NaN Fail",mean(is.na(z_glmm)),'\n')
    cat("Power LM",mean(z_lm^2 > 1.96^2),'\n')
    cat("Power GLMM (geno ~ time)",mean(z_glmm^2 > 1.96^2,na.rm=T),'\n')
    cat("Power LMM (geno ~ time)",mean(z_lmm2^2 > 1.96^2),'\n')
    cat("Power LMM (time ~ geno)",mean(z_lmm^2 > 1.96^2),'\n')
    cat( paste(mean(z_lm^2 > 1.96^2) , mean(z_glmm^2 > 1.96^2,na.rm=T) , mean(z_lmm2^2 > 1.96^2) , mean(z_lmm^2 > 1.96^2) ,sep=',') , '\n' )
  }
 }

 # Expected Output
 # Simulating Ne: 1000 and s: 0 
 # Correlation w/ PC1 -0.9767579 
 # GLMM NaN Fail 0.29 
 # Power LM 0.53 
 # Power GLMM (geno ~ time) 0.4788732 
 # Power LMM (geno ~ time) 0.34 
 # Power LMM (time ~ geno) 0.115 
 # 0.53,0.47887323943662,0.34,0.115 
 # 
 # Simulating Ne: 1000 and s: 0.002 
 # Correlation w/ PC1 0.9802674 
 # GLMM NaN Fail 0.365 
 # Power LM 0.68 
 # Power GLMM (geno ~ time) 0.5433071 
 # Power LMM (geno ~ time) 0.48 
 # Power LMM (time ~ geno) 0.135 
 # 0.68,0.543307086614173,0.48,0.135 
 # 
 # Simulating Ne: 10000 and s: 0 
 # Correlation w/ PC1 -0.7918637 
 # GLMM NaN Fail 0.53 
 # Power LM 0.155 
 # Power GLMM (geno ~ time) 0.1595745 
 # Power LMM (geno ~ time) 0.145 
 # Power LMM (time ~ geno) 0.06 
 # 0.155,0.159574468085106,0.145,0.06 
 # 
 # Simulating Ne: 10000 and s: 0.002 
 # Correlation w/ PC1 0.8033265 
 # GLMM NaN Fail 0.455 
 # Power LM 0.525 
 # Power GLMM (geno ~ time) 0.440367 
 # Power LMM (geno ~ time) 0.5 
 # Power LMM (time ~ geno) 0.305 
 # 0.525,0.440366972477064,0.5,0.305 
 # 
 # Simulating Ne: 1e+09 and s: 0 
 # Correlation w/ PC1 0.0171917 
 # GLMM NaN Fail 0.435 
 # Power LM 0.045 
 # Power GLMM (geno ~ time) 0.02654867 
 # Power LMM (geno ~ time) 0.04 
 # Power LMM (time ~ geno) 0.045 
 # 0.045,0.0265486725663717,0.04,0.045 
 # 
 # Simulating Ne: 1e+09 and s: 0.002 
 # Correlation w/ PC1 -0.0787399 
 # GLMM NaN Fail 0.505 
 # Power LM 0.43 
 # Power GLMM (geno ~ time) 0.4141414 
 # Power LMM (geno ~ time) 0.43 
 # Power LMM (time ~ geno) 0.43 
 # 0.43,0.414141414141414,0.43,0.43
	set.seed(1)

	# Library for GLMM computations
	library(PQLseq2)
	# Library for mixed model / Wald test
	library("GMMAT")

	# convenience function to suppress prints
	quiet <- function(x) {
	sink(tempfile())
	on.exit(sink())
	invisible(force(x))
	}

	RUN_LMM = TRUE
	RUN_LMM2 = TRUE
	RUN_GLMM = TRUE

	# Simulation for the past 10k years / 400 generations
	gens = as.integer(10e3/25)
	# No. of neutral SNPs for the GRM calculation
	M = 1e3
	# No. of SNPs to test under selection
	M_selected = 200
	# No. of individuals to sample
	samp_n = 200

	# ---
	# Simulate different Nes for different levels of drift
	for ( Ne in c(1e3,10e3,1e9) ) {
	# Set the selection parameter (0=neutral)
	for ( s in c(0,0.002) ) {
	cat("\nSimulating Ne:",Ne,"and s:",s,'\n')

	# Neutral variants
	neutral_mafs = matrix(NA,nrow=gens,ncol=M)
	neutral_mafs[1,] = runif(M,0.1,0.9)
	for ( g in 2:gens ) {
	keep = neutral_mafs[g-1,] > 0 & neutral_mafs[g-1,] < 1
	# Add drift
	neutral_mafs[g,keep] = neutral_mafs[g-1,keep] + rnorm(sum(keep),0,sqrt( neutral_mafs[g-1,keep] * (1-neutral_mafs[g-1,keep]) / (2*Ne) ))
	neutral_mafs[ g:gens , which(neutral_mafs[g,] < 0) ] = 0
	neutral_mafs[ g:gens , which(neutral_mafs[g,] > 1) ] = 1
	}
	neutral_mafs[ neutral_mafs < 0 ] = 0
	neutral_mafs[ neutral_mafs > 1 ] = 1

	# Selected variants
	selected_mafs = matrix(NA,nrow=gens,ncol=M_selected)
	selected_mafs[1,] = runif(M_selected,0.1,0.9)
	for ( g in 2:gens ) {
	keep = selected_mafs[g-1,] > 0 & selected_mafs[g-1,] < 1
	# Add drift
	prev_maf = selected_mafs[g-1,keep]
	selected_mafs[g,keep] = prev_maf + sprev_maf(1-prev_maf)/(1+2sprev_maf) + rnorm(sum(keep),0,sqrt( prev_maf * (1-prev_maf) / (2*Ne) ))
	selected_mafs[ g:gens , which(selected_mafs[g,] < 0) ] = 0
	selected_mafs[ g:gens , which(selected_mafs[g,] > 1) ] = 1
	}
	selected_mafs[ selected_mafs < 0 ] = 0
	selected_mafs[ selected_mafs > 1 ] = 1

	# set the sampling time (when each aDNA sample was collected)
	sample_time = sample(1:gens,samp_n)

	# generate neutral variants
	# Matrix for the null SNPs
	snp_neutral = matrix(NA,nrow=samp_n,ncol=M)
	for ( i in 1:M ) {
	# for each individual, take the allele frequeny from their sampling time
	snp_neutral[,i] = rbinom(samp_n,2,neutral_mafs[sample_time,i])
	}

	# generate variants under selection
	snp_selected = matrix(NA,nrow=samp_n,ncol=M_selected)
	for ( i in 1:M_selected ) {
	# for each individual, take the allele frequeny from their sampling time
	snp_selected[,i] = rbinom(samp_n,2,selected_mafs[sample_time,i])
	}

	# --- make the neutral snp GRM
	# center and scale the genotypes
	# genos = scale(rbind(snp0,snp1))
	# center and scale using frequencies
	genos = snp_neutral
	mafs = apply(genos,2,mean)/2
	for ( i in 1:M ) {
	genos[,i] = (genos[,i] - 2mafs[i])/sqrt(2mafs[i]*(1-mafs[i]))
	}
	# compute the GRM
	grm = genos %*% t(genos) / M
	# add a small ridge to make this matrix non-singular
	grm = 0.9 * grm + 0.1 * diag(nrow(grm))
	# As a sanity check, compute PCs
	eigenvec = prcomp( grm )$rotation

	# check correlation with sampling time
	cat("Correlation w/ PC1",cor(eigenvec[,1],sample_time),'\n')

	# Selection scan
	test_population_time = sample_time + rnorm(length(sample_time),0,sqrt(0.1*var(sample_time)))

	z_lm = rep(NA,M_selected)
	z_glmm = rep(NA,M_selected)

	# --- Now test the SNPs under selection
	# Selection coefficients
	for ( i in 1:M_selected ) {
	test_snp_alleles = matrix(snp_selected[,i],nrow=1)
	rownames(test_snp_alleles) = "snp1"
	test_total_alleles = matrix(rep(2,length(test_snp_alleles)),nrow=1)

	# test with a standard LM (should be inflated by drift)
	reg = summary(lm( test_population_time ~ snp_selected[,i] ))
	z_lm[i] = reg$coef[2,3]

	if ( RUN_GLMM ) {
	# test with a GLMM
	# hide outputs
	reg = quiet(pqlseq2( Y=test_snp_alleles , x=test_population_time , K=grm , lib_size = test_total_alleles , filter=F , verbose=F , model="BMM" , check_K=T , fix_h2eq1=T ))
	# Same run but with the original pqlseq
	# reg = quiet(pqlseq( RawCountDataSet=test_snp_alleles , Phenotypes=test_population_time , RelatednessMatrix=grm , LibSize = test_total_alleles , filtering=F , fit.model="BMM" ))
	if ( is.na(reg$h2) \|\| reg$h2 == 0 \|\| reg$converged == FALSE ) {
	z_glmm[i] = NA
	} else {
	z_glmm[i] = reg$beta / reg$se_beta
	}
	}
	}

	if ( RUN_LMM ) {
	# Prepare data for GMMAT/LMM analysis, note a temp file has to be saved here for the method to load it
	rownames(snp_selected) = paste("sample_",1:nrow(snp_selected),sep='')
	colnames(snp_selected) = paste("snp_",1:ncol(snp_selected),sep='')
	write.table( t(snp_selected) , row.names=T, col.names=F, sep='\t' , quote=F , file="tmp.geno.txt")
	gmmat_df = data.frame("disease" = test_population_time , "id" = rownames(snp_selected) )
	rownames(grm) = rownames(snp_selected)
	colnames(grm) = rownames(snp_selected)
	# Run GMMAT analysis
	reg = glmm.wald( fixed = disease ~ 1 , data = gmmat_df , id = "id" , kins = grm , family = gaussian(link = "identity") , infile = "tmp.geno.txt" , is.dosage=TRUE, snps=colnames(snp_selected) , verbose=FALSE )
	z_lmm = reg$BETA / reg$SE
	} else {
	z_lmm = NA
	}

	if ( RUN_LMM2 ) {
	# Prepare data for GMMAT/LMM analysis, note a temp file has to be saved here for the method to load it
	z_lmm2 = rep(NA,ncol(snp_selected))
	rownames(snp_selected) = paste("sample_",1:nrow(snp_selected),sep='')
	colnames(snp_selected) = paste("snp_",1:ncol(snp_selected),sep='')
	rownames(grm) = rownames(snp_selected)
	colnames(grm) = rownames(snp_selected)

	time_pseudo_snp = 2*(test_population_time - min(test_population_time)) / (max(test_population_time) - min(test_population_time))
	write.table( t(time_pseudo_snp) , row.names="snp", col.names=F, sep='\t' , quote=F , file="tmp.geno.txt")
	for ( i in 1:ncol(snp_selected) ) {
	gmmat_df = data.frame("disease" = scale(snp_selected[,i]) , "id" = rownames(snp_selected) )
	# Run GMMAT analysis
	reg = glmm.wald( fixed = disease ~ 1 , data = gmmat_df , id = "id" , kins = grm , is.dosage=TRUE , family = gaussian(link = "identity") , infile = "tmp.geno.txt" , snps="snp" , verbose=FALSE )
	z_lmm2[i] = reg$BETA / reg$SE
	}
	} else {
	z_lmm2 = NA
	}

	cat("GLMM NaN Fail",mean(is.na(z_glmm)),'\n')
	cat("Power LM",mean(z_lm^2 > 1.96^2),'\n')
	cat("Power GLMM (geno ~ time)",mean(z_glmm^2 > 1.96^2,na.rm=T),'\n')
	cat("Power LMM (geno ~ time)",mean(z_lmm2^2 > 1.96^2),'\n')
	cat("Power LMM (time ~ geno)",mean(z_lmm^2 > 1.96^2),'\n')
	cat( paste(mean(z_lm^2 > 1.96^2) , mean(z_glmm^2 > 1.96^2,na.rm=T) , mean(z_lmm2^2 > 1.96^2) , mean(z_lmm^2 > 1.96^2) ,sep=',') , '\n' )
	}
	}

	# Expected Output
	# Simulating Ne: 1000 and s: 0
	# Correlation w/ PC1 -0.9767579
	# GLMM NaN Fail 0.29
	# Power LM 0.53
	# Power GLMM (geno ~ time) 0.4788732
	# Power LMM (geno ~ time) 0.34
	# Power LMM (time ~ geno) 0.115
	# 0.53,0.47887323943662,0.34,0.115
	#
	# Simulating Ne: 1000 and s: 0.002
	# Correlation w/ PC1 0.9802674
	# GLMM NaN Fail 0.365
	# Power LM 0.68
	# Power GLMM (geno ~ time) 0.5433071
	# Power LMM (geno ~ time) 0.48
	# Power LMM (time ~ geno) 0.135
	# 0.68,0.543307086614173,0.48,0.135
	#
	# Simulating Ne: 10000 and s: 0
	# Correlation w/ PC1 -0.7918637
	# GLMM NaN Fail 0.53
	# Power LM 0.155
	# Power GLMM (geno ~ time) 0.1595745
	# Power LMM (geno ~ time) 0.145
	# Power LMM (time ~ geno) 0.06
	# 0.155,0.159574468085106,0.145,0.06
	#
	# Simulating Ne: 10000 and s: 0.002
	# Correlation w/ PC1 0.8033265
	# GLMM NaN Fail 0.455
	# Power LM 0.525
	# Power GLMM (geno ~ time) 0.440367
	# Power LMM (geno ~ time) 0.5
	# Power LMM (time ~ geno) 0.305
	# 0.525,0.440366972477064,0.5,0.305
	#
	# Simulating Ne: 1e+09 and s: 0
	# Correlation w/ PC1 0.0171917
	# GLMM NaN Fail 0.435
	# Power LM 0.045
	# Power GLMM (geno ~ time) 0.02654867
	# Power LMM (geno ~ time) 0.04
	# Power LMM (time ~ geno) 0.045
	# 0.045,0.0265486725663717,0.04,0.045
	#
	# Simulating Ne: 1e+09 and s: 0.002
	# Correlation w/ PC1 -0.0787399
	# GLMM NaN Fail 0.505
	# Power LM 0.43
	# Power GLMM (geno ~ time) 0.4141414
	# Power LMM (geno ~ time) 0.43
	# Power LMM (time ~ geno) 0.43
	# 0.43,0.414141414141414,0.43,0.43