Skip to content

Instantly share code, notes, and snippets.

@sashagusev
Last active April 1, 2025 01:12
Show Gist options
  • Save sashagusev/11b6d17d1c326fc54f2db5532759002b to your computer and use it in GitHub Desktop.
Save sashagusev/11b6d17d1c326fc54f2db5532759002b to your computer and use it in GitHub Desktop.
set.seed(42)
### --- REML
# Original code:
# https://github.com/sashagusev/SKK-REML-sim/blob/master/func_reml.R
library('msm')
# Utility function for calculating log of determinant
`logdet` <-
function(p) {
det_l <- determinant(p,log=T)
if ( det_l$sign[1] == 1 ) return(det_l$modulus[1])
else return(det_l$modulus[1] * -1);
}
# Average-Information ML (no fixed effects) until convergence
# A = list of GRM
# y = vector of phenotype entries
# Var = initial variance components (percent)
`aiML` <-
function( A, y , Var , verbose=TRUE ){
r <- length(A) + 1
N <- length(y)
# Add matrix of residuals to A
A[[r]] <- diag(N)
AI <- matrix(0, ncol=r, nrow=r)
S <- matrix(0, ncol=r, nrow=r)
s <- matrix(0, ncol=1, nrow=r)
l_dif <- 10
it <- 0
Var <- c(var(y)) * Var
# Perform a single iteration of EM-based REML to initiate parameters
if(verbose) cat("Performing a single iteration of EM-REML\n")
V <- 0
for ( i in 1:r ) V <- V + A[[i]] * Var[i]
Vinv <- solve(V)
P <- Vinv
if(verbose)
cat("Prior values from EM-REML:")
for ( i in 1:r ) {
Var[i] <- (Var[i]^2 * t(y) %*% P %*% A[[i]] %*% P %*% y + sum(diag(Var[i]*diag(N) - Var[i]^2 * P %*% A[[i]])) )/N
if(verbose)
cat(" ",Var[i],sep='')
}
if(verbose)
cat('\n')
V <- 0
for ( i in 1:r ) V <- V + A[[i]] * Var[i]
Vinv <- solve(V)
P <- Vinv
logL <- -0.5 * ( logdet(V) + t(y) %*% P %*% y )
if(verbose)
cat ("EM:\t",logL,'\n',sep='')
# Iterate AI REML until convergence
# while ( abs(l_dif) >= 10^-4 & it < 100 ){
# ** GCTA style:
while ( it < 100 & ( abs(l_dif) >= 10^-4 | (abs(l_dif)<10^-2 & l_dif < 0)) ){
it <- it + 1
# Average information matrix
for ( i in 1:r ) {
for( ii in 1:r ) {
if ( i == r & ii == r ) AI[r,r] <- t(y) %*% P %*% P %*% P %*% y
else if ( i == r ) AI[r,ii] <- t(y) %*% P %*% P %*% A[[ii]] %*% P %*% y
else if ( ii == r ) AI[i,r] <- t(y) %*% P %*% A[[i]] %*% P %*% P %*% y
else AI[i,ii] <- t(y) %*% P %*% A[[i]] %*% P %*% A[[ii]] %*% P %*% y
}
}
AI <- 0.5*AI
# Vector of first derivatives of log likelihood function
for ( i in 1:r ) {
if ( i == r ) s[r,1] <- sum(diag(( P ))) - ( t(y) %*% P %*% P %*% y )
else s[i,1] <- sum(diag(( P %*% A[[i]] ))) - ( t(y) %*% P %*% A[[i]] %*% P %*% y )
}
s <- -0.5*s
# New variance components from AI and likelihood
# Var <- Var + solve(AI) %*% s
# ** GCTA style:
if ( l_dif > 1 ) Var <- Var + 0.316*(solve(AI) %*% s)
else Var <- Var + solve(AI) %*% s
# Re-calculate V and P matrix
V <- 0
for ( i in 1:r ) V <- V + A[[i]] * Var[i]
Vinv <- solve(V)
P <- Vinv
# Likelihood of the MLM (ignoring constants)
new_logL <- -0.5 * ( logdet(V) + t(y) %*% P %*% y )
l_dif <- new_logL - logL
logL <- new_logL
if(verbose) {
cat(it,'\t',logL,sep='')
for( i in 1:r ) cat( '\t',Var[i],sep='' )
cat('\n')
}
}
if(verbose)
cat('\n')
# Calculate matrix for standard errors (same as AI matrix but w/out y)
for( i in 1:r ) {
for ( ii in 1:r ) {
S[i,ii] <- sum(diag(P %*% A[[i]] %*% P %*% A[[ii]] ))
}
}
S <- 0.5*S
Sinv <- solve(S)
if(verbose){
for( i in 1:r ) cat( "V(G",i,")\t",Var[i],'\t',sqrt(Sinv[i,i]),'\n',sep="")
}
# Construct string equation for delta method "~x1+x2 ... +xn"
SE.eq <- "~"
for(i in 1:r) {
if ( i == 1 ) SE.eq <- paste(SE.eq,"x",i,sep='')
else SE.eq <- paste(SE.eq,"+x",i,sep='')
}
SE.p <- deltamethod(as.formula(SE.eq),Var,Sinv,ses=T)
if( verbose )
cat( "Vp\t",sum(Var),'\t',SE.p,'\n',sep="")
SE.i <- rep(0,r)
for( i in 1:r ) {
# Construct string equation for delta method "~xi/(x2 ... +xn)"
SE.eq <- paste("~x",i,"/(",sep='')
for( ii in setdiff(1:r,i) ) {
if ( ii == setdiff(1:r,i)[1] ) SE.eq <- paste (SE.eq,"x",i,sep='')
SE.eq <- paste (SE.eq,"+x",ii,sep='')
}
SE.eq <- paste(SE.eq,")",sep='')
SE.i[i] <- deltamethod(as.formula(SE.eq),Var,Sinv,ses=T)
if(verbose)
cat( "V(G",i,")/Vp\t",Var[i]/sum(Var),'\t',SE.i[i],'\n',sep='')
}
return( list( "h2" = Var[1]/sum(Var) , "se" = SE.i[1] ))
}
### ----
# Actual Simulations
### ----
# Num individuals
N = 1e3
# Num markers
M = 100
# Minor allele frequency
maf = 0.5
# Direct variance in the first generation
var_direct = 0.6
# Variance of the parental mean phenotype effect
var_indirect = 0.3
# Phenotypic mate correlation for assortment
AM_cor = 0.5
# Number of simulations to run
seeds = 10
# Whether to run REML (slow!)
RUN_REML = TRUE
# ---
### Vectors for storing results
# REML results
results_rdr_reml = rep(NA,seeds)
# HE results
results_rdr_he = rep(NA,seeds)
# Twin results
results_hsq_twin = rep(NA,seeds)
# PGI results
results_pgi_direct = rep(NA,seeds)
# Phenotypic variance
results_var_pheno = rep(NA,seeds)
# Genetic correlation of direct effects in parents
mate_rg = rep(NA,seeds)
for ( s in 1:seeds ) {
cat("Iteration ",s,'\n')
params = c(s , N,M,AM_cor,var_direct,var_indirect)
# ---
# sample effect sizes for direct effect (follow Young et al. and just sample from std norm)
betas = rnorm(M,0,sqrt(var_direct/M))
# sample 0/1 haplotypes
mate1_hap_a = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M )
mate1_hap_b = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M )
# sample a random mate
mate2_hap_a = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M )
mate2_hap_b = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M )
# compute genetic value for direct phenotype
gv_mate1_direct = scale( mate1_hap_a + mate1_hap_b ) %*% betas
gv_mate2_direct = scale( mate2_hap_a + mate2_hap_b ) %*% betas
# generate phenotype
pheno_mate1 = (gv_mate1_direct)
pheno_mate2 = (gv_mate2_direct)
# compute residual variance in the un-assorted population
# we will use this as the environmental variance going forward
resid_var = 1-var(c(pheno_mate1,pheno_mate2))
# add residual variance to sum up to 1
pheno_mate1 = pheno_mate1 + rnorm(N,0,sqrt(resid_var))
pheno_mate2 = pheno_mate2 + rnorm(N,0,sqrt(resid_var))
## --- Assortative Mating until equilibrium
if ( AM_cor > 0 ) {
for ( gens in 1:20 ) {
# rank the phenotypes and find similar ranks
if ( AM_cor != 0 ) {
ord1 = order(pheno_mate1)
ord2 = order( AM_cor * scale(pheno_mate2) + rnorm(N,0,sqrt(1-AM_cor^2)))
} else {
ord1 = sample(1:N)
ord2 = sample(1:N)
}
mate1_hap_a = mate1_hap_a[ord1,]
mate1_hap_b = mate1_hap_b[ord1,]
mate2_hap_a = mate2_hap_a[ord2,]
mate2_hap_b = mate2_hap_b[ord2,]
pheno_mate1 = pheno_mate1[ord1]
pheno_mate2 = pheno_mate2[ord2]
# --- mate and generate kids
child1_hap_a = mate1_hap_a
sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
child1_hap_a[ !sampled_vars ] = mate1_hap_b[ !sampled_vars ]
child1_hap_b = mate2_hap_a
sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
child1_hap_b[ !sampled_vars ] = mate2_hap_b[ !sampled_vars ]
# generate phenotype
# - direct effect
gv_child1_direct = scale( child1_hap_a + child1_hap_b ) %*% betas
# - parental effect
child1_indirect = (pheno_mate1 + pheno_mate2)/2
# - final phenotype
pheno_child1 = (gv_child1_direct) + sqrt(var_indirect) * scale(child1_indirect) + rnorm(N,0,sqrt(resid_var))
# --- find new mate and generate kids
if ( AM_cor != 0 ) {
ord1 = order(pheno_mate1)
ord2 = order( AM_cor * scale(pheno_mate2) + rnorm(N,0,sqrt(1-AM_cor^2)))
} else {
ord1 = sample(1:N)
ord2 = sample(1:N)
}
mate1_hap_a = mate1_hap_a[ord1,]
mate1_hap_b = mate1_hap_b[ord1,]
mate2_hap_a = mate2_hap_a[ord2,]
mate2_hap_b = mate2_hap_b[ord2,]
pheno_mate1 = pheno_mate1[ord1]
pheno_mate2 = pheno_mate2[ord2]
child2_hap_a = mate1_hap_a
sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
child2_hap_a[ !sampled_vars ] = mate1_hap_b[ !sampled_vars ]
child2_hap_b = mate2_hap_a
sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
child2_hap_b[ !sampled_vars ] = mate2_hap_b[ !sampled_vars ]
# generate phenotype
# - direct effect
gv_child2_direct = scale( child2_hap_a + child2_hap_b ) %*% betas
# - parental effect
child2_indirect = (pheno_mate1 + pheno_mate2)/2
# - final phenotype
pheno_child2 = (gv_child2_direct) + sqrt(var_indirect) * scale(child2_indirect) + rnorm(N,0,sqrt(resid_var))
# update to the next generation
mate1_hap_a = child1_hap_a
mate1_hap_b = child1_hap_b
mate2_hap_a = child2_hap_a
mate2_hap_b = child2_hap_b
pheno_mate1 = pheno_child1
pheno_mate2 = pheno_child2
}
}
### ----
# Now make the final generation of children to be tested
### ----
# Rank the phenotypes and find similar ranks
if ( AM_cor != 0 ) {
ord1 = order(pheno_mate1)
ord2 = order( AM_cor * scale(pheno_mate2) + rnorm(N,0,sqrt(1-AM_cor^2)))
} else {
ord1 = sample(1:N)
ord2 = sample(1:N)
}
mate1_hap_a = mate1_hap_a[ord1,]
mate1_hap_b = mate1_hap_b[ord1,]
mate2_hap_a = mate2_hap_a[ord2,]
mate2_hap_b = mate2_hap_b[ord2,]
pheno_mate1 = pheno_mate1[ord1]
pheno_mate2 = pheno_mate2[ord2]
# *** Record the genetic mate correlation of the parents (r_g)
mate_rg[s] = cor(( mate1_hap_a + mate1_hap_b ) %*% betas , (mate2_hap_a + mate2_hap_b ) %*% betas)
# --- generate one child per family
child_hap_a = mate1_hap_a
sampled_vars_mate1 = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
child_hap_a[ !sampled_vars_mate1 ] = mate1_hap_b[ !sampled_vars_mate1 ]
child_hap_b = mate2_hap_a
sampled_vars_mate2 = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
child_hap_b[ !sampled_vars_mate2 ] = mate2_hap_b[ !sampled_vars_mate2 ]
# generate their phenotype
gv_child_direct = scale( child_hap_a + child_hap_b ) %*% betas
# also store the parental mean for PGI analysis at the end
gv_child_indirect = scale( mate1_hap_a + mate1_hap_b + mate2_hap_a + mate2_hap_b ) %*% betas
# phenotype = direct + scaled parental phenotype + residual variance
pheno_child = (gv_child_direct) + sqrt(var_indirect) * scale((pheno_mate1 + pheno_mate2)/2) + rnorm(N,0,sqrt(resid_var))
# *** Record the phenotypic variance
results_var_pheno[s] = var(pheno_child)
# --- generate twins
# MZs only have E noise
y_MZ1 = (gv_child_direct) + sqrt(var_indirect) * scale((pheno_mate1 + pheno_mate2)/2) + rnorm(N,0,sqrt(resid_var))
y_MZ2 = (gv_child_direct) + sqrt(var_indirect) * scale((pheno_mate1 + pheno_mate2)/2) + rnorm(N,0,sqrt(resid_var))
# generate a new DZ
y_DZ1 = (gv_child_direct) + sqrt(var_indirect) * scale((pheno_mate1 + pheno_mate2)/2) + rnorm(N,0,sqrt(resid_var))
# generate their sibling
sib_hap_a = mate1_hap_a
sampled_vars_mate1 = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
sib_hap_a[ !sampled_vars_mate1 ] = mate1_hap_b[ !sampled_vars_mate1 ]
sib_hap_b = mate2_hap_a
sampled_vars_mate2 = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
sib_hap_b[ !sampled_vars_mate2 ] = mate2_hap_b[ !sampled_vars_mate2 ]
# compute genetic value
gv_sib_direct = scale( sib_hap_a + sib_hap_b ) %*% betas
y_DZ2 = (gv_sib_direct) + sqrt(var_indirect) * scale((pheno_mate1 + pheno_mate2)/2) + rnorm(N,0,sqrt(resid_var))
### ----
# Estimate Parameters
### ----
# *** Record Falconer estimate from twins
results_hsq_twin[s] = 2*(cor(y_MZ1,y_MZ2) - cor(y_DZ1,y_DZ2))
cat( params , "Twin A^2" , results_hsq_twin[s] , '\n' , sep='\t')
# Compute kinship matrices for RDR
x_child = scale( child_hap_a + child_hap_b )
k_child = x_child %*% t(x_child) / ncol(x_child)
x_parents = sqrt(2) * scale( mate1_hap_a + mate1_hap_b + mate2_hap_a + mate2_hap_b )
k_par = x_parents %*% t(x_parents) / (2*ncol(x_parents))
k_opar = ( x_child %*% t(x_parents) + x_parents %*% t(x_child) ) / (2*ncol(x_parents))
# *** Record RDR REML results
if ( RUN_REML == TRUE ) {
K = list()
K[[1]] = k_child
K[[2]] = k_par
K[[3]] = k_opar
# REML RDR
try({
reml = aiML( A = K , y = scale(pheno_child) , Var = c(0.25,0.25,0.25,0.25) , verbose=FALSE )
results_rdr_reml[s] = reml$h2
cat( params , "RDR_REML_direct" , results_rdr_reml[s] , '\n' , sep='\t')
},silent=F)
}
# Vectorize the matrices for HE-regression
yprod_child = scale( pheno_child ) %*% t( scale( pheno_child ) )
yvals = yprod_child[ lower.tri(yprod_child) ]
kvals = k_child[ lower.tri(k_child) ]
kvals_par = k_par[ lower.tri(k_par) ]
kvals_opar = k_opar[ lower.tri(k_opar) ]
# *** Record HE-regression REML results
results_rdr_he[s] = lm( yvals ~ kvals + kvals_par + kvals_opar )$coef[2]
cat( params , "RDR_HE_direct" , results_rdr_he[s] , '\n' , sep='\t')
# *** Record the PGI estimate of direct effects
results_pgi_direct[s] = summary(lm( scale(pheno_child) ~ scale(gv_child_direct) + scale(gv_child_indirect) ))$coef[2,1]^2
cat( params , "PGI_direct" , results_pgi_direct[s] , '\n' , sep='\t')
}
options(digits=3)
cat("\n\n### Final Average:\n" )
cat( "Direct Var_g at t0:" , var_direct , '\n' , sep='\t' )
cat( "*** Results at equilibrium:\n")
cat( "Mean Var_y:", mean(results_var_pheno,na.rm=T) , sd(results_var_pheno,na.rm=T) / sqrt(sum(!is.na(results_var_pheno))) , '\n' ,sep='\t' )
cat( "Direct PGI Var_g:" , mean(results_pgi_direct*results_var_pheno,na.rm=T) , sd(results_pgi_direct*results_var_pheno,na.rm=T) / sqrt(sum(!is.na(results_pgi_direct*results_var_pheno))) , '\n' ,sep='\t' )
cat( "Direct PGI hsq:" , mean(results_pgi_direct,na.rm=T) , sd(results_pgi_direct,na.rm=T) / sqrt(sum(!is.na(results_pgi_direct))) , '\n' ,sep='\t' )
cat( "^^^ This is the truth\n\n")
cat( "RDR HE Var_g:" , mean(results_rdr_he*results_var_pheno,na.rm=T) , sd(results_rdr_he*results_var_pheno,na.rm=T) / sqrt(sum(!is.na(results_rdr_he*results_var_pheno))) , '\n' ,sep='\t' )
cat( "RDR HE hsq:" , mean(results_rdr_he,na.rm=T) , sd(results_rdr_he,na.rm=T) / sqrt(sum(!is.na(results_rdr_he))) , '\n' ,sep='\t' )
h2eq = (1 - sqrt(1 - 4 * AM_cor * results_rdr_he))/(2 * AM_cor)
cat( "RDR HE hsq corrected (Kemper):" , mean(h2eq,na.rm=T) , sd(h2eq,na.rm=T) / sqrt(sum(!is.na(h2eq))) , '\n' ,sep='\t' )
h2eq = results_rdr_he / (1 - mate_rg)
cat( "RDR HE hsq corrected (r_g):" , mean(h2eq,na.rm=T) , sd(h2eq,na.rm=T) / sqrt(sum(!is.na(h2eq))) , '\n\n' ,sep='\t' )
cat( "Twin Var_g:" , mean(results_hsq_twin*results_var_pheno,na.rm=T) , sd(results_hsq_twin*results_var_pheno,na.rm=T) / sqrt(sum(!is.na(results_hsq_twin*results_var_pheno))) , '\n' ,sep='\t' )
cat( "Twin hsq:" , mean(results_hsq_twin,na.rm=T) , sd(results_hsq_twin,na.rm=T) / sqrt(sum(!is.na(results_hsq_twin))) , '\n' ,sep='\t' )
h2eq = (1 - sqrt(1 - 4 * AM_cor * results_hsq_twin))/(2 * AM_cor)
cat( "Twin hsq corrected (Kemper):" , mean(h2eq,na.rm=T) , sd(h2eq,na.rm=T) / sqrt(sum(!is.na(h2eq))) , '\n' ,sep='\t' )
h2eq = results_hsq_twin / (1 - mate_rg)
cat( "Twin hsq corrected (r_g):" , mean(h2eq,na.rm=T) , sd(h2eq,na.rm=T) / sqrt(sum(!is.na(h2eq))) , '\n\n' ,sep='\t' )
cat( "RDR REML Var_g:" , mean(results_rdr_reml*results_var_pheno,na.rm=T) , sd(results_rdr_reml*results_var_pheno,na.rm=T) / sqrt(sum(!is.na(results_rdr_reml*results_var_pheno))) , '\n' ,sep='\t' )
cat( "RDR REML hsq:" , mean(results_rdr_reml,na.rm=T) , sd(results_rdr_reml,na.rm=T) / sqrt(sum(!is.na(results_rdr_reml))) , '\n' ,sep='\t' )
h2eq = (1 - sqrt(1 - 4 * AM_cor * results_rdr_reml))/(2 * AM_cor)
cat( "RDR REML hsq corrected (Kemper):" , mean(h2eq,na.rm=T) , sd(h2eq,na.rm=T) / sqrt(sum(!is.na(h2eq))) , '\n' ,sep='\t' )
h2eq = results_rdr_reml / (1 - mate_rg)
cat( "RDR REML hsq corrected (r_g):" , mean(h2eq,na.rm=T) , sd(h2eq,na.rm=T) / sqrt(sum(!is.na(h2eq))) , '\n' ,sep='\t' )
# Expected Output:
# ### Final Average:
# Direct Var_g at t0: 0.6
# *** Results at equilibrium:
# Mean Var_y: 2.44 0.0399
# Direct PGI Var_g: 0.902 0.0571
# Direct PGI hsq: 0.368 0.0186
# ^^^ This is the truth
#
# RDR HE Var_g: 0.593 0.0752
# RDR HE hsq: 0.241 0.0281
# RDR HE hsq corrected (Kemper): 0.29 0.0393
# RDR HE hsq corrected (r_g): 0.401 0.0505
#
# Twin Var_g: 0.598 0.0429
# Twin hsq: 0.244 0.0147
# Twin hsq corrected (Kemper): 0.287 0.0202
# Twin hsq corrected (r_g): 0.403 0.0282
#
# RDR REML Var_g: 0.837 0.0498
# RDR REML hsq: 0.347 0.016
# RDR REML hsq corrected (Kemper): 0.452 0.0288
# RDR REML hsq corrected (r_g): 0.568 0.0355
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment