Created
September 18, 2025 12:55
-
-
Save sashagusev/b92fd98eb9e7a4f0cf93889ab38ac711 to your computer and use it in GitHub Desktop.
Simulation of within-sibling admixture analysis for a diverging trait
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set.seed(42) | |
options(digits=3) | |
# --- Parameters | |
# sibling pairs | |
N = 100e3 | |
# unadmixed individuals (for population estimate) | |
N_unadmixed = 10e3 | |
# number of variants | |
M = 100 | |
# heritability | |
hsq = 0.8 | |
# number of runs per architecture | |
seeds = 5 | |
# frequency in pop1 | |
frq_pop1 = 0.9 | |
# frequency in pop2 | |
frq_pop2 = 0.1 | |
# --- | |
# For storing results | |
all_1cv_est = vector() | |
all_1cv_true = vector() | |
all_100cv_est = vector() | |
all_100cv_true = vector() | |
# Iterate across increasing levels of frequency differentiation | |
for ( frq_pop1 in seq(0.2,0.9,by=0.1) ) { | |
# For reference, calculate the expected phenotype difference based on allele frequencies and heritability | |
frq_mean = (frq_pop1 + frq_pop2)/2 | |
expected_diff = 2 * ( frq_pop1 - frq_pop2 ) * sqrt ( hsq / (2*frq_mean*(1-frq_mean)) ) | |
cat("p1" , frq_pop1 , "p2" , frq_pop2, "Expected absolute effect", expected_diff,'\n') | |
cat( "# causal" , "Pop Effect", "Effect SE" , "WF effect" , "WF SE", "BF effect" , "BF effect" , "BF SE\n", sep='\t' ) | |
# Iterate over different numbers of causal variants | |
for ( M_causal in c(1,100) ) { | |
# for storing results | |
est_wf = rep(NA,seeds) | |
est_bf = rep(NA,seeds) | |
est_pop = rep(NA,seeds) | |
# - | |
for ( s in 1:seeds ) { | |
# Sample causal variants | |
i_causal = sample(M,M_causal) | |
# Default allele frequency | |
maf = 0.5 | |
# sample 0/1 haplotypes for parents | |
mate1_hap_a = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M ) | |
mate1_hap_b = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M ) | |
mate2_hap_a = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M ) | |
mate2_hap_b = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M ) | |
# sample 0/1 ancestry for parents | |
mate1_anc_a = matrix( rbinom(N*M,1,0.5) , nrow=N , ncol=M ) | |
mate1_anc_b = matrix( rbinom(N*M,1,0.5) , nrow=N , ncol=M ) | |
mate2_anc_a = matrix( rbinom(N*M,1,0.5) , nrow=N , ncol=M ) | |
mate2_anc_b = matrix( rbinom(N*M,1,0.5) , nrow=N , ncol=M ) | |
# differentiate the causal variants between populations | |
if ( TRUE ) { | |
for ( i in i_causal ) { | |
# set population 1 to be all carriers | |
mate1_hap_a[mate1_anc_a[,i] == 1,i] = rbinom(sum(mate1_anc_a[,i] == 1),1,frq_pop1) | |
mate1_hap_b[mate1_anc_b[,i] == 1,i] = rbinom(sum(mate1_anc_b[,i] == 1),1,frq_pop1) | |
mate2_hap_a[mate2_anc_a[,i] == 1,i] = rbinom(sum(mate2_anc_a[,i] == 1),1,frq_pop1) | |
mate2_hap_b[mate2_anc_b[,i] == 1,i] = rbinom(sum(mate2_anc_b[,i] == 1),1,frq_pop1) | |
# set population 0 to be all non-carriers | |
mate1_hap_a[mate1_anc_a[,i] == 0,i] = rbinom(sum(mate1_anc_a[,i] == 0),1,frq_pop2) | |
mate1_hap_b[mate1_anc_b[,i] == 0,i] = rbinom(sum(mate1_anc_b[,i] == 0),1,frq_pop2) | |
mate2_hap_a[mate2_anc_a[,i] == 0,i] = rbinom(sum(mate2_anc_a[,i] == 0),1,frq_pop2) | |
mate2_hap_b[mate2_anc_b[,i] == 0,i] = rbinom(sum(mate2_anc_b[,i] == 0),1,frq_pop2) | |
} | |
} | |
# --- generate sibling 1 | |
# hap a | |
child1_hap_a = mate1_hap_a | |
child1_anc_a = mate1_anc_a | |
sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M ) | |
child1_hap_a[ !sampled_vars ] = mate1_hap_b[ !sampled_vars ] | |
child1_anc_a[ !sampled_vars ] = mate1_anc_b[ !sampled_vars ] | |
# hap b | |
child1_hap_b = mate2_hap_a | |
child1_anc_b = mate2_anc_a | |
sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M ) | |
child1_hap_b[ !sampled_vars ] = mate2_hap_b[ !sampled_vars ] | |
child1_anc_b[ !sampled_vars ] = mate2_anc_b[ !sampled_vars ] | |
# --- generate sibling 2 | |
# hap a | |
child2_hap_a = mate1_hap_a | |
child2_anc_a = mate1_anc_a | |
sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M ) | |
child2_hap_a[ !sampled_vars ] = mate1_hap_b[ !sampled_vars ] | |
child2_anc_a[ !sampled_vars ] = mate1_anc_b[ !sampled_vars ] | |
# hap b | |
child2_hap_b = mate2_hap_a | |
child2_anc_b = mate2_anc_a | |
sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M ) | |
child2_hap_b[ !sampled_vars ] = mate2_hap_b[ !sampled_vars ] | |
child2_anc_b[ !sampled_vars ] = mate2_anc_b[ !sampled_vars ] | |
# --- add some people with 100% ancestry for a more conventional analysis (same code as before) | |
# sample 0/1 haplotypes for parents | |
mate1_hap_a = matrix( rbinom(N_unadmixed*M,1,maf) , nrow=N_unadmixed , ncol=M ) | |
mate1_hap_b = matrix( rbinom(N_unadmixed*M,1,maf) , nrow=N_unadmixed , ncol=M ) | |
mate2_hap_a = matrix( rbinom(N_unadmixed*M,1,maf) , nrow=N_unadmixed , ncol=M ) | |
mate2_hap_b = matrix( rbinom(N_unadmixed*M,1,maf) , nrow=N_unadmixed , ncol=M ) | |
# 100% ancestry | |
mate1_anc_a = matrix( 1 , nrow=N_unadmixed , ncol=M ) | |
mate1_anc_b = matrix( 1 , nrow=N_unadmixed , ncol=M ) | |
mate2_anc_a = matrix( 0 , nrow=N_unadmixed , ncol=M ) | |
mate2_anc_b = matrix( 0 , nrow=N_unadmixed , ncol=M ) | |
# fix the causal variants between populations | |
for ( i in i_causal ) { | |
# set population 1 to be all carriers | |
mate1_hap_a[mate1_anc_a[,i] == 1,i] = rbinom(sum(mate1_anc_a[,i] == 1),1,frq_pop1) | |
mate1_hap_b[mate1_anc_b[,i] == 1,i] = rbinom(sum(mate1_anc_b[,i] == 1),1,frq_pop1) | |
mate2_hap_a[mate2_anc_a[,i] == 1,i] = rbinom(sum(mate2_anc_a[,i] == 1),1,frq_pop1) | |
mate2_hap_b[mate2_anc_b[,i] == 1,i] = rbinom(sum(mate2_anc_b[,i] == 1),1,frq_pop1) | |
# set population 0 to be all non-carriers | |
mate1_hap_a[mate1_anc_a[,i] == 0,i] = rbinom(sum(mate1_anc_a[,i] == 0),1,frq_pop2) | |
mate1_hap_b[mate1_anc_b[,i] == 0,i] = rbinom(sum(mate1_anc_b[,i] == 0),1,frq_pop2) | |
mate2_hap_a[mate2_anc_a[,i] == 0,i] = rbinom(sum(mate2_anc_a[,i] == 0),1,frq_pop2) | |
mate2_hap_b[mate2_anc_b[,i] == 0,i] = rbinom(sum(mate2_anc_b[,i] == 0),1,frq_pop2) | |
} | |
# --- merge the admixed and unadmixed samples and label them | |
all_genos = rbind(child1_hap_a+child1_hap_b,child2_hap_a+child2_hap_b,mate1_hap_a+mate1_hap_b,mate2_hap_a+mate2_hap_b) | |
all_anc = rbind(child1_anc_a+child1_anc_b,child2_anc_a+child2_anc_b,mate1_anc_a+mate1_anc_b,mate2_anc_a+mate2_anc_b) | |
all_global_anc = apply( all_anc , 1 , sum ) / (2*M) | |
type = c( rep("siba",N) , rep("sibb",N) , rep("pop1",N_unadmixed) , rep("pop0",N_unadmixed) ) | |
# --- generate heritable phenotype in the entire population with a random environment | |
b_causal = rnorm(M_causal,0,1) | |
all_y = scale( all_genos[,i_causal,drop=F] %*% b_causal )*sqrt(hsq) + rnorm(nrow(all_genos),0,sqrt(1-hsq)) | |
# Run the regression analysis | |
# --- joint regression (ala Wang et al.) | |
fam_mean = (all_global_anc[type=="siba"] + all_global_anc[type=="sibb"])/2 | |
# sibling regression | |
reg_wf = summary(lm( (all_y[type=="siba"]) ~ (all_global_anc[type=="siba"]) + (fam_mean) )) | |
# pop regression without admixture | |
reg_pop = summary(lm(all_y[(type=="pop1" | type=="pop0")] ~ all_global_anc[(type=="pop1" | type=="pop0")] )) | |
#est_wf[s] = reg$coef[2,1] | |
#est_bf[s] = reg$coef[3,1] | |
#est_pop[s] = reg$coef[2,1] | |
cat( M_causal , "\t" , reg_pop$coef[2,1] , "\t(" , reg_pop$coef[2,2] , ")\t" , reg_wf$coef[2,1] , "\t(" , reg_wf$coef[2,2] , ")\t" , reg_wf$coef[3,1] , "\t(" , reg_wf$coef[3,2] , ")\n" , sep='' ) | |
# store all of the results for plotting | |
if ( M_causal == 1 ) { | |
all_1cv_est = c(all_1cv_est,reg_wf$coef[2,1]) | |
all_1cv_true = c(all_1cv_true,reg_pop$coef[2,1]) | |
} else if ( M_causal == 100 ) { | |
all_100cv_est = c(all_100cv_est,reg_wf$coef[2,1]) | |
all_100cv_true = c(all_100cv_true,reg_pop$coef[2,1]) | |
} | |
} | |
} | |
} | |
# Visualize the results from the two architectures | |
clr = c("#d53e4f","#fc8d59") | |
plot( all_1cv_true , all_1cv_est , xlim=c(-2.5,2.5) , ylim=c(-2.5,2.5) , type="n" , las=1 , bty="n" , xlab="True population difference" , ylab="Estimated Within-Family Effect") | |
abline(0,1,lty=2) | |
points( all_1cv_true , all_1cv_est , pch=19 , col=paste(clr[1],"50",sep='') ) | |
points( all_1cv_true , all_1cv_est , col=clr[1] ) | |
reg1 = (lm(all_1cv_est~all_1cv_true)) | |
abline(reg1,col=clr[1],lty=3) | |
points( all_100cv_true , all_100cv_est , pch=19 , col=paste(clr[2],"50",sep='')) | |
points( all_100cv_true , all_100cv_est , col=clr[2]) | |
reg100 = (lm(all_100cv_est~all_100cv_true)) | |
abline(reg100,col=clr[2],lty=3) | |
legend("topleft",pch=19,col=clr,legend=c(paste("1 causal (slope=",round(reg1$coef[2],2)," s.e. ",round(summary(reg1)$coef[2,2],2),")",sep=''),paste("100 causal (slope=",round(reg100$coef[2],2)," s.e. ",round(summary(reg100)$coef[2,2],2),")",sep='')),bty="n",cex=0.75) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment