sashagusev · September 18, 2025 12:55
diff --git a/sibling_admixture.R b/sibling_admixture.R
 set.seed(42)
 options(digits=3)

 # --- Parameters
 # sibling pairs
 N = 100e3
 # unadmixed individuals (for population estimate)
 N_unadmixed = 10e3
 # number of variants
 M = 100
 # heritability
 hsq = 0.8
 # number of runs per architecture
 seeds = 5
 # frequency in pop1
 frq_pop1 = 0.9
 # frequency in pop2
 frq_pop2 = 0.1
 # ---

 # For storing results
 all_1cv_est = vector()
 all_1cv_true = vector()
 all_100cv_est = vector()
 all_100cv_true = vector()

 # Iterate across increasing levels of frequency differentiation
 for ( frq_pop1 in seq(0.2,0.9,by=0.1) ) {
  
  # For reference, calculate the expected phenotype difference based on allele frequencies and heritability
  frq_mean = (frq_pop1 + frq_pop2)/2
  expected_diff = 2 * ( frq_pop1 - frq_pop2 ) * sqrt ( hsq / (2*frq_mean*(1-frq_mean)) )
  cat("p1" , frq_pop1 , "p2" , frq_pop2, "Expected absolute effect", expected_diff,'\n')
  cat( "# causal" , "Pop Effect", "Effect SE" , "WF effect" , "WF SE", "BF effect" , "BF effect" , "BF SE\n", sep='\t' )
  
  # Iterate over different numbers of causal variants
  for ( M_causal in c(1,100) ) {
    # for storing results
    est_wf = rep(NA,seeds)
    est_bf = rep(NA,seeds)
    est_pop = rep(NA,seeds)
    # -
    
    for ( s in 1:seeds ) {
      # Sample causal variants
      i_causal = sample(M,M_causal)
      # Default allele frequency
      maf = 0.5
      # sample 0/1 haplotypes for parents
      mate1_hap_a = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M )
      mate1_hap_b = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M )
      mate2_hap_a = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M )
      mate2_hap_b = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M )
      # sample 0/1 ancestry for parents
      mate1_anc_a = matrix( rbinom(N*M,1,0.5) , nrow=N , ncol=M )
      mate1_anc_b = matrix( rbinom(N*M,1,0.5) , nrow=N , ncol=M )
      mate2_anc_a = matrix( rbinom(N*M,1,0.5) , nrow=N , ncol=M )
      mate2_anc_b = matrix( rbinom(N*M,1,0.5) , nrow=N , ncol=M )
      # differentiate the causal variants between populations
      if ( TRUE ) {
        for ( i in i_causal ) {
          # set population 1 to be all carriers
          mate1_hap_a[mate1_anc_a[,i] == 1,i] = rbinom(sum(mate1_anc_a[,i] == 1),1,frq_pop1)
          mate1_hap_b[mate1_anc_b[,i] == 1,i] = rbinom(sum(mate1_anc_b[,i] == 1),1,frq_pop1)
          mate2_hap_a[mate2_anc_a[,i] == 1,i] = rbinom(sum(mate2_anc_a[,i] == 1),1,frq_pop1)
          mate2_hap_b[mate2_anc_b[,i] == 1,i] = rbinom(sum(mate2_anc_b[,i] == 1),1,frq_pop1)
          # set population 0 to be all non-carriers
          mate1_hap_a[mate1_anc_a[,i] == 0,i] = rbinom(sum(mate1_anc_a[,i] == 0),1,frq_pop2)
          mate1_hap_b[mate1_anc_b[,i] == 0,i] = rbinom(sum(mate1_anc_b[,i] == 0),1,frq_pop2)
          mate2_hap_a[mate2_anc_a[,i] == 0,i] = rbinom(sum(mate2_anc_a[,i] == 0),1,frq_pop2)
          mate2_hap_b[mate2_anc_b[,i] == 0,i] = rbinom(sum(mate2_anc_b[,i] == 0),1,frq_pop2)
        }
      }
      
      # --- generate sibling 1
      # hap a
      child1_hap_a = mate1_hap_a
      child1_anc_a = mate1_anc_a
      sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
      child1_hap_a[ !sampled_vars ] = mate1_hap_b[ !sampled_vars ]
      child1_anc_a[ !sampled_vars ] = mate1_anc_b[ !sampled_vars ]
      # hap b
      child1_hap_b = mate2_hap_a
      child1_anc_b = mate2_anc_a
      sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
      child1_hap_b[ !sampled_vars ] = mate2_hap_b[ !sampled_vars ]
      child1_anc_b[ !sampled_vars ] = mate2_anc_b[ !sampled_vars ]
      # --- generate sibling 2
      # hap a
      child2_hap_a = mate1_hap_a
      child2_anc_a = mate1_anc_a
      sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
      child2_hap_a[ !sampled_vars ] = mate1_hap_b[ !sampled_vars ]
      child2_anc_a[ !sampled_vars ] = mate1_anc_b[ !sampled_vars ]
      # hap b
      child2_hap_b = mate2_hap_a
      child2_anc_b = mate2_anc_a
      sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
      child2_hap_b[ !sampled_vars ] = mate2_hap_b[ !sampled_vars ]
      child2_anc_b[ !sampled_vars ] = mate2_anc_b[ !sampled_vars ]
      
      # --- add some people with 100% ancestry for a more conventional analysis (same code as before)
      # sample 0/1 haplotypes for parents
      mate1_hap_a = matrix( rbinom(N_unadmixed*M,1,maf) , nrow=N_unadmixed , ncol=M )
      mate1_hap_b = matrix( rbinom(N_unadmixed*M,1,maf) , nrow=N_unadmixed , ncol=M )
      mate2_hap_a = matrix( rbinom(N_unadmixed*M,1,maf) , nrow=N_unadmixed , ncol=M )
      mate2_hap_b = matrix( rbinom(N_unadmixed*M,1,maf) , nrow=N_unadmixed , ncol=M )
      # 100% ancestry
      mate1_anc_a = matrix( 1 , nrow=N_unadmixed , ncol=M )
      mate1_anc_b = matrix( 1 , nrow=N_unadmixed , ncol=M )
      mate2_anc_a = matrix( 0 , nrow=N_unadmixed , ncol=M )
      mate2_anc_b = matrix( 0 , nrow=N_unadmixed , ncol=M )
      # fix the causal variants between populations
      for ( i in i_causal ) {
        # set population 1 to be all carriers
        mate1_hap_a[mate1_anc_a[,i] == 1,i] = rbinom(sum(mate1_anc_a[,i] == 1),1,frq_pop1)
        mate1_hap_b[mate1_anc_b[,i] == 1,i] = rbinom(sum(mate1_anc_b[,i] == 1),1,frq_pop1)
        mate2_hap_a[mate2_anc_a[,i] == 1,i] = rbinom(sum(mate2_anc_a[,i] == 1),1,frq_pop1)
        mate2_hap_b[mate2_anc_b[,i] == 1,i] = rbinom(sum(mate2_anc_b[,i] == 1),1,frq_pop1)
        # set population 0 to be all non-carriers
        mate1_hap_a[mate1_anc_a[,i] == 0,i] = rbinom(sum(mate1_anc_a[,i] == 0),1,frq_pop2)
        mate1_hap_b[mate1_anc_b[,i] == 0,i] = rbinom(sum(mate1_anc_b[,i] == 0),1,frq_pop2)
        mate2_hap_a[mate2_anc_a[,i] == 0,i] = rbinom(sum(mate2_anc_a[,i] == 0),1,frq_pop2)
        mate2_hap_b[mate2_anc_b[,i] == 0,i] = rbinom(sum(mate2_anc_b[,i] == 0),1,frq_pop2)
      }
      
      # --- merge the admixed and unadmixed samples and label them
      all_genos = rbind(child1_hap_a+child1_hap_b,child2_hap_a+child2_hap_b,mate1_hap_a+mate1_hap_b,mate2_hap_a+mate2_hap_b)
      all_anc = rbind(child1_anc_a+child1_anc_b,child2_anc_a+child2_anc_b,mate1_anc_a+mate1_anc_b,mate2_anc_a+mate2_anc_b)
      all_global_anc = apply( all_anc , 1 , sum ) / (2*M)
      type = c( rep("siba",N) , rep("sibb",N) , rep("pop1",N_unadmixed) , rep("pop0",N_unadmixed) )
      
      # --- generate heritable phenotype in the entire population with a random environment
      b_causal = rnorm(M_causal,0,1)
      all_y = scale( all_genos[,i_causal,drop=F] %*% b_causal )*sqrt(hsq) + rnorm(nrow(all_genos),0,sqrt(1-hsq))
      
      # Run the regression analysis
      # --- joint regression (ala Wang et al.)
      fam_mean = (all_global_anc[type=="siba"] + all_global_anc[type=="sibb"])/2
      # sibling regression
      reg_wf = summary(lm( (all_y[type=="siba"]) ~ (all_global_anc[type=="siba"]) + (fam_mean) ))
  		# pop regression without admixture
  		reg_pop = summary(lm(all_y[(type=="pop1" | type=="pop0")] ~ all_global_anc[(type=="pop1" | type=="pop0")] ))
  		#est_wf[s] = reg$coef[2,1]
  		#est_bf[s] = reg$coef[3,1]
  		#est_pop[s] = reg$coef[2,1]
  		
      cat( M_causal , "\t" , reg_pop$coef[2,1] , "\t(" , reg_pop$coef[2,2] , ")\t" , reg_wf$coef[2,1] , "\t(" , reg_wf$coef[2,2] , ")\t" , reg_wf$coef[3,1] , "\t(" , reg_wf$coef[3,2] , ")\n" , sep='' )
  	  
      # store all of the results for plotting
      if ( M_causal == 1 ) {
        all_1cv_est = c(all_1cv_est,reg_wf$coef[2,1])
        all_1cv_true = c(all_1cv_true,reg_pop$coef[2,1])
      } else if ( M_causal == 100 ) {
        all_100cv_est = c(all_100cv_est,reg_wf$coef[2,1])
        all_100cv_true = c(all_100cv_true,reg_pop$coef[2,1])      
      }
    }
  }
 }

 # Visualize the results from the two architectures
 clr = c("#d53e4f","#fc8d59")
 plot( all_1cv_true , all_1cv_est , xlim=c(-2.5,2.5) , ylim=c(-2.5,2.5) , type="n" , las=1 , bty="n" , xlab="True population difference" , ylab="Estimated Within-Family Effect")
 abline(0,1,lty=2)
 points( all_1cv_true , all_1cv_est , pch=19 , col=paste(clr[1],"50",sep='') )
 points( all_1cv_true , all_1cv_est , col=clr[1] )
 reg1 = (lm(all_1cv_est~all_1cv_true))
 abline(reg1,col=clr[1],lty=3)
 points( all_100cv_true , all_100cv_est , pch=19 , col=paste(clr[2],"50",sep=''))
 points( all_100cv_true , all_100cv_est , col=clr[2])
 reg100 = (lm(all_100cv_est~all_100cv_true))
 abline(reg100,col=clr[2],lty=3)
 legend("topleft",pch=19,col=clr,legend=c(paste("1 causal (slope=",round(reg1$coef[2],2)," s.e. ",round(summary(reg1)$coef[2,2],2),")",sep=''),paste("100 causal (slope=",round(reg100$coef[2],2)," s.e. ",round(summary(reg100)$coef[2,2],2),")",sep='')),bty="n",cex=0.75)
	set.seed(42)
	options(digits=3)

	# --- Parameters
	# sibling pairs
	N = 100e3
	# unadmixed individuals (for population estimate)
	N_unadmixed = 10e3
	# number of variants
	M = 100
	# heritability
	hsq = 0.8
	# number of runs per architecture
	seeds = 5
	# frequency in pop1
	frq_pop1 = 0.9
	# frequency in pop2
	frq_pop2 = 0.1
	# ---

	# For storing results
	all_1cv_est = vector()
	all_1cv_true = vector()
	all_100cv_est = vector()
	all_100cv_true = vector()

	# Iterate across increasing levels of frequency differentiation
	for ( frq_pop1 in seq(0.2,0.9,by=0.1) ) {

	# For reference, calculate the expected phenotype difference based on allele frequencies and heritability
	frq_mean = (frq_pop1 + frq_pop2)/2
	expected_diff = 2 * ( frq_pop1 - frq_pop2 ) * sqrt ( hsq / (2frq_mean(1-frq_mean)) )
	cat("p1" , frq_pop1 , "p2" , frq_pop2, "Expected absolute effect", expected_diff,'\n')
	cat( "# causal" , "Pop Effect", "Effect SE" , "WF effect" , "WF SE", "BF effect" , "BF effect" , "BF SE\n", sep='\t' )

	# Iterate over different numbers of causal variants
	for ( M_causal in c(1,100) ) {
	# for storing results
	est_wf = rep(NA,seeds)
	est_bf = rep(NA,seeds)
	est_pop = rep(NA,seeds)
	# -

	for ( s in 1:seeds ) {
	# Sample causal variants
	i_causal = sample(M,M_causal)
	# Default allele frequency
	maf = 0.5
	# sample 0/1 haplotypes for parents
	mate1_hap_a = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M )
	mate1_hap_b = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M )
	mate2_hap_a = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M )
	mate2_hap_b = matrix( rbinom(N*M,1,maf) , nrow=N , ncol=M )
	# sample 0/1 ancestry for parents
	mate1_anc_a = matrix( rbinom(N*M,1,0.5) , nrow=N , ncol=M )
	mate1_anc_b = matrix( rbinom(N*M,1,0.5) , nrow=N , ncol=M )
	mate2_anc_a = matrix( rbinom(N*M,1,0.5) , nrow=N , ncol=M )
	mate2_anc_b = matrix( rbinom(N*M,1,0.5) , nrow=N , ncol=M )
	# differentiate the causal variants between populations
	if ( TRUE ) {
	for ( i in i_causal ) {
	# set population 1 to be all carriers
	mate1_hap_a[mate1_anc_a[,i] == 1,i] = rbinom(sum(mate1_anc_a[,i] == 1),1,frq_pop1)
	mate1_hap_b[mate1_anc_b[,i] == 1,i] = rbinom(sum(mate1_anc_b[,i] == 1),1,frq_pop1)
	mate2_hap_a[mate2_anc_a[,i] == 1,i] = rbinom(sum(mate2_anc_a[,i] == 1),1,frq_pop1)
	mate2_hap_b[mate2_anc_b[,i] == 1,i] = rbinom(sum(mate2_anc_b[,i] == 1),1,frq_pop1)
	# set population 0 to be all non-carriers
	mate1_hap_a[mate1_anc_a[,i] == 0,i] = rbinom(sum(mate1_anc_a[,i] == 0),1,frq_pop2)
	mate1_hap_b[mate1_anc_b[,i] == 0,i] = rbinom(sum(mate1_anc_b[,i] == 0),1,frq_pop2)
	mate2_hap_a[mate2_anc_a[,i] == 0,i] = rbinom(sum(mate2_anc_a[,i] == 0),1,frq_pop2)
	mate2_hap_b[mate2_anc_b[,i] == 0,i] = rbinom(sum(mate2_anc_b[,i] == 0),1,frq_pop2)
	}
	}

	# --- generate sibling 1
	# hap a
	child1_hap_a = mate1_hap_a
	child1_anc_a = mate1_anc_a
	sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
	child1_hap_a[ !sampled_vars ] = mate1_hap_b[ !sampled_vars ]
	child1_anc_a[ !sampled_vars ] = mate1_anc_b[ !sampled_vars ]
	# hap b
	child1_hap_b = mate2_hap_a
	child1_anc_b = mate2_anc_a
	sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
	child1_hap_b[ !sampled_vars ] = mate2_hap_b[ !sampled_vars ]
	child1_anc_b[ !sampled_vars ] = mate2_anc_b[ !sampled_vars ]
	# --- generate sibling 2
	# hap a
	child2_hap_a = mate1_hap_a
	child2_anc_a = mate1_anc_a
	sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
	child2_hap_a[ !sampled_vars ] = mate1_hap_b[ !sampled_vars ]
	child2_anc_a[ !sampled_vars ] = mate1_anc_b[ !sampled_vars ]
	# hap b
	child2_hap_b = mate2_hap_a
	child2_anc_b = mate2_anc_a
	sampled_vars = matrix( as.logical(rbinom(N*M,1,0.5)) , nrow=N , ncol=M )
	child2_hap_b[ !sampled_vars ] = mate2_hap_b[ !sampled_vars ]
	child2_anc_b[ !sampled_vars ] = mate2_anc_b[ !sampled_vars ]

	# --- add some people with 100% ancestry for a more conventional analysis (same code as before)
	# sample 0/1 haplotypes for parents
	mate1_hap_a = matrix( rbinom(N_unadmixed*M,1,maf) , nrow=N_unadmixed , ncol=M )
	mate1_hap_b = matrix( rbinom(N_unadmixed*M,1,maf) , nrow=N_unadmixed , ncol=M )
	mate2_hap_a = matrix( rbinom(N_unadmixed*M,1,maf) , nrow=N_unadmixed , ncol=M )
	mate2_hap_b = matrix( rbinom(N_unadmixed*M,1,maf) , nrow=N_unadmixed , ncol=M )
	# 100% ancestry
	mate1_anc_a = matrix( 1 , nrow=N_unadmixed , ncol=M )
	mate1_anc_b = matrix( 1 , nrow=N_unadmixed , ncol=M )
	mate2_anc_a = matrix( 0 , nrow=N_unadmixed , ncol=M )
	mate2_anc_b = matrix( 0 , nrow=N_unadmixed , ncol=M )
	# fix the causal variants between populations
	for ( i in i_causal ) {
	# set population 1 to be all carriers
	mate1_hap_a[mate1_anc_a[,i] == 1,i] = rbinom(sum(mate1_anc_a[,i] == 1),1,frq_pop1)
	mate1_hap_b[mate1_anc_b[,i] == 1,i] = rbinom(sum(mate1_anc_b[,i] == 1),1,frq_pop1)
	mate2_hap_a[mate2_anc_a[,i] == 1,i] = rbinom(sum(mate2_anc_a[,i] == 1),1,frq_pop1)
	mate2_hap_b[mate2_anc_b[,i] == 1,i] = rbinom(sum(mate2_anc_b[,i] == 1),1,frq_pop1)
	# set population 0 to be all non-carriers
	mate1_hap_a[mate1_anc_a[,i] == 0,i] = rbinom(sum(mate1_anc_a[,i] == 0),1,frq_pop2)
	mate1_hap_b[mate1_anc_b[,i] == 0,i] = rbinom(sum(mate1_anc_b[,i] == 0),1,frq_pop2)
	mate2_hap_a[mate2_anc_a[,i] == 0,i] = rbinom(sum(mate2_anc_a[,i] == 0),1,frq_pop2)
	mate2_hap_b[mate2_anc_b[,i] == 0,i] = rbinom(sum(mate2_anc_b[,i] == 0),1,frq_pop2)
	}

	# --- merge the admixed and unadmixed samples and label them
	all_genos = rbind(child1_hap_a+child1_hap_b,child2_hap_a+child2_hap_b,mate1_hap_a+mate1_hap_b,mate2_hap_a+mate2_hap_b)
	all_anc = rbind(child1_anc_a+child1_anc_b,child2_anc_a+child2_anc_b,mate1_anc_a+mate1_anc_b,mate2_anc_a+mate2_anc_b)
	all_global_anc = apply( all_anc , 1 , sum ) / (2*M)
	type = c( rep("siba",N) , rep("sibb",N) , rep("pop1",N_unadmixed) , rep("pop0",N_unadmixed) )

	# --- generate heritable phenotype in the entire population with a random environment
	b_causal = rnorm(M_causal,0,1)
	all_y = scale( all_genos[,i_causal,drop=F] %% b_causal )sqrt(hsq) + rnorm(nrow(all_genos),0,sqrt(1-hsq))

	# Run the regression analysis
	# --- joint regression (ala Wang et al.)
	fam_mean = (all_global_anc[type=="siba"] + all_global_anc[type=="sibb"])/2
	# sibling regression
	reg_wf = summary(lm( (all_y[type=="siba"]) ~ (all_global_anc[type=="siba"]) + (fam_mean) ))
	# pop regression without admixture
	reg_pop = summary(lm(all_y[(type=="pop1" \| type=="pop0")] ~ all_global_anc[(type=="pop1" \| type=="pop0")] ))
	#est_wf[s] = reg$coef[2,1]
	#est_bf[s] = reg$coef[3,1]
	#est_pop[s] = reg$coef[2,1]

	cat( M_causal , "\t" , reg_pop$coef[2,1] , "\t(" , reg_pop$coef[2,2] , ")\t" , reg_wf$coef[2,1] , "\t(" , reg_wf$coef[2,2] , ")\t" , reg_wf$coef[3,1] , "\t(" , reg_wf$coef[3,2] , ")\n" , sep='' )

	# store all of the results for plotting
	if ( M_causal == 1 ) {
	all_1cv_est = c(all_1cv_est,reg_wf$coef[2,1])
	all_1cv_true = c(all_1cv_true,reg_pop$coef[2,1])
	} else if ( M_causal == 100 ) {
	all_100cv_est = c(all_100cv_est,reg_wf$coef[2,1])
	all_100cv_true = c(all_100cv_true,reg_pop$coef[2,1])
	}
	}
	}
	}

	# Visualize the results from the two architectures
	clr = c("#d53e4f","#fc8d59")
	plot( all_1cv_true , all_1cv_est , xlim=c(-2.5,2.5) , ylim=c(-2.5,2.5) , type="n" , las=1 , bty="n" , xlab="True population difference" , ylab="Estimated Within-Family Effect")
	abline(0,1,lty=2)
	points( all_1cv_true , all_1cv_est , pch=19 , col=paste(clr[1],"50",sep='') )
	points( all_1cv_true , all_1cv_est , col=clr[1] )
	reg1 = (lm(all_1cv_est~all_1cv_true))
	abline(reg1,col=clr[1],lty=3)
	points( all_100cv_true , all_100cv_est , pch=19 , col=paste(clr[2],"50",sep=''))
	points( all_100cv_true , all_100cv_est , col=clr[2])
	reg100 = (lm(all_100cv_est~all_100cv_true))
	abline(reg100,col=clr[2],lty=3)
	legend("topleft",pch=19,col=clr,legend=c(paste("1 causal (slope=",round(reg1$coef[2],2)," s.e. ",round(summary(reg1)$coef[2,2],2),")",sep=''),paste("100 causal (slope=",round(reg100$coef[2],2)," s.e. ",round(summary(reg100)$coef[2,2],2),")",sep='')),bty="n",cex=0.75)