Skip to content

Instantly share code, notes, and snippets.

@florianhartig
Last active March 12, 2025 15:53
Show Gist options
  • Save florianhartig/d44c86047637b1394291e6c981db5564 to your computer and use it in GitHub Desktop.
Save florianhartig/d44c86047637b1394291e6c981db5564 to your computer and use it in GitHub Desktop.
Neural posterior estimation — does the simulation distribution act as a prior distribution?
# Question of this simulation: does the distributional density of training
# data bias machine learning predictions in a way akin to a Bayesian
# prior?
#
# Changing nSim and nObs at the top of the script shows that the
# difference between the two priors in the exact Bayesian estimator
# is independent of the number of simulations (as you would expect);
# however, the bias in the ML methods depends on the number of
# simulations: the more simulations we do (from the same parameter
# distribution), the smaller the bias.

nSim <- 50000               # number of simulated parameter draws
nObs <- 50                  # observations per simulated data set
SimulationStochasticity <- 0.1  # sd of the observation noise

# 80/20 train/test split on the simulation index
train <- seq_len(nSim * 4 / 5)
test <- (max(train) + 1):nSim

# Two "priors" for the parameter (the mean): draws from N(0, 1) and N(2, 1)
simDistribution1 <- rnorm(nSim, mean = 0)
simDistribution2 <- rnorm(nSim, mean = 2)
# Simulate one data set: n i.i.d. draws from N(mean, sd).
# Defaults pull n and sd from the script-level settings (nObs,
# SimulationStochasticity), so the original single-argument call
# pattern generateSimulation(mean) still works unchanged.
generateSimulation <- function(mean, n = nObs, sd = SimulationStochasticity) {
  rnorm(n, mean = mean, sd = sd)
}
# One row per simulated parameter value: nObs noisy observations drawn
# around it. vapply pins the return shape (nObs numbers per draw), so
# the result is guaranteed to be an nObs x nSim matrix before transposing.
data1 <- data.frame(t(vapply(simDistribution1, generateSimulation, numeric(nObs))))
data2 <- data.frame(t(vapply(simDistribution2, generateSimulation, numeric(nObs))))
#######################################
# Simulation-based inference ----
# Random forest (ranger) regression of the parameter on a 1D summary
# statistic: the row sum of the observations (sufficient for the mean
# of a normal with known sd).
library(ranger)  # was commented out — the ranger() calls below need it
model1 <- ranger(x = data.frame(ss = rowSums(data1[train, ])), y = simDistribution1[train])
model2 <- ranger(x = data.frame(ss = rowSums(data2[train, ])), y = simDistribution2[train])
pred1 <- predict(model1, data = data.frame(ss = rowSums(data1[test, ])))
# NOTE: model2 is scored on data1's test rows (not data2's) — presumably
# intentional, so that any difference between pred1 and pred2 reflects
# the training ("prior") distribution rather than the test data; the lm
# and exact-Bayes sections below follow the same pattern.
pred2 <- predict(model2, data = data.frame(ss = rowSums(data1[test, ])))
#
## Linear model on a 1D sufficient statistic (sum of observations)
# Fit parameter ~ summary statistic under each prior, then evaluate
# both fitted models on the SAME test data (data1), mirroring the
# random-forest comparison above.
ssTestData <- data.frame(ss = rowSums(data1[test, ]))

summarized.data1 <- data.frame(ss = rowSums(data1[train, ]), y = simDistribution1[train])
lmod1 <- lm(y ~ ss, data = summarized.data1)
# summary(lmod1)
lpred1 <- predict(lmod1, newdata = ssTestData)

summarized.data2 <- data.frame(ss = rowSums(data2[train, ]), y = simDistribution2[train])
lmod2 <- lm(y ~ ss, data = summarized.data2)
# summary(lmod2)
lpred2 <- predict(lmod2, newdata = ssTestData)
#######################################
# Exact Bayesian inference
# Posterior mean of the mean parameter of a normal likelihood with
# known sd and a conjugate normal prior, see
# https://en.wikipedia.org/wiki/Conjugate_prior
#
# BUG FIX: the original computed 1/(1 + nObs) * (priorMean + sum(x)),
# which is the conjugate posterior mean only when the observation sd
# equals the prior sd (both = 1). The data are simulated with
# sd = SimulationStochasticity (0.1), so the likelihood precision must
# be n / obs_sd^2, not n. The general formula is
#   (prior_mean / prior_sd^2 + sum(x) / obs_sd^2) /
#   (1 / prior_sd^2 + n / obs_sd^2)
# Extra arguments have defaults so the original call getPosterior1(x)
# still works.
getPosterior1 <- function(x, prior_mean = 0, prior_sd = 1,
                          obs_sd = SimulationStochasticity) {
  x <- unlist(x)  # accept data.frame rows as well as plain vectors
  n <- length(x)
  prior_prec <- 1 / prior_sd^2
  lik_prec <- n / obs_sd^2
  (prior_mean * prior_prec + sum(x) / obs_sd^2) / (prior_prec + lik_prec)
}
# Exact posterior mean for each test data set (one per row of data1)
posteriorMean1 <- apply(data1[test, ], 1, getPosterior1)
# Same conjugate-normal posterior mean as getPosterior1, but with the
# second prior N(2, 1).
#
# BUG FIX (same as getPosterior1): the original 1/(1 + nObs) *
# (priorMean + sum(x)) implicitly assumed an observation sd of 1; the
# data use sd = SimulationStochasticity (0.1), so the likelihood
# precision must be n / obs_sd^2. Defaults keep getPosterior2(x) valid.
getPosterior2 <- function(x, prior_mean = 2, prior_sd = 1,
                          obs_sd = SimulationStochasticity) {
  x <- unlist(x)  # accept data.frame rows as well as plain vectors
  n <- length(x)
  prior_prec <- 1 / prior_sd^2
  lik_prec <- n / obs_sd^2
  (prior_mean * prior_prec + sum(x) / obs_sd^2) / (prior_prec + lik_prec)
}
# NOTE: evaluated on data1's test rows (not data2) — this matches the
# ML predictions above, which also score both priors on the same data.
posteriorMean2 <- apply(data1[test, ], 1, getPosterior2)
# NPE estimates ----
# Average point estimates under each "prior". Because all estimators
# are scored on the same test data (data1), differences between the
# *1 and *2 variants show how strongly the training distribution pulls
# the estimates.
#mean(pred1$predictions)
#mean(pred2$predictions) # slightly higher
# Linear-model NPE estimates (auto-printed when run at top level)
mean(lpred1)
mean(lpred2)
# Exact conjugate posterior means for comparison
mean(posteriorMean1)
mean(posteriorMean2)
# Linear NPE vs exact posterior: points on the 1:1 line mean the ML
# estimator reproduces the exact Bayesian answer.
#png(file='linNpeVsTrue.png')
plot(posteriorMean2, lpred2)
abline(0,1)
#dev.off()
# comparison of lm and RF
# Side-by-side predicted-vs-true plots for the linear model and the
# random forest (both trained under prior 1, scored on data1's test
# set). The black line is the 1:1 reference; the red line is the
# regression of predicted on true — its deviation from 1:1 visualizes
# shrinkage / prior-induced bias.
# NOTE(review): par(mfrow) is changed without saving/restoring the old
# settings — fine for a throwaway script, but later plots in the same
# session inherit the 1x2 layout.
par(mfrow = c(1,2))
plot(simDistribution1[test], lpred1, ylim = c(-3,3), xlim = c(-3,3), xlab = "True", ylab = "predicted", main = "lm, simstoch = 0.1")
abline(0,1)
abline(lm(lpred1 ~ simDistribution1[test]), col = "red")
plot(simDistribution1[test], pred1$predictions, ylim = c(-3,3), xlim = c(-3,3), xlab = "True", ylab = "predicted", main = "random Forest, simstoch = 0.1")
abline(0,1)
abline(lm(pred1$predictions ~ simDistribution1[test]), col = "red")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment