Skip to content

Instantly share code, notes, and snippets.

@florianhartig
Last active March 12, 2025 15:53
Show Gist options
  • Save florianhartig/d44c86047637b1394291e6c981db5564 to your computer and use it in GitHub Desktop.
Save florianhartig/d44c86047637b1394291e6c981db5564 to your computer and use it in GitHub Desktop.
Neural posterior estimation — does the simulation distribution act as a prior distribution?
# Question of this simulation: does the distributional density of training
# data bias machine learning predictions in a way akin to a Bayesian
# prior?
#
# Changing nSim and nObs at the top of the script shows that the
# difference between the two priors in the exact Bayesian estimator
# is independent of the number of simulations (as you would expect);
# however, the bias in the ML methods depends on the number of
# simulations: the more simulations we do (from the same parameter
# distribution), the smaller the bias.

nSim <- 50000               # number of simulated parameter draws
nObs <- 50                  # observations per simulated data set
SimulationStochasticity <- 0.1  # sd of the observation noise

# 80/20 train/test split on the simulation index
train <- seq_len(nSim * 4 / 5)
test <- (max(train) + 1):nSim

# Two "priors" for the parameter (the mean): draws from N(0, 1) and N(2, 1)
simDistribution1 <- rnorm(nSim, mean = 0)
simDistribution2 <- rnorm(nSim, mean = 2)
# Simulate one data set: n i.i.d. draws from N(mean, sd).
# Defaults pull n and sd from the script-level settings (nObs,
# SimulationStochasticity), so the original single-argument call
# pattern generateSimulation(mean) still works unchanged.
generateSimulation <- function(mean, n = nObs, sd = SimulationStochasticity) {
  rnorm(n, mean = mean, sd = sd)
}
# One row per simulated parameter value: nObs noisy observations drawn
# around it. vapply pins the return shape (nObs numbers per draw), so
# the result is guaranteed to be an nObs x nSim matrix before transposing.
data1 <- data.frame(t(vapply(simDistribution1, generateSimulation, numeric(nObs))))
data2 <- data.frame(t(vapply(simDistribution2, generateSimulation, numeric(nObs))))
#######################################
# Simulation-based inference ----
# Random forest (ranger) regression of the parameter on a 1D summary
# statistic: the row sum of the observations (sufficient for the mean
# of a normal with known sd).
library(ranger)  # was commented out — the ranger() calls below need it
model1 <- ranger(x = data.frame(ss = rowSums(data1[train, ])), y = simDistribution1[train])
model2 <- ranger(x = data.frame(ss = rowSums(data2[train, ])), y = simDistribution2[train])
pred1 <- predict(model1, data = data.frame(ss = rowSums(data1[test, ])))
# NOTE: model2 is scored on data1's test rows (not data2's) — presumably
# intentional, so that any difference between pred1 and pred2 reflects
# the training ("prior") distribution rather than the test data; the lm
# and exact-Bayes sections below follow the same pattern.
pred2 <- predict(model2, data = data.frame(ss = rowSums(data1[test, ])))
#
## Linear model on a 1D sufficient statistic (sum of observations)
# Fit parameter ~ summary statistic under each prior, then evaluate
# both fitted models on the SAME test data (data1), mirroring the
# random-forest comparison above.
ssTestData <- data.frame(ss = rowSums(data1[test, ]))

summarized.data1 <- data.frame(ss = rowSums(data1[train, ]), y = simDistribution1[train])
lmod1 <- lm(y ~ ss, data = summarized.data1)
# summary(lmod1)
lpred1 <- predict(lmod1, newdata = ssTestData)

summarized.data2 <- data.frame(ss = rowSums(data2[train, ]), y = simDistribution2[train])
lmod2 <- lm(y ~ ss, data = summarized.data2)
# summary(lmod2)
lpred2 <- predict(lmod2, newdata = ssTestData)
#######################################
# Exact Bayesian inference
# Posterior mean of the mean parameter of a normal likelihood with
# known sd and a conjugate normal prior, see
# https://en.wikipedia.org/wiki/Conjugate_prior
#
# BUG FIX: the original computed 1/(1 + nObs) * (priorMean + sum(x)),
# which is the conjugate posterior mean only when the observation sd
# equals the prior sd (both = 1). The data are simulated with
# sd = SimulationStochasticity (0.1), so the likelihood precision must
# be n / obs_sd^2, not n. The general formula is
#   (prior_mean / prior_sd^2 + sum(x) / obs_sd^2) /
#   (1 / prior_sd^2 + n / obs_sd^2)
# Extra arguments have defaults so the original call getPosterior1(x)
# still works.
getPosterior1 <- function(x, prior_mean = 0, prior_sd = 1,
                          obs_sd = SimulationStochasticity) {
  x <- unlist(x)  # accept data.frame rows as well as plain vectors
  n <- length(x)
  prior_prec <- 1 / prior_sd^2
  lik_prec <- n / obs_sd^2
  (prior_mean * prior_prec + sum(x) / obs_sd^2) / (prior_prec + lik_prec)
}
# Exact posterior mean for each test data set (one per row of data1)
posteriorMean1 <- apply(data1[test, ], 1, getPosterior1)
# Same conjugate-normal posterior mean as getPosterior1, but with the
# second prior N(2, 1).
#
# BUG FIX (same as getPosterior1): the original 1/(1 + nObs) *
# (priorMean + sum(x)) implicitly assumed an observation sd of 1; the
# data use sd = SimulationStochasticity (0.1), so the likelihood
# precision must be n / obs_sd^2. Defaults keep getPosterior2(x) valid.
getPosterior2 <- function(x, prior_mean = 2, prior_sd = 1,
                          obs_sd = SimulationStochasticity) {
  x <- unlist(x)  # accept data.frame rows as well as plain vectors
  n <- length(x)
  prior_prec <- 1 / prior_sd^2
  lik_prec <- n / obs_sd^2
  (prior_mean * prior_prec + sum(x) / obs_sd^2) / (prior_prec + lik_prec)
}
# NOTE: evaluated on data1's test rows (not data2) — this matches the
# ML predictions above, which also score both priors on the same data.
posteriorMean2 <- apply(data1[test, ], 1, getPosterior2)
# NPE estimates ----
# Average point estimates under each "prior". Because all estimators
# are scored on the same test data (data1), differences between the
# *1 and *2 variants show how strongly the training distribution pulls
# the estimates.
#mean(pred1$predictions)
#mean(pred2$predictions) # slightly higher
# Linear-model NPE estimates (auto-printed when run at top level)
mean(lpred1)
mean(lpred2)
# Exact conjugate posterior means for comparison
mean(posteriorMean1)
mean(posteriorMean2)
# Linear NPE vs exact posterior: points on the 1:1 line mean the ML
# estimator reproduces the exact Bayesian answer.
#png(file='linNpeVsTrue.png')
plot(posteriorMean2, lpred2)
abline(0,1)
#dev.off()
# comparison of lm and RF
# Side-by-side predicted-vs-true plots for the linear model and the
# random forest (both trained under prior 1, scored on data1's test
# set). The black line is the 1:1 reference; the red line is the
# regression of predicted on true — its deviation from 1:1 visualizes
# shrinkage / prior-induced bias.
# NOTE(review): par(mfrow) is changed without saving/restoring the old
# settings — fine for a throwaway script, but later plots in the same
# session inherit the 1x2 layout.
par(mfrow = c(1,2))
plot(simDistribution1[test], lpred1, ylim = c(-3,3), xlim = c(-3,3), xlab = "True", ylab = "predicted", main = "lm, simstoch = 0.1")
abline(0,1)
abline(lm(lpred1 ~ simDistribution1[test]), col = "red")
plot(simDistribution1[test], pred1$predictions, ylim = c(-3,3), xlim = c(-3,3), xlab = "True", ylab = "predicted", main = "random Forest, simstoch = 0.1")
abline(0,1)
abline(lm(pred1$predictions ~ simDistribution1[test]), col = "red")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment