Neural posterior estimation: does the simulation distribution act as a prior distribution?
# Question of this simulation: does the distributional density of the training
# data bias machine learning predictions in a way akin to a Bayesian
# prior?
# Changing nSim and nObs at the beginning of the script shows that the
# difference between the two priors in the exact Bayesian estimator
# is independent of the number of simulations (as you would expect);
# however, the bias in the ML methods does depend on the number of simulations:
# the more simulations we do (from the same parameter distribution),
# the smaller the bias.
nSim = 50000 # number of simulated parameter values
nObs = 50    # number of observations per simulated dataset
SimulationStochasticity = 0.1 # observation sd within each simulation
train = 1:(nSim*4/5) # we take 4/5 of the data for training
test = (max(train) + 1):nSim
# creating two types of training data, one from a normal distribution
# with mean 0, sd = 1, and one from a normal distribution with mean 2, sd = 1
simDistribution1 = rnorm(nSim, mean = 0)
simDistribution2 = rnorm(nSim, mean = 2)
generateSimulation <- function(mean){
  rnorm(nObs, mean = mean, sd = SimulationStochasticity)
}
data1 = data.frame(t(sapply(simDistribution1, FUN = generateSimulation)))
data2 = data.frame(t(sapply(simDistribution2, FUN = generateSimulation)))
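# Added sanity check (a sketch, not part of the original analysis): plot the
# two simulated parameter distributions, which act as implicit priors for the
# ML estimators trained below
hist(simDistribution1, breaks = 50, xlim = c(-4, 6),
     main = "simulation priors", xlab = "parameter")
hist(simDistribution2, breaks = 50, add = TRUE, border = "red")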
#######################################
# Simulation-based inference with a random forest
library(ranger)
model1 = ranger(x = data.frame(ss = rowSums(data1[train, ])), y = simDistribution1[train])
model2 = ranger(x = data.frame(ss = rowSums(data2[train, ])), y = simDistribution2[train])
# both models predict the same test data (simulated under prior 1), so any
# difference between pred1 and pred2 reflects the training distribution only
pred1 = predict(model1, data = data.frame(ss = rowSums(data1[test, ])))
pred2 = predict(model2, data = data.frame(ss = rowSums(data1[test, ])))
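# Added check (sketch): a random forest predicts by averaging training
# responses, so its predictions cannot leave the range of the training
# parameters; this is one mechanism by which the training distribution
# acts like a prior
range(pred1$predictions)
range(simDistribution1[train])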
## Linear model on a 1D sufficient statistic (sum of observations)
summarized.data1 = data.frame(ss = rowSums(data1[train, ]), y = simDistribution1[train])
lmod1 = lm(y ~ ss, summarized.data1)
# summary(lmod1)
lpred1 = predict(lmod1, newdata = data.frame(ss = rowSums(data1[test, ])))
summarized.data2 = data.frame(ss = rowSums(data2[train, ]), y = simDistribution2[train])
lmod2 = lm(y ~ ss, summarized.data2)
# summary(lmod2)
# as above, lmod2 is evaluated on the test data simulated under prior 1
lpred2 = predict(lmod2, newdata = data.frame(ss = rowSums(data1[test, ])))
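# Added check (a sketch, assuming the normal-normal conjugate model of the
# next section, with prior sd 1 and observation sd SimulationStochasticity):
# the Bayes-optimal predictor E[theta | ss] is exactly linear in ss = sum(x),
# with slope 1 / (nObs + SimulationStochasticity^2), so the linear model is
# well-specified and its fitted slope should be close to the theoretical one
coef(lmod1)["ss"]
1 / (nObs + SimulationStochasticity^2)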
#######################################
# Exact Bayesian inference
# We're estimating the posterior mean of the mean parameter with fixed sd,
# using the normal-normal conjugate formula provided in
# https://en.wikipedia.org/wiki/Conjugate_prior
# Prior sd = 1 (the simulation distributions above), observation
# sd = SimulationStochasticity.
# x = data1[1, ] # example input for testing the functions below
getPosterior1 <- function(x){
  priorMean = 0
  x = unlist(x)
  (priorMean + sum(x) / SimulationStochasticity^2) /
    (1 + nObs / SimulationStochasticity^2)
}
posteriorMean1 = apply(data1[test, ], 1, FUN = getPosterior1)
getPosterior2 <- function(x){
  priorMean = 2
  x = unlist(x)
  (priorMean + sum(x) / SimulationStochasticity^2) /
    (1 + nObs / SimulationStochasticity^2)
}
# again evaluated on the test data simulated under prior 1
posteriorMean2 = apply(data1[test, ], 1, FUN = getPosterior2)
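# Added consistency check (sketch; ssTest is a helper introduced here): the
# same posterior mean written in shrinkage form, pulling the sufficient
# statistic toward the prior mean
ssTest = rowSums(data1[test, ])
all.equal(unname(posteriorMean2),
          unname(2 + (ssTest - nObs * 2) / (nObs + SimulationStochasticity^2)))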
# NPE estimates
# mean(pred1$predictions)
# mean(pred2$predictions) # slightly higher
mean(lpred1)         # lm-based NPE, trained with prior mean 0
mean(lpred2)         # lm-based NPE, trained with prior mean 2
mean(posteriorMean1) # exact posterior mean, prior mean 0
mean(posteriorMean2) # exact posterior mean, prior mean 2
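# Added summary (sketch): shift induced by changing the training/prior mean
# from 0 to 2; for the exact posterior this is
# 2 / (1 + nObs / SimulationStochasticity^2), independent of nSim
mean(lpred2) - mean(lpred1)
mean(posteriorMean2) - mean(posteriorMean1)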
# exact posterior mean vs lm-based NPE under prior mean 2
# png(file = 'linNpeVsTrue.png')
plot(posteriorMean2, lpred2)
abline(0, 1)
# dev.off()
# comparison of lm and RF
par(mfrow = c(1, 2))
plot(simDistribution1[test], lpred1, ylim = c(-3, 3), xlim = c(-3, 3),
     xlab = "true", ylab = "predicted", main = "lm, simstoch = 0.1")
abline(0, 1)
abline(lm(lpred1 ~ simDistribution1[test]), col = "red")
plot(simDistribution1[test], pred1$predictions, ylim = c(-3, 3), xlim = c(-3, 3),
     xlab = "true", ylab = "predicted", main = "random forest, simstoch = 0.1")
abline(0, 1)
abline(lm(pred1$predictions ~ simDistribution1[test]), col = "red")
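# Added comparison (sketch): out-of-sample RMSE against the true simulated
# parameters, quantifying the accuracy difference visible in the plots
sqrt(mean((lpred1 - simDistribution1[test])^2))
sqrt(mean((pred1$predictions - simDistribution1[test])^2))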