Bayes by Backprop in PyTorch (introduced in the paper "Weight Uncertainty in Neural Networks", Blundell et al., 2015)
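Each weight gets a Gaussian variational posterior q(w|theta) with a learned mean and scale, and training minimizes a Monte Carlo estimate of the variational free energy. For one of n_batches equally weighted minibatches D_i, the per-batch cost implemented by the criterion function below is, roughly,

\[
\mathcal{F}_i \;\approx\; \frac{1}{n_{\text{batches}}}\Bigl(\log q(\mathbf{w}\mid\theta) - \log p(\mathbf{w})\Bigr) \;-\; \log p(\mathcal{D}_i\mid\mathbf{w}),
\qquad \mathbf{w}\sim q(\mathbf{w}\mid\theta),
\]

averaged over n_samples weight draws and normalized by the batch size.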
# Drawn from https://gist.github.com/rocknrollnerd/c5af642cf217971d93f499e8f70fcb72 (in Theano)
# This is implemented in PyTorch
# Author : Anirudh Vemula

import torch
import torch.nn as nn
import numpy as np
from sklearn.datasets import fetch_openml              # fetch_mldata was removed from scikit-learn
from sklearn.model_selection import train_test_split   # replaces the removed sklearn.cross_validation
from sklearn import preprocessing

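
# Helper log-densities (elementwise; callers sum them): log_gaussian scores a value
# under a Gaussian with a fixed sigma (used for the prior and the likelihood), while
# log_gaussian_logsigma scores it under the variational posterior, whose scale is
# given as a log-sigma tensor.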
def log_gaussian(x, mu, sigma):
    return float(-0.5 * np.log(2 * np.pi) - np.log(np.abs(sigma))) - (x - mu)**2 / (2 * sigma**2)


def log_gaussian_logsigma(x, mu, logsigma):
    return float(-0.5 * np.log(2 * np.pi)) - logsigma - (x - mu)**2 / (2 * torch.exp(logsigma)**2)

class MLPLayer(nn.Module):
    def __init__(self, n_input, n_output, sigma_prior):
        super(MLPLayer, self).__init__()
        self.n_input = n_input
        self.n_output = n_output
        self.sigma_prior = sigma_prior
        # Variational posterior parameters: a mean and a log-sigma for every weight and bias
        self.W_mu = nn.Parameter(torch.Tensor(n_input, n_output).normal_(0, 0.01))
        self.W_logsigma = nn.Parameter(torch.Tensor(n_input, n_output).normal_(0, 0.01))
        self.b_mu = nn.Parameter(torch.Tensor(n_output).uniform_(-0.01, 0.01))
        self.b_logsigma = nn.Parameter(torch.Tensor(n_output).uniform_(-0.01, 0.01))
        self.lpw = 0
        self.lqw = 0

    def forward(self, X, infer=False):
        if infer:
            output = torch.mm(X, self.W_mu) + self.b_mu.expand(X.size()[0], self.n_output)
            return output
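        # Reparameterization trick: W = mu + softplus(logsigma) * eps, so the sampled
        # weights stay differentiable with respect to the posterior parameters.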
        epsilon_W, epsilon_b = self.get_random()
        W = self.W_mu + torch.log(1 + torch.exp(self.W_logsigma)) * epsilon_W
        b = self.b_mu + torch.log(1 + torch.exp(self.b_logsigma)) * epsilon_b
        output = torch.mm(X, W) + b.expand(X.size()[0], self.n_output)
        # Log prior p(w) and log variational posterior q(w|theta) of the sampled weights,
        # exposed so the loss can form the approximate KL term. Note that lqw evaluates
        # q with sigma = exp(logsigma) while the samples above use softplus(logsigma),
        # a mismatch carried over from the original gist.
        self.lpw = log_gaussian(W, 0, self.sigma_prior).sum() + log_gaussian(b, 0, self.sigma_prior).sum()
        self.lqw = log_gaussian_logsigma(W, self.W_mu, self.W_logsigma).sum() + log_gaussian_logsigma(b, self.b_mu, self.b_logsigma).sum()
        return output
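
    # Noise used for the weight samples. The original gist draws eps with
    # std = sigma_prior rather than the unit variance usually assumed by mu + sigma * eps.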
    def get_random(self):
        return (torch.Tensor(self.n_input, self.n_output).normal_(0, self.sigma_prior).cuda(),
                torch.Tensor(self.n_output).normal_(0, self.sigma_prior).cuda())

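
# A small Bayesian MLP for MNIST: two hidden layers of 200 units with ReLU and a
# softmax output over the 10 digit classes. Each layer keeps its own lpw/lqw terms.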
class MLP(nn.Module):
    def __init__(self, n_input, sigma_prior):
        super(MLP, self).__init__()
        self.l1 = MLPLayer(n_input, 200, sigma_prior)
        self.l1_relu = nn.ReLU()
        self.l2 = MLPLayer(200, 200, sigma_prior)
        self.l2_relu = nn.ReLU()
        self.l3 = MLPLayer(200, 10, sigma_prior)
        self.l3_softmax = nn.Softmax(dim=1)

    def forward(self, X, infer=False):
        output = self.l1_relu(self.l1(X, infer))
        output = self.l2_relu(self.l2(output, infer))
        output = self.l3_softmax(self.l3(output, infer))
        return output

    def get_lpw_lqw(self):
        lpw = self.l1.lpw + self.l2.lpw + self.l3.lpw
        lqw = self.l1.lqw + self.l2.lqw + self.l3.lqw
        return lpw, lqw

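
# Average the prior, posterior, and likelihood terms over n_samples Monte Carlo draws
# of the weights; each call to net(X) re-samples the weights and refreshes lpw/lqw.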
def forward_pass_samples(X, y):
    s_log_pw, s_log_qw, s_log_likelihood = 0., 0., 0.
    for _ in range(n_samples):
        output = net(X)
        sample_log_pw, sample_log_qw = net.get_lpw_lqw()
        sample_log_likelihood = log_gaussian(y, output, sigma_prior).sum()
        s_log_pw += sample_log_pw
        s_log_qw += sample_log_qw
        s_log_likelihood += sample_log_likelihood
    return s_log_pw / n_samples, s_log_qw / n_samples, s_log_likelihood / n_samples

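
# Minibatch free energy: the KL-like term (log q - log p) is weighted by 1/n_batches so
# that summing over all minibatches recovers the full-dataset objective, then the
# negative log-likelihood is added and everything is averaged per example.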
def criterion(l_pw, l_qw, l_likelihood):
    return ((1. / n_batches) * (l_qw - l_pw) - l_likelihood).sum() / float(batch_size)

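
# Load MNIST and work with a random sample of N examples.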
mnist = fetch_openml('mnist_784', version=1, as_frame=False)  # replaces the removed fetch_mldata('MNIST original')
N = 5000
data = np.float32(mnist.data) / 255.
idx = np.random.choice(data.shape[0], N)
data = data[idx]
target = mnist.target[idx].astype(np.int32).reshape(N, 1)  # fetch_openml returns the labels as strings

train_idx, test_idx = train_test_split(np.array(range(N)), test_size=0.05)
train_data, test_data = data[train_idx], data[test_idx]
train_target, test_target = target[train_idx], target[test_idx]
# One-hot encode the training labels (on scikit-learn < 1.2 use sparse=False instead)
train_target = np.float32(preprocessing.OneHotEncoder(sparse_output=False).fit_transform(train_target))

n_input = train_data.shape[1]
M = train_data.shape[0]
sigma_prior = float(np.exp(-3))
n_samples = 3
learning_rate = 0.001
n_epochs = 100

# Initialize network
net = MLP(n_input, sigma_prior)
net = net.cuda()

# building the objective
# remember, we're evaluating by samples
log_pw, log_qw, log_likelihood = 0., 0., 0.
batch_size = 100
n_batches = M / float(batch_size)
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
n_train_batches = int(train_data.shape[0] / float(batch_size))

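# Training loop: for each minibatch, estimate the free energy from n_samples weight
# draws, backpropagate, and take an Adam step; after each epoch, report test accuracy.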
for e in range(n_epochs):
    errs = []
    for b in range(n_train_batches):
        net.zero_grad()
        X = torch.Tensor(train_data[b * batch_size: (b + 1) * batch_size]).cuda()
        y = torch.Tensor(train_target[b * batch_size: (b + 1) * batch_size]).cuda()
        log_pw, log_qw, log_likelihood = forward_pass_samples(X, y)
        loss = criterion(log_pw, log_qw, log_likelihood)
        errs.append(loss.item())
        loss.backward()
        optimizer.step()
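
    # Evaluate on the held-out test set with the posterior mean weights (infer=True)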
    X = torch.Tensor(test_data).cuda()
    with torch.no_grad():
        pred = net(X, infer=True)
    _, out = torch.max(pred, 1)
    acc = np.count_nonzero(np.squeeze(out.data.cpu().numpy()) == np.int32(test_target.ravel())) / float(test_data.shape[0])
    print('epoch', e, 'loss', np.mean(errs), 'acc', acc)