--[[
This file implements Batch Normalization as described in the paper:
"Batch Normalization: Accelerating Deep Network Training
by Reducing Internal Covariate Shift"
by Sergey Ioffe, Christian Szegedy
This implementation is useful for inputs coming from volumetric (3D)
convolution layers. For non-convolutional layers, see BatchNormalization.lua
The operation implemented is:

           x - mean(x)
   y = --------------------- * gamma + beta
       standard-deviation(x)

where gamma and beta are learnable parameters.
The learning of gamma and beta is optional.
Usage:
with learnable parameters: nn.VolumetricBatchNormalization(N [,eps] [,momentum])
where N = number of input feature planes (dimension 2 of the 5D input)
without learnable parameters: nn.VolumetricBatchNormalization(N, eps, momentum, false)
eps is a small value added to the variance (before the square root) to avoid
divide-by-zero. Defaults to 1e-5.
During training, this layer keeps a running estimate of its computed mean and std.
The running sums are kept with a default momentum of 0.1 (unless overridden).
At test time, this running mean/std is used to normalize.
]]--
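-- A minimal usage sketch (illustrative only; the tensor sizes below are
-- assumptions, not from the original file): the layer expects a 5D input of
-- size nBatch x nFeature x time x height x width and normalizes each feature
-- plane over the mini-batch and over all locations.
--
--   require 'nn'
--   local bn = nn.VolumetricBatchNormalization(16)  -- 16 feature planes
--   local x = torch.randn(4, 16, 8, 32, 32)         -- B x C x T x H x W
--   local y = bn:forward(x)                         -- same size as x
--   bn:evaluate()                                   -- sets bn.train = false
--   local yTest = bn:forward(x)                     -- uses running mean/std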
local BN,parent = torch.class('nn.VolumetricBatchNormalization', 'nn.Module')
function BN:__init(nFeature, eps, momentum, affine)
   parent.__init(self)
   assert(nFeature and type(nFeature) == 'number',
          'Missing argument #1: Number of feature planes.')
   assert(nFeature ~= 0, 'To set affine=false call VolumetricBatchNormalization'
          .. '(nFeature, eps, momentum, false)')
   if affine ~= nil then
      assert(type(affine) == 'boolean', 'affine has to be true/false')
      self.affine = affine
   else
      self.affine = true
   end
   self.eps = eps or 1e-5
   self.train = true
   self.momentum = momentum or 0.1
   self.running_mean = torch.zeros(nFeature)
   self.running_std = torch.ones(nFeature)
   if self.affine then
      self.weight = torch.Tensor(nFeature)
      self.bias = torch.Tensor(nFeature)
      self.gradWeight = torch.Tensor(nFeature)
      self.gradBias = torch.Tensor(nFeature)
      self:reset()
   end
end
function BN:reset()
   -- gamma is initialized uniformly in [0, 1), beta to zero
   self.weight:uniform()
   self.bias:zero()
end
function BN:updateOutput(input)
   assert(input:dim() == 5, 'only mini-batch supported (5D tensor), got '
          .. input:dim() .. 'D tensor instead')
   local nBatch = input:size(1)
   local nFeature = input:size(2)
   local iT = input:size(3)
   local iH = input:size(4)
   local iW = input:size(5)
   -- buffers that are reused across calls
   self.buffer = self.buffer or input.new()
   self.buffer2 = self.buffer2 or input.new()
   self.centered = self.centered or input.new()
   self.centered:resizeAs(input)
   self.std = self.std or input.new()
   self.normalized = self.normalized or input.new()
   self.normalized:resizeAs(input)
   self.output:resizeAs(input)
   self.gradInput:resizeAs(input)
   if self.train == false then -- inference: normalize with the running statistics
      self.output:copy(input)
      self.buffer:repeatTensor(self.running_mean:view(1, nFeature, 1, 1, 1),
                               nBatch, 1, iT, iH, iW)
      self.output:add(-1, self.buffer)   -- x - running mean
      self.buffer:repeatTensor(self.running_std:view(1, nFeature, 1, 1, 1),
                               nBatch, 1, iT, iH, iW)
      self.output:cmul(self.buffer)      -- multiply by the running (inverted) std
   else -- training mode
      -- calculate the mean per feature plane, over the mini-batch and
      -- over all temporal/spatial locations
      local in_folded = input:view(nBatch, nFeature, iT*iH*iW)
      self.buffer:mean(in_folded, 1)
      self.buffer2:mean(self.buffer, 3)
      -- update running mean: (1 - momentum) * running + momentum * batch mean
      self.running_mean:mul(1 - self.momentum):add(self.momentum, self.buffer2)
      self.buffer:repeatTensor(self.buffer2:view(1, nFeature, 1, 1, 1),
                               nBatch, 1, iT, iH, iW)
      -- subtract mean
      self.centered:add(input, -1, self.buffer)            -- x - E(x)
      -- calculate standard deviation over mini-batch
      self.buffer:copy(self.centered):cmul(self.buffer)    -- [x - E(x)]^2
      local buf_folded = self.buffer:view(nBatch, nFeature, iT*iH*iW)
      self.std:mean(self.buffer2:mean(buf_folded, 1), 3)   -- variance per feature plane
      self.std:add(self.eps):sqrt():pow(-1)                -- 1 / sqrt(var + eps)
      -- update running (inverted) std with the same momentum rule
      self.running_std:mul(1 - self.momentum):add(self.momentum, self.std)
      self.buffer:repeatTensor(self.std:view(1, nFeature, 1, 1, 1),
                               nBatch, 1, iT, iH, iW)
      -- divide by the standard deviation (multiply by the inverted std)
      self.output:cmul(self.centered, self.buffer)
      self.normalized:copy(self.output)   -- cache x-hat for accGradParameters
   end
   if self.affine then
      -- multiply with gamma and add beta
      self.buffer:repeatTensor(self.weight:view(1, nFeature, 1, 1, 1),
                               nBatch, 1, iT, iH, iW)
      self.output:cmul(self.buffer)
      self.buffer:repeatTensor(self.bias:view(1, nFeature, 1, 1, 1),
                               nBatch, 1, iT, iH, iW)
      self.output:add(self.buffer)
   end
   return self.output
end
function BN:updateGradInput(input, gradOutput)
   assert(input:dim() == 5, 'only mini-batch supported')
   assert(gradOutput:dim() == 5, 'only mini-batch supported')
   assert(self.train == true, 'updateGradInput only works in training mode '
          .. '(it reuses buffers from the training forward pass)')
   local nBatch = input:size(1)
   local nFeature = input:size(2)
   local iT = input:size(3)
   local iH = input:size(4)
   local iW = input:size(5)
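   -- A sketch of what follows (the standard batch-norm backward pass; the
   -- notation here is an explanatory assumption, not from the original):
   --   dL/dx = gamma * istd * ( dy - E[dy] - (x - mu) * istd^2 * E[(x - mu) * dy] )
   -- with istd = 1 / sqrt(var + eps) (stored in self.std), mu the batch mean,
   -- dy = gradOutput, and E[.] the mean over the batch and all locations.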
   self.gradInput:cmul(self.centered, gradOutput)
   local gi_folded = self.gradInput:view(nBatch, nFeature, iT*iH*iW)
   self.buffer2:mean(self.buffer:mean(gi_folded, 1), 3)   -- E[(x - mu) * dy]
   self.gradInput:repeatTensor(self.buffer2:view(1, nFeature, 1, 1, 1),
                               nBatch, 1, iT, iH, iW)
   self.gradInput:cmul(self.centered):mul(-1)             -- -(x - mu) * E[(x - mu) * dy]
   self.buffer:repeatTensor(self.std:view(1, nFeature, 1, 1, 1),
                            nBatch, 1, iT, iH, iW)
   self.gradInput:cmul(self.buffer):cmul(self.buffer)     -- * istd^2
   self.buffer:mean(gradOutput:view(nBatch, nFeature, iT*iH*iW), 1)
   self.buffer2:mean(self.buffer, 3)                      -- E[dy]
   self.buffer:repeatTensor(self.buffer2:view(1, nFeature, 1, 1, 1),
                            nBatch, 1, iT, iH, iW)
   self.gradInput:add(gradOutput):add(-1, self.buffer)    -- + dy - E[dy]
   self.buffer:repeatTensor(self.std:view(1, nFeature, 1, 1, 1),
                            nBatch, 1, iT, iH, iW)
   self.gradInput:cmul(self.buffer)                       -- * istd
   if self.affine then
      self.buffer:repeatTensor(self.weight:view(1, nFeature, 1, 1, 1),
                               nBatch, 1, iT, iH, iW)
      self.gradInput:cmul(self.buffer)                    -- * gamma
   end
   return self.gradInput
end
function BN:accGradParameters(input, gradOutput, scale)
   if self.affine then
      scale = scale or 1.0
      local nBatch = input:size(1)
      local nFeature = input:size(2)
      local iT = input:size(3)
      local iH = input:size(4)
      local iW = input:size(5)
      -- gradWeight += scale * sum(x-hat * gradOutput)
      self.buffer2:resizeAs(self.normalized):copy(self.normalized)
      self.buffer2 = self.buffer2:cmul(gradOutput):view(nBatch, nFeature, iT*iH*iW)
      self.buffer:sum(self.buffer2, 1)   -- sum over mini-batch
      self.buffer2:sum(self.buffer, 3)   -- sum over voxels
      self.gradWeight:add(scale, self.buffer2)
      -- gradBias += scale * sum(gradOutput)
      self.buffer:sum(gradOutput:view(nBatch, nFeature, iT*iH*iW), 1)
      self.buffer2:sum(self.buffer, 3)
      self.gradBias:add(scale, self.buffer2)
   end
end
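
-- A hedged self-test sketch (not part of the original file): nn ships a
-- Jacobian tester that compares a module's analytic gradInput against finite
-- differences. The tensor sizes here are illustrative assumptions.
--
--   require 'nn'
--   local module = nn.VolumetricBatchNormalization(3)
--   local input = torch.randn(2, 3, 4, 5, 5)
--   local err = nn.Jacobian.testJacobian(module, input)
--   print('max gradInput error: ' .. err)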