import numpy as np


def layernorm_forward(x, gamma, beta, ln_param):
    """
    Forward pass for layer normalization.

    During both training and test time, the incoming data is normalized per
    data point, before being scaled and shifted by gamma and beta parameters
    identical to those of batch normalization. In contrast to batch
    normalization, the behavior of layer normalization is identical at train
    and test time, so we do not need to keep track of running averages of any
    sort.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - ln_param: Dictionary with the following keys:
      - eps: Constant for numeric stability

    Returns a tuple of:
    - out: of shape (N, D)
    - cache: Values needed in the backward pass (here a dict of intermediates)
    """
    out, cache = None, None
    eps = ln_param.get('eps', 1e-5)
    N, D = x.shape
    xT = x.T  # (D, N): transposing lets the batch-norm-style code below normalize over features
    ###########################################################################
    # TODO: Implement the training-time forward pass for layer norm.         #
    # Normalize the incoming data, and scale and shift the normalized data   #
    # using gamma and beta.                                                   #
    # HINT: this can be done by slightly modifying your training-time        #
    # implementation of batch normalization, and inserting a line or two of  #
    # well-placed code. In particular, can you think of any matrix           #
    # transformations you could perform, that would enable you to copy over  #
    # the batch norm code and leave it almost unchanged?                     #
    ###########################################################################
    feature_mean = np.mean(xT, axis=0)  # per-example mean over features, shape (N,)
    feature_var = np.var(xT, axis=0)    # per-example variance over features, shape (N,)
    scaled_xT = xT - feature_mean                            # (D, N)
    normalized_xT = scaled_xT / np.sqrt(feature_var + eps)   # (D, N)
    normalized_x = normalized_xT.T                           # (N, D)
    scaled_x = scaled_xT.T                                   # (N, D)
    out = gamma * normalized_x + beta                        # (N, D)
    cache = {
        'scaled_x': scaled_x,                        # (N, D)
        'normalized_x': normalized_x,                # (N, D)
        'gamma': gamma,                              # (D,)
        'ivar': 1. / np.sqrt(feature_var + eps),     # (N,)
        'sqrtvar': np.sqrt(feature_var + eps)        # (N,)
    }
    ###########################################################################
    #                             END OF YOUR CODE                           #
    ###########################################################################
    return out, cache
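

# A minimal sanity check for the forward pass, included as a sketch: the
# shapes, random seed, and the __main__ guard below are illustrative additions,
# not part of the original assignment code. With gamma = 1 and beta = 0, every
# row of the output should have roughly zero mean and unit variance.
if __name__ == '__main__':
    np.random.seed(0)
    x_demo = np.random.randn(4, 5)
    out_demo, _ = layernorm_forward(x_demo, np.ones(5), np.zeros(5), {'eps': 1e-5})
    print('per-row means:', out_demo.mean(axis=1))  # ~ 0
    print('per-row stds: ', out_demo.std(axis=1))   # ~ 1
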

def layernorm_backward(dout, cache):
    """
    Backward pass for layer normalization.

    For this implementation, you can heavily rely on the work you've done
    already for batch normalization.

    Inputs:
    - dout: Upstream derivatives, of shape (N, D)
    - cache: Variable of intermediates from layernorm_forward.

    Returns a tuple of:
    - dx: Gradient with respect to inputs x, of shape (N, D)
    - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
    - dbeta: Gradient with respect to shift parameter beta, of shape (D,)
    """
    dx, dgamma, dbeta = None, None, None
    ###########################################################################
    # TODO: Implement the backward pass for layer norm.                      #
    #                                                                        #
    # HINT: this can be done by slightly modifying your training-time        #
    # implementation of batch normalization. The hints to the forward pass   #
    # still apply!                                                           #
    ###########################################################################
    N, D = dout.shape
    normalized_x = cache.get('normalized_x')  # (N, D)
    gamma = cache.get('gamma')                # (D,)
    ivar = cache.get('ivar')                  # (N,)
    scaled_x = cache.get('scaled_x')          # (N, D)

    # Backprop dout into dbeta and dgamma; both reduce over the batch axis.
    dbeta = np.sum(dout, axis=0)
    dgamma = np.sum(dout * normalized_x, axis=0)

    # For dx, apply the batch-norm backward formula to transposed data.
    # Unlike in batch norm, gamma varies along the axis being summed over
    # (the feature axis), so fold it into the upstream gradient first.
    dxhatT = (dout * gamma).T                 # (D, N)
    dxT = (1. / D) * ivar * (D * dxhatT
                             - np.sum(dxhatT, axis=0)
                             - scaled_x.T * np.square(ivar) * np.sum(dxhatT * scaled_x.T, axis=0))
    dx = dxT.T                                # (N, D)
    ###########################################################################
    #                             END OF YOUR CODE                           #
    ###########################################################################
    return dx, dgamma, dbeta
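

# A rough numerical gradient check for the backward pass, included as a sketch:
# the shapes, seed, step size h, and the __main__ guard are illustrative
# additions, not part of the original assignment code. The analytic dx from
# layernorm_backward should agree with centered finite differences of the
# forward pass to several decimal places.
if __name__ == '__main__':
    np.random.seed(1)
    x_chk = np.random.randn(3, 4)
    gamma_chk = np.random.randn(4)
    beta_chk = np.random.randn(4)
    dout_chk = np.random.randn(3, 4)

    _, cache_chk = layernorm_forward(x_chk, gamma_chk, beta_chk, {'eps': 1e-5})
    dx_chk, _, _ = layernorm_backward(dout_chk, cache_chk)

    # Centered finite differences with respect to x.
    h = 1e-5
    dx_num = np.zeros_like(x_chk)
    it = np.nditer(x_chk, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        old_val = x_chk[idx]
        x_chk[idx] = old_val + h
        out_pos, _ = layernorm_forward(x_chk, gamma_chk, beta_chk, {'eps': 1e-5})
        x_chk[idx] = old_val - h
        out_neg, _ = layernorm_forward(x_chk, gamma_chk, beta_chk, {'eps': 1e-5})
        x_chk[idx] = old_val
        dx_num[idx] = np.sum((out_pos - out_neg) * dout_chk) / (2 * h)
        it.iternext()

    print('max |dx - dx_num|:', np.max(np.abs(dx_chk - dx_num)))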