import numpy as np


def initialize_adam(parameters):
    """
    Initializes v and s as two python dictionaries with:
        - keys: "dW1", "db1", ..., "dWL", "dbL"
        - values: numpy arrays of zeros of the same shape as the corresponding gradients/parameters.

    Arguments:
    parameters -- python dictionary containing your parameters.
                  parameters["W" + str(l)] = Wl
                  parameters["b" + str(l)] = bl

    Returns:
    v -- python dictionary that will contain the exponentially weighted average of the gradient.
         v["dW" + str(l)] = ...
         v["db" + str(l)] = ...
    s -- python dictionary that will contain the exponentially weighted average of the squared gradient.
         s["dW" + str(l)] = ...
         s["db" + str(l)] = ...
    """
    L = len(parameters) // 2  # number of layers in the neural network
    v = {}
    s = {}

    # Initialize v, s. Input: "parameters". Outputs: "v, s".
    for l in range(L):
        v["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        v["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)
        s["dW" + str(l + 1)] = np.zeros(parameters["W" + str(l + 1)].shape)
        s["db" + str(l + 1)] = np.zeros(parameters["b" + str(l + 1)].shape)

    return v, s
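
# Usage sketch for initialize_adam (illustrative only; the layer shapes below
# are made up and are not part of the original gist):
#
#     parameters = {"W1": np.random.randn(5, 3), "b1": np.zeros((5, 1))}
#     v, s = initialize_adam(parameters)
#     v["dW1"].shape  # (5, 3) -- zeros matching W1
#     s["db1"].shape  # (5, 1) -- zeros matching b1
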
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate=0.01,
                                beta1=0.9, beta2=0.999, epsilon=1e-8):
    """
    Update parameters using Adam

    Arguments:
    parameters -- python dictionary containing your parameters:
                  parameters['W' + str(l)] = Wl
                  parameters['b' + str(l)] = bl
    grads -- python dictionary containing your gradients for each parameter:
             grads['dW' + str(l)] = dWl
             grads['db' + str(l)] = dbl
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    t -- current time step (number of Adam updates performed so far), used for bias correction
    learning_rate -- the learning rate, scalar
    beta1 -- exponential decay hyperparameter for the first moment estimates
    beta2 -- exponential decay hyperparameter for the second moment estimates
    epsilon -- hyperparameter preventing division by zero in Adam updates

    Returns:
    parameters -- python dictionary containing your updated parameters
    v -- Adam variable, moving average of the first gradient, python dictionary
    s -- Adam variable, moving average of the squared gradient, python dictionary
    """
    L = len(parameters) // 2  # number of layers in the neural network
    v_corrected = {}          # bias-corrected first moment estimates
    s_corrected = {}          # bias-corrected second moment estimates

    # Perform Adam update on all parameters
    for l in range(L):
        # Moving average of the gradients. Inputs: "v, grads, beta1". Output: "v".
        v["dW" + str(l + 1)] = beta1 * v["dW" + str(l + 1)] + (1 - beta1) * grads["dW" + str(l + 1)]
        v["db" + str(l + 1)] = beta1 * v["db" + str(l + 1)] + (1 - beta1) * grads["db" + str(l + 1)]

        # Compute bias-corrected first moment estimate. Inputs: "v, beta1, t". Output: "v_corrected".
        v_corrected["dW" + str(l + 1)] = v["dW" + str(l + 1)] / (1 - np.power(beta1, t))
        v_corrected["db" + str(l + 1)] = v["db" + str(l + 1)] / (1 - np.power(beta1, t))

        # Moving average of the squared gradients. Inputs: "s, grads, beta2". Output: "s".
        s["dW" + str(l + 1)] = beta2 * s["dW" + str(l + 1)] + (1 - beta2) * np.power(grads["dW" + str(l + 1)], 2)
        s["db" + str(l + 1)] = beta2 * s["db" + str(l + 1)] + (1 - beta2) * np.power(grads["db" + str(l + 1)], 2)

        # Compute bias-corrected second raw moment estimate. Inputs: "s, beta2, t". Output: "s_corrected".
        s_corrected["dW" + str(l + 1)] = s["dW" + str(l + 1)] / (1 - np.power(beta2, t))
        s_corrected["db" + str(l + 1)] = s["db" + str(l + 1)] / (1 - np.power(beta2, t))

        # Update parameters. Inputs: "parameters, learning_rate, v_corrected, s_corrected, epsilon". Output: "parameters".
        parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * (
            v_corrected["dW" + str(l + 1)] / (np.sqrt(s_corrected["dW" + str(l + 1)]) + epsilon))
        parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * (
            v_corrected["db" + str(l + 1)] / (np.sqrt(s_corrected["db" + str(l + 1)]) + epsilon))

    return parameters, v, s
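
# Minimal end-to-end sketch of a single Adam step. The random gradients below
# are placeholders standing in for real backprop output, and the layer sizes
# are hypothetical -- none of this demo is part of the original gist.
if __name__ == "__main__":
    np.random.seed(1)
    parameters = {"W1": np.random.randn(3, 2), "b1": np.zeros((3, 1))}
    grads = {"dW1": np.random.randn(3, 2), "db1": np.random.randn(3, 1)}

    v, s = initialize_adam(parameters)

    # t counts Adam updates and must start at 1: at t = 0 the bias-correction
    # denominators (1 - beta1**t) and (1 - beta2**t) would be zero.
    parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t=1)
    print(parameters["W1"])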