\documentclass[8pt]{extarticle} % 8pt requires the extsizes classes; standard article only supports 10-12pt
\usepackage[usenames]{color} % used for font color
\usepackage{amssymb} % maths
\usepackage{amsmath} % maths
%\usepackage[utf8]{inputenc} % useful for typing diacritic characters directly
\begin{document}
\begin{align*}
\text{Vanilla SGD} \\
w_{t+1} &= w_t - \alpha \frac{\partial L}{\partial w_t} \\
\text{Momentum} \\
w_{t+1} &= w_t - \alpha m_t \\
m_t &= \beta m_{t-1} + (1 - \beta) \frac{\partial L}{\partial w_t} \\
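% Unrolling the recursion above with m_0 = 0 gives
%   m_t = (1 - \beta) \sum_{i=1}^{t} \beta^{t-i} \frac{\partial L}{\partial w_i},
% i.e. the momentum term is an exponential moving average of past gradients.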
\text{Adagrad} \\
w_{t+1} &= w_t - \frac{\alpha}{\sqrt{v_t + \epsilon}} \cdot \frac{\partial L}{\partial w_t} \\
v_t &= v_{t-1} + \bigg[\frac{\partial L}{\partial w_t}\bigg]^{2} \\
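% With v_0 = 0 the recursion gives v_t = \sum_{i=1}^{t} [\partial L / \partial w_i]^2
% (element-wise), so the effective step size \alpha / \sqrt{v_t + \epsilon}
% is non-increasing over the course of training.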
\text{RMSprop} \\
w_{t+1} &= w_t - \frac{\alpha}{\sqrt{v_t + \epsilon}} \cdot \frac{\partial L}{\partial w_t} \\
v_t &= \beta v_{t-1} + (1 - \beta) \bigg[\frac{\partial L}{\partial w_t}\bigg]^{2} \\
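% Same update rule as Adagrad, but v_t is an exponential moving average of squared
% gradients rather than a running sum, so the effective step size can recover
% after a stretch of large gradients.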
\text{Adadelta} \\
w_{t+1} &= w_t - \frac{\sqrt{D_{t-1} + \epsilon}}{\sqrt{v_t + \epsilon}} \cdot \frac{\partial L}{\partial w_t} \\
D_t &= \beta D_{t-1} + (1-\beta) [\Delta w_t]^2 \\
v_t &= \beta v_{t-1} + (1-\beta) \bigg[\frac{\partial L}{\partial w_t}\bigg]^{2} \\
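% Here \Delta w_t denotes the update applied to the weights at step t,
% i.e. \Delta w_t = w_{t+1} - w_t, so D_t tracks the squared size of recent updates.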
\text{Nesterov} \\
w_{t+1} &= w_t - \alpha m_t \\
m_t &= \beta m_{t-1} + (1 - \beta) \frac{\partial L}{\partial w^*} \\
w^* &= w_t - \alpha m_{t-1} \\
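% The gradient is evaluated at the lookahead point w^* = w_t - \alpha m_{t-1}
% rather than at w_t; with m_{t-1} = 0 this reduces to the Momentum update above.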
\text{Adam} \\
w_{t+1} &= w_t - \frac{\alpha}{\sqrt{\hat{v}_t} + \epsilon} \cdot \hat{m}_t \\
\hat{m}_t &= \frac{m_t}{1-\beta_1^t} \\
\hat{v}_t &= \frac{v_t}{1-\beta_2^t} \\
m_t &= \beta_1 m_{t-1} + (1 - \beta_1) \frac{\partial L}{\partial w_t} \\
v_t &= \beta_2 v_{t-1} + (1 - \beta_2) \bigg[\frac{\partial L}{\partial w_t}\bigg]^{2} \\
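% Worked first step (t = 1, with m_0 = v_0 = 0):
%   m_1 = (1 - \beta_1) \frac{\partial L}{\partial w_1}, so \hat{m}_1 = m_1 / (1 - \beta_1) = \frac{\partial L}{\partial w_1},
% and likewise \hat{v}_1 = [\partial L / \partial w_1]^2: the bias correction
% exactly undoes the zero initialisation of the moment estimates.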
\text{AdaMax} \\
w_{t+1} &= w_t - \frac{\alpha}{v_t} \cdot \hat{m}_t \\
\hat{m}_t &= \frac{m_t}{1-\beta_1^t} \\
m_t &= \beta_1 m_{t-1} + (1 - \beta_1) \frac{\partial L}{\partial w_t} \\
v_t &= \max\bigg(\beta_2 v_{t-1}, \bigg|\frac{\partial L}{\partial w_t}\bigg|\bigg) \\
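% v_t here is the infinity-norm accumulator (written u_t in the Adam paper);
% as in the original formulation it is used without bias correction,
% since the max operation does not decay towards zero.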
\text{Nadam} \\
w_{t+1} &= w_t - \frac{\alpha}{\sqrt{\hat{v}_t} + \epsilon} \bigg(\beta_1 \hat{m}_{t} + \frac{1-\beta_1}{1-\beta_1^t} \cdot \frac{\partial L}{\partial w_t} \bigg) \\
\hat{m}_t &= \frac{m_t}{1-\beta_1^t} \\
\hat{v}_t &= \frac{v_t}{1-\beta_2^t} \\
m_t &= \beta_1 m_{t-1} + (1 - \beta_1) \frac{\partial L}{\partial w_t} \\
v_t &= \beta_2 v_{t-1} + (1 - \beta_2) \bigg[\frac{\partial L}{\partial w_t}\bigg]^{2} \\
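% The parenthesised term replaces \hat{m}_t in the Adam update with a Nesterov-style
% combination of the momentum estimate and the bias-corrected current gradient.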
\text{AMSGrad} \\
w_{t+1} &= w_t - \frac{\alpha}{\sqrt{\hat{v}_t} + \epsilon} \cdot m_t \\
m_t &= \beta_1 m_{t-1} + (1 - \beta_1) \frac{\partial L}{\partial w_t} \\
v_t &= \beta_2 v_{t-1} + (1 - \beta_2) \bigg[\frac{\partial L}{\partial w_t}\bigg]^{2} \\
\hat{v}_t &= \max(\hat{v}_{t-1}, v_t)
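% \hat{v}_t keeps a running maximum of the second-moment estimate, so the
% denominator \sqrt{\hat{v}_t} + \epsilon is non-decreasing and the per-parameter
% step size can never grow from one iteration to the next. As in the original
% AMSGrad formulation, m_t is used here without bias correction.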
%\begin{tabular}{|l|c|c|c|}
%\hline
%\multicolumn{1}{|c|}{\textbf{Optimiser}} & \textbf{Year} & \textbf{Learning Rate} & \textbf{Gradient} \\ \hline
%Nesterov & 1983 & & \checkmark \\ \hline
%Momentum & 1999 & & \checkmark \\ \hline
%AdaGrad & 2011 & \checkmark & \\ \hline
%RMSprop & 2012 & \checkmark & \\ \hline
%Adadelta & 2012 & \checkmark & \\ \hline
%Adam & 2014 & \checkmark & \checkmark \\ \hline
%AdaMax & 2015 & \checkmark & \checkmark \\ \hline
%Nadam & 2015 & \checkmark & \checkmark \\ \hline
%AMSGrad & 2018 & \checkmark & \checkmark \\ \hline
%\end{tabular}
\end{align*}
\end{document}