% --- GitHub Gist page chrome captured during extraction; not part of the
% --- LaTeX source. Commented out so the file compiles as-is.
% Skip to content
%
% Instantly share code, notes, and snippets.
%
% @remykarem
% Last active December 31, 2022 06:19
% Show Gist options
%   • Save remykarem/d5eff32a67ceeec8e653d1016525a649 to your computer and use it in GitHub Desktop.
% Save remykarem/d5eff32a67ceeec8e653d1016525a649 to your computer and use it in GitHub Desktop.
\documentclass[8pt]{extarticle} % 'article' silently ignores 8pt; extarticle (extsizes) supports it
\usepackage[usenames]{color} %used for font color
\usepackage{amssymb} %maths
\usepackage{amsmath} %maths
%\usepackage[utf8]{inputenc} %useful to type directly diacritic characters
\begin{document}
\begin{align*}
\text{Vanilla SGD} \\
w_{t+1} &= w_t - \alpha \frac{\partial L}{\partial w_t} \\
\text{Momentum} \\
w_{t+1} &= w_t - \alpha m_t \\
m_t &= \beta m_{t-1} + (1 - \beta) \frac{\partial L}{\partial w_t} \\
\text{Adagrad} \\
w_{t+1} &= w_t - \frac{\alpha}{\sqrt{v_t + \epsilon}} \cdot \frac{\partial L}{\partial w_t} \\
v_t &= v_{t-1} + \bigg[\frac{\partial L}{\partial w_t}\bigg]^{2} \\
\text{RMSprop} \\
w_{t+1} &= w_t - \frac{\alpha}{\sqrt{v_t + \epsilon}} \cdot \frac{\partial L}{\partial w_t} \\
v_t &= \beta v_{t-1} + (1 - \beta) \bigg[\frac{\partial L}{\partial w_t}\bigg]^{2} \\
\text{Adadelta} \\
w_{t+1} &= w_t - \frac{\sqrt{D_{t-1} + \epsilon}}{\sqrt{v_t + \epsilon}} \cdot \frac{\partial L}{\partial w_t} \\
D_t &= \beta D_{t-1} + (1-\beta) [\Delta w_t]^2 \\
v_t &= \beta v_{t-1} + (1-\beta) \bigg[\frac{\partial L}{\partial w_t}\bigg]^{2} \\
\text{Nesterov} \\
w_{t+1} &= w_t - \alpha m_t \\
m_t &= \beta m_{t-1} + (1 - \beta) \frac{\partial L}{\partial w^*} \\
w^* &= w_t - \alpha m_{t-1} \\
\text{Adam} \\
w_{t+1} &= w_t - \frac{\alpha}{\sqrt{\hat{v}_t} + \epsilon} \cdot \hat{m}_t \\
\hat{m}_t &= \frac{m_t}{1-\beta_1^t} \\
\hat{v}_t &= \frac{v_t}{1-\beta_2^t} \\
m_t &= \beta_1 m_{t-1} + (1 - \beta_1) \frac{\partial L}{\partial w_t} \\
v_t &= \beta_2 v_{t-1} + (1 - \beta_2) \bigg[\frac{\partial L}{\partial w_t}\bigg]^{2} \\
\text{AdaMax} \\
w_{t+1} &= w_t - \frac{\alpha}{v_t} \cdot \hat{m}_t \\
\hat{m}_t &= \frac{m_t}{1-\beta_1^t} \\
m_t &= \beta_1 m_{t-1} + (1 - \beta_1) \frac{\partial L}{\partial w_t} \\
v_t &= \max\bigg(\beta_2 v_{t-1}, \bigg|\frac{\partial L}{\partial w_t}\bigg|\bigg) \\
\text{Nadam} \\
w_{t+1} &= w_t - \frac{\alpha}{\sqrt{\hat{v}_t} + \epsilon} \bigg(\beta_1 \hat{m}_{t} + \frac{1-\beta_1}{1-\beta_1^t} \cdot \frac{\partial L}{\partial w_t} \bigg) \\
\hat{m}_t &= \frac{m_t}{1-\beta_1^t} \\
\hat{v}_t &= \frac{v_t}{1-\beta_2^t} \\
m_t &= \beta_1 m_{t-1} + (1 - \beta_1) \frac{\partial L}{\partial w_t} \\
v_t &= \beta_2 v_{t-1} + (1 - \beta_2) \bigg[\frac{\partial L}{\partial w_t}\bigg]^{2} \\
\text{AMSGrad} \\
w_{t+1} &= w_t - \frac{\alpha}{\sqrt {\hat{v}_t} + \epsilon} \cdot m_t \\
m_t &= \beta_1 m_{t-1} + (1 - \beta_1) \frac{\partial L}{\partial w_t} \\
v_t &= \beta_2 v_{t-1} + (1 - \beta_2) \bigg[\frac{\partial L}{\partial w_t}\bigg]^{2} \\
\hat{v}_t &= \max(\hat{v}_{t-1}, v_t)
%%\begin{tabular}{|l|c|c|c|}
%%\hline
%%\multicolumn{1}{|c|}{\textbf{Optimiser}} & \textbf{Year} & \textbf{Learning Rate} & \textbf{Gradient} \\ \hline
%%Nesterov & 1983 & & \checkmark \\ \hline
%%Momentum & 1999 & & \checkmark \\ \hline
%%AdaGrad & 2011 & \checkmark & \\ \hline
%%RMSprop & 2012 & \checkmark & \\ \hline
%%Adadelta & 2012 & \checkmark & \\ \hline
%%Adam & 2014 & \checkmark & \checkmark \\ \hline
%%AdaMax & 2015 & \checkmark & \checkmark \\ \hline
%%Nadam & 2015 & \checkmark & \checkmark \\ \hline
%%AMSGrad & 2018 & \checkmark & \checkmark \\ \hline
%\end{tabular}
\end{align*}
\end{document}
% Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment