{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Different ways to perform gradient accumulation.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyNfXRuve6/wKWr8q1FO9WQw",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/akaanirban/76427ef338d39b45f9858036b99f4ca3/different-ways-to-perform-gradient-accumulation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{
"cell_type": "markdown",
"metadata": {
"id": "BubD_LHm8FWT"
},
"source": [
"## Different ways to perform gradient accumulation"
]
},
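{
"cell_type": "markdown",
"metadata": {},
"source": [
"Gradient accumulation relies on the fact that PyTorch *accumulates* gradients: every call to `backward()` adds to the `.grad` field of each leaf tensor until the gradients are explicitly zeroed. Because differentiation is linear, $\\nabla_\\theta \\sum_i L_i(\\theta) = \\sum_i \\nabla_\\theta L_i(\\theta)$, so summing the losses and calling `backward()` once, or calling `backward()` on each partial loss separately, yields the same gradient as a single full-batch pass. The sections below verify this on a toy problem: fitting $y = \\sin(x)$ with a third order polynomial.\n",
"\n",
"In practice, gradient accumulation is usually combined with an optimizer so that several mini-batches contribute to one update step. The cell below is a minimal sketch of that pattern; `model`, `data_loader`, `criterion`, `optimizer` and `accumulation_steps` are placeholders, not objects defined in this notebook."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# A minimal sketch of the usual optimizer-based gradient accumulation pattern.\n",
"# `model`, `data_loader`, `criterion`, `optimizer` and `accumulation_steps` are\n",
"# placeholders: substitute your own nn.Module, DataLoader, loss and optimizer.\n",
"def train_with_accumulation(model, data_loader, criterion, optimizer, accumulation_steps=4):\n",
"    model.train()\n",
"    optimizer.zero_grad()\n",
"    for step, (inputs, targets) in enumerate(data_loader):\n",
"        outputs = model(inputs)\n",
"        # Scale the loss so the accumulated gradient matches one large batch\n",
"        # when the loss uses mean reduction.\n",
"        loss = criterion(outputs, targets) / accumulation_steps\n",
"        loss.backward()  # gradients accumulate in .grad\n",
"        if (step + 1) % accumulation_steps == 0:\n",
"            optimizer.step()       # one update per `accumulation_steps` mini-batches\n",
"            optimizer.zero_grad()  # start accumulating the next effective batch"
],
"execution_count": null,
"outputs": []
},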
{
"cell_type": "code",
"metadata": {
"id": "iMo8XExI2Qob"
},
"source": [
"import torch\n",
"import math"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "waFqq-nE2Sfg"
},
"source": [
"dtype = torch.float\n",
"device = torch.device(\"cpu\")\n",
"# device = torch.device(\"cuda:0\")  # Uncomment this to run on GPU"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "23pczZ6X8Ith"
},
"source": [
"### 1. Normal full-batch gradient (the reference)"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fbCl2u1y2kkJ",
"outputId": "96fd895e-41c4-4c8d-90e2-2a1a6a3453bf"
},
"source": [
"# Create Tensors to hold inputs and outputs.\n",
"# By default, requires_grad=False, which indicates that we do not need to\n",
"# compute gradients with respect to these Tensors during the backward pass.\n",
"X = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)\n",
"Y = torch.sin(X)\n",
"\n",
"# Create random Tensors for weights. For a third order polynomial, we need\n",
"# 4 weights: y = a + b x + c x^2 + d x^3\n",
"# Setting requires_grad=True indicates that we want to compute gradients with\n",
"# respect to these Tensors during the backward pass.\n",
"m = torch.randn((), device=device, dtype=dtype, requires_grad=True)\n",
"n = torch.randn((), device=device, dtype=dtype, requires_grad=True)\n",
"o = torch.randn((), device=device, dtype=dtype, requires_grad=True)\n",
"p = torch.randn((), device=device, dtype=dtype, requires_grad=True)\n",
"\n",
"# Detached copies of the same initial weights, so that every method below\n",
"# starts from identical parameters.\n",
"a, b, c, d = m.clone().detach(), n.clone().detach(), o.clone().detach(), p.clone().detach()\n",
"a.requires_grad=True\n",
"b.requires_grad=True\n",
"c.requires_grad=True\n",
"d.requires_grad=True\n",
"\n",
"learning_rate = 1e-6\n",
"# A single full-batch step; the resulting gradients are the reference values.\n",
"for t in range(1):\n",
"    # Forward pass: compute predicted y using operations on Tensors.\n",
"    y_pred = a + b * X + c * X ** 2 + d * X ** 3\n",
"\n",
"    # Compute the loss using operations on Tensors.\n",
"    # loss is a scalar (0-dimensional) Tensor; loss.item() gets its Python value.\n",
"    loss = (y_pred - Y).pow(2).sum()\n",
"\n",
"    # Use autograd to compute the backward pass. This call will compute the\n",
"    # gradient of loss with respect to all Tensors with requires_grad=True.\n",
"    # After this call a.grad, b.grad, c.grad and d.grad will be Tensors holding\n",
"    # the gradient of the loss with respect to a, b, c, d respectively.\n",
"    loss.backward()\n",
"\n",
"    # Manually update weights using gradient descent. Wrap in torch.no_grad()\n",
"    # because weights have requires_grad=True, but we don't need to track this\n",
"    # in autograd.\n",
"    with torch.no_grad():\n",
"        a -= learning_rate * a.grad\n",
"        b -= learning_rate * b.grad\n",
"        c -= learning_rate * c.grad\n",
"        d -= learning_rate * d.grad\n",
"\n",
"    # The gradients are deliberately NOT zeroed here, so that they can be\n",
"    # inspected in the next cell.\n",
"\n",
"print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Result: y = -0.09324247390031815 + -0.17653822898864746 x + 0.2124224752187729 x^2 + 0.699370801448822 x^3\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SGuxppb82n1T",
"outputId": "3a2631a6-b738-4e18-f3f2-f8746ff4ad29"
},
"source": [
"a.grad, b.grad, c.grad, d.grad"
],
"execution_count": 4,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(tensor(2655.7627),\n",
" tensor(113246.4297),\n",
" tensor(16697.1406),\n",
" tensor(812837.1875))"
]
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nOfCFx9_8OJB"
},
"source": [
"### 2. Accumulate the per-sample losses into a total loss, then call `backward()` once at the end\n",
"\n",
"The full-batch gradients above are the reference values; each accumulation strategy below should reproduce them up to floating-point rounding."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6b8YcoJW3UKb",
"outputId": "08fd1858-7d8f-47be-abe2-082bc63450f4"
},
"source": [
"# Start from the same initial weights as before.\n",
"e, f, g, h = m.clone().detach(), n.clone().detach(), o.clone().detach(), p.clone().detach()\n",
"e.requires_grad=True\n",
"f.requires_grad=True\n",
"g.requires_grad=True\n",
"h.requires_grad=True\n",
"\n",
"learning_rate = 1e-6\n",
"total_loss = 0\n",
"for i in range(2000):\n",
"    # Forward pass: compute predicted y for a single sample.\n",
"    y_pred = e + f * X[i] + g * X[i] ** 2 + h * X[i] ** 3\n",
"\n",
"    # Accumulate the per-sample loss into total_loss. Note that this keeps the\n",
"    # autograd graph for every sample alive until backward() is called below.\n",
"    total_loss += (y_pred - Y[i]).pow(2).sum()\n",
"\n",
"total_loss.backward()\n",
"\n",
"# Manually update weights using gradient descent. Wrap in torch.no_grad()\n",
"# because weights have requires_grad=True, but we don't need to track this\n",
"# in autograd.\n",
"with torch.no_grad():\n",
"    e -= learning_rate * e.grad\n",
"    f -= learning_rate * f.grad\n",
"    g -= learning_rate * g.grad\n",
"    h -= learning_rate * h.grad\n",
"\n",
"# The gradients are deliberately NOT zeroed here, so that they can be\n",
"# inspected in the next cell.\n",
"\n",
"print(f'Result: y = {e.item()} + {f.item()} x + {g.item()} x^2 + {h.item()} x^3')"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Result: y = -0.09324248135089874 + -0.17653831839561462 x + 0.21242240071296692 x^2 + 0.6993705630302429 x^3\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4prg-P9C6X0g",
"outputId": "902e7c80-76b3-4f9b-8d48-9dd636a32e9a"
},
"source": [
"e.grad, f.grad, g.grad, h.grad"
],
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(tensor(2655.7666),\n",
" tensor(113246.5156),\n",
" tensor(16697.2148),\n",
" tensor(812837.4375))"
]
},
"metadata": {},
"execution_count": 6
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XF_txP5C8eAH"
},
"source": [
"### 3. Call `backward()` on each per-sample loss without zeroing the gradients, then take a single step\n",
"\n",
"Unlike 2., the autograd graph is freed after every `backward()` call, so the whole graph never has to be held in memory at once; the per-sample gradients simply accumulate in `.grad`."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JxyoPjcu6K1e",
"outputId": "5b3bb7f1-7245-4677-d78f-df9273d45556"
},
"source": [
"# Start from the same initial weights as before.\n",
"w, x, y, z = m.clone().detach(), n.clone().detach(), o.clone().detach(), p.clone().detach()\n",
"w.requires_grad=True\n",
"x.requires_grad=True\n",
"y.requires_grad=True\n",
"z.requires_grad=True\n",
"\n",
"learning_rate = 1e-6\n",
"\n",
"for i in range(2000):\n",
"    # Forward pass: compute predicted y for a single sample.\n",
"    y_pred = w + x * X[i] + y * X[i] ** 2 + z * X[i] ** 3\n",
"\n",
"    # Compute the per-sample loss and call backward() immediately. Since the\n",
"    # gradients are never zeroed, each call adds to w.grad, x.grad, y.grad\n",
"    # and z.grad.\n",
"    loss = (y_pred - Y[i]).pow(2).sum()\n",
"    loss.backward()\n",
"\n",
"# Manually update weights using gradient descent. Wrap in torch.no_grad()\n",
"# because weights have requires_grad=True, but we don't need to track this\n",
"# in autograd.\n",
"with torch.no_grad():\n",
"    w -= learning_rate * w.grad\n",
"    x -= learning_rate * x.grad\n",
"    y -= learning_rate * y.grad\n",
"    z -= learning_rate * z.grad\n",
"\n",
"# The gradients are deliberately NOT zeroed here, so that they can be\n",
"# inspected in the next cell.\n",
"\n",
"print(f'Result: y = {w.item()} + {x.item()} x + {y.item()} x^2 + {z.item()} x^3')"
],
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Result: y = -0.09324248880147934 + -0.176538348197937 x + 0.2124224454164505 x^2 + 0.6993707418441772 x^3\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oTKTRahG69cl",
"outputId": "1b93af5d-b6ee-494f-f932-a730940e9d43"
},
"source": [
"w.grad, x.grad, y.grad, z.grad"
],
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(tensor(2655.7747),\n",
" tensor(113246.5547),\n",
" tensor(16697.1738),\n",
" tensor(812837.2500))"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zCAIhRs_8sM_"
},
"source": [
"### 4. The same as 3., but accumulating over mini-batches instead of single samples"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "z2V-xUzM7mTb",
"outputId": "8bca2fc5-3e7f-4e00-a0fc-190f65fdfd61"
},
"source": [
"import numpy as np\n",
"\n",
"# Start from the same initial weights as before.\n",
"w, x, y, z = m.clone().detach(), n.clone().detach(), o.clone().detach(), p.clone().detach()\n",
"w.requires_grad=True\n",
"x.requires_grad=True\n",
"y.requires_grad=True\n",
"z.requires_grad=True\n",
"\n",
"learning_rate = 1e-6\n",
"batch_size = 200\n",
"num_batches = int(np.ceil(X.shape[0] / batch_size))\n",
"\n",
"for i in range(num_batches):\n",
"    Xb = X[i * batch_size: (i + 1) * batch_size]\n",
"    Yb = Y[i * batch_size: (i + 1) * batch_size]\n",
"\n",
"    # Forward pass: compute predicted y for the current mini-batch.\n",
"    y_pred = w + x * Xb + y * Xb ** 2 + z * Xb ** 3\n",
"\n",
"    # Compute the mini-batch loss and call backward() immediately; the\n",
"    # gradients accumulate across batches because they are never zeroed.\n",
"    batch_loss = (y_pred - Yb).pow(2).sum()\n",
"    batch_loss.backward()\n",
"\n",
"# Manually update weights using gradient descent. Wrap in torch.no_grad()\n",
"# because weights have requires_grad=True, but we don't need to track this\n",
"# in autograd.\n",
"with torch.no_grad():\n",
"    w -= learning_rate * w.grad\n",
"    x -= learning_rate * x.grad\n",
"    y -= learning_rate * y.grad\n",
"    z -= learning_rate * z.grad\n",
"\n",
"# The gradients are deliberately NOT zeroed here, so that they can be\n",
"# inspected in the next cell.\n",
"\n",
"print(f'Result: y = {w.item()} + {x.item()} x + {y.item()} x^2 + {z.item()} x^3')"
],
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Result: y = -0.09324248135089874 + -0.17653822898864746 x + 0.2124224752187729 x^2 + 0.6993708610534668 x^3\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4THMzuTA7-EJ",
"outputId": "f45ffa00-2cbf-4ccc-858f-2507cb406347"
},
"source": [
"w.grad, x.grad, y.grad, z.grad"
],
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(tensor(2655.7646),\n",
" tensor(113246.4219),\n",
" tensor(16697.1328),\n",
" tensor(812837.1250))"
]
},
"metadata": {},
"execution_count": 11
}
]
},
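{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (assuming the cells above have been run in order), the accumulated gradients can be compared against the full-batch reference with `torch.allclose`. Note that `w.grad`, `x.grad`, `y.grad` and `z.grad` now hold the gradients from method 4, because the same variable names were reused for methods 3 and 4."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Compare the accumulated gradients against the full-batch reference (method 1).\n",
"for ref, acc in zip((a, b, c, d), (e, f, g, h)):\n",
"    assert torch.allclose(ref.grad, acc.grad, rtol=1e-4), (ref.grad, acc.grad)\n",
"for ref, acc in zip((a, b, c, d), (w, x, y, z)):\n",
"    assert torch.allclose(ref.grad, acc.grad, rtol=1e-4), (ref.grad, acc.grad)\n",
"print(\"Accumulated gradients match the full-batch gradients up to floating-point error.\")"
],
"execution_count": null,
"outputs": []
},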
{
"cell_type": "markdown",
"metadata": {
"id": "_sqYSRsY80Xt"
},
"source": [
"### References\n",
"- https://pytorch.org/tutorials/beginner/pytorch_with_examples.html\n",
"- https://discuss.pytorch.org/t/why-do-we-need-to-set-the-gradients-manually-to-zero-in-pytorch/4903/20"
]
},
{
"cell_type": "code",
"metadata": {
"id": "qWc5UHml85Vj"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
} |