{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Different ways to perform gradient accumulation.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyNfXRuve6/wKWr8q1FO9WQw",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/akaanirban/76427ef338d39b45f9858036b99f4ca3/different-ways-to-perform-gradient-accumulation.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{
"cell_type": "markdown",
"metadata": {
"id": "BubD_LHm8FWT"
},
"source": [
"## Different ways to perform gradient accumulation"
]
},
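{
"cell_type": "markdown",
"metadata": {},
"source": [
"Gradient accumulation relies on the fact that PyTorch *accumulates* gradients: every call to `backward()` adds to the `.grad` field of each leaf tensor until the gradients are explicitly zeroed. Because differentiation is linear, $\\nabla_\\theta \\sum_i L_i(\\theta) = \\sum_i \\nabla_\\theta L_i(\\theta)$, so summing the losses and calling `backward()` once, or calling `backward()` on each partial loss separately, yields the same gradient as a single full-batch pass. The sections below verify this on a toy problem: fitting $y = \\sin(x)$ with a third order polynomial.\n",
"\n",
"In practice, gradient accumulation is usually combined with an optimizer so that several mini-batches contribute to one update step. The cell below is a minimal sketch of that pattern; `model`, `data_loader`, `criterion`, `optimizer` and `accumulation_steps` are placeholders, not objects defined in this notebook."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# A minimal sketch of the usual optimizer-based gradient accumulation pattern.\n",
"# `model`, `data_loader`, `criterion`, `optimizer` and `accumulation_steps` are\n",
"# placeholders: substitute your own nn.Module, DataLoader, loss and optimizer.\n",
"def train_with_accumulation(model, data_loader, criterion, optimizer, accumulation_steps=4):\n",
"    model.train()\n",
"    optimizer.zero_grad()\n",
"    for step, (inputs, targets) in enumerate(data_loader):\n",
"        outputs = model(inputs)\n",
"        # Scale the loss so the accumulated gradient matches one large batch\n",
"        # when the loss uses mean reduction.\n",
"        loss = criterion(outputs, targets) / accumulation_steps\n",
"        loss.backward()  # gradients accumulate in .grad\n",
"        if (step + 1) % accumulation_steps == 0:\n",
"            optimizer.step()       # one update per `accumulation_steps` mini-batches\n",
"            optimizer.zero_grad()  # start accumulating the next effective batch"
],
"execution_count": null,
"outputs": []
},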
{
"cell_type": "code",
"metadata": {
"id": "iMo8XExI2Qob"
},
"source": [
"import torch\n",
"import math"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "waFqq-nE2Sfg"
},
"source": [
"dtype = torch.float\n",
"device = torch.device(\"cpu\")\n",
"# device = torch.device(\"cuda:0\")  # Uncomment this to run on GPU"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "23pczZ6X8Ith"
},
"source": [
"### 1. Normal full-batch gradient (the reference)"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fbCl2u1y2kkJ",
"outputId": "96fd895e-41c4-4c8d-90e2-2a1a6a3453bf"
},
"source": [
"# Create Tensors to hold inputs and outputs.\n",
"# By default, requires_grad=False, which indicates that we do not need to\n",
"# compute gradients with respect to these Tensors during the backward pass.\n",
"X = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)\n",
"Y = torch.sin(X)\n",
"\n",
"# Create random Tensors for weights. For a third order polynomial, we need\n",
"# 4 weights: y = a + b x + c x^2 + d x^3\n",
"# Setting requires_grad=True indicates that we want to compute gradients with\n",
"# respect to these Tensors during the backward pass.\n",
"m = torch.randn((), device=device, dtype=dtype, requires_grad=True)\n",
"n = torch.randn((), device=device, dtype=dtype, requires_grad=True)\n",
"o = torch.randn((), device=device, dtype=dtype, requires_grad=True)\n",
"p = torch.randn((), device=device, dtype=dtype, requires_grad=True)\n",
"\n",
"# Detached copies of the same initial weights, so that every method below\n",
"# starts from identical parameters.\n",
"a, b, c, d = m.clone().detach(), n.clone().detach(), o.clone().detach(), p.clone().detach()\n",
"a.requires_grad=True\n",
"b.requires_grad=True\n",
"c.requires_grad=True\n",
"d.requires_grad=True\n",
"\n",
"learning_rate = 1e-6\n",
"# A single full-batch step; the resulting gradients are the reference values.\n",
"for t in range(1):\n",
"    # Forward pass: compute predicted y using operations on Tensors.\n",
"    y_pred = a + b * X + c * X ** 2 + d * X ** 3\n",
"\n",
"    # Compute the loss using operations on Tensors.\n",
"    # loss is a scalar (0-dimensional) Tensor; loss.item() gets its Python value.\n",
"    loss = (y_pred - Y).pow(2).sum()\n",
"\n",
"    # Use autograd to compute the backward pass. This call will compute the\n",
"    # gradient of loss with respect to all Tensors with requires_grad=True.\n",
"    # After this call a.grad, b.grad, c.grad and d.grad will be Tensors holding\n",
"    # the gradient of the loss with respect to a, b, c, d respectively.\n",
"    loss.backward()\n",
"\n",
"    # Manually update weights using gradient descent. Wrap in torch.no_grad()\n",
"    # because weights have requires_grad=True, but we don't need to track this\n",
"    # in autograd.\n",
"    with torch.no_grad():\n",
"        a -= learning_rate * a.grad\n",
"        b -= learning_rate * b.grad\n",
"        c -= learning_rate * c.grad\n",
"        d -= learning_rate * d.grad\n",
"\n",
"    # The gradients are deliberately NOT zeroed here, so that they can be\n",
"    # inspected in the next cell.\n",
"\n",
"print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Result: y = -0.09324247390031815 + -0.17653822898864746 x + 0.2124224752187729 x^2 + 0.699370801448822 x^3\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SGuxppb82n1T",
"outputId": "3a2631a6-b738-4e18-f3f2-f8746ff4ad29"
},
"source": [
"a.grad, b.grad, c.grad, d.grad"
],
"execution_count": 4,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(tensor(2655.7627),\n",
" tensor(113246.4297),\n",
" tensor(16697.1406),\n",
" tensor(812837.1875))"
]
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nOfCFx9_8OJB"
},
"source": [
"### 2. Accumulate the per-sample losses into a total loss, then call `backward()` once at the end\n",
"\n",
"The full-batch gradients above are the reference values; each accumulation strategy below should reproduce them up to floating-point rounding."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6b8YcoJW3UKb",
"outputId": "08fd1858-7d8f-47be-abe2-082bc63450f4"
},
"source": [
"# Start from the same initial weights as before.\n",
"e, f, g, h = m.clone().detach(), n.clone().detach(), o.clone().detach(), p.clone().detach()\n",
"e.requires_grad=True\n",
"f.requires_grad=True\n",
"g.requires_grad=True\n",
"h.requires_grad=True\n",
"\n",
"learning_rate = 1e-6\n",
"total_loss = 0\n",
"for i in range(2000):\n",
"    # Forward pass: compute predicted y for a single sample.\n",
"    y_pred = e + f * X[i] + g * X[i] ** 2 + h * X[i] ** 3\n",
"\n",
"    # Accumulate the per-sample loss into total_loss. Note that this keeps the\n",
"    # autograd graph for every sample alive until backward() is called below.\n",
"    total_loss += (y_pred - Y[i]).pow(2).sum()\n",
"\n",
"total_loss.backward()\n",
"\n",
"# Manually update weights using gradient descent. Wrap in torch.no_grad()\n",
"# because weights have requires_grad=True, but we don't need to track this\n",
"# in autograd.\n",
"with torch.no_grad():\n",
"    e -= learning_rate * e.grad\n",
"    f -= learning_rate * f.grad\n",
"    g -= learning_rate * g.grad\n",
"    h -= learning_rate * h.grad\n",
"\n",
"# The gradients are deliberately NOT zeroed here, so that they can be\n",
"# inspected in the next cell.\n",
"\n",
"print(f'Result: y = {e.item()} + {f.item()} x + {g.item()} x^2 + {h.item()} x^3')"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Result: y = -0.09324248135089874 + -0.17653831839561462 x + 0.21242240071296692 x^2 + 0.6993705630302429 x^3\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4prg-P9C6X0g",
"outputId": "902e7c80-76b3-4f9b-8d48-9dd636a32e9a"
},
"source": [
"e.grad, f.grad, g.grad, h.grad"
],
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(tensor(2655.7666),\n",
" tensor(113246.5156),\n",
" tensor(16697.2148),\n",
" tensor(812837.4375))"
]
},
"metadata": {},
"execution_count": 6
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XF_txP5C8eAH"
},
"source": [
"### 3. Call `backward()` on each per-sample loss without zeroing the gradients, then take a single step\n",
"\n",
"Unlike 2., the autograd graph is freed after every `backward()` call, so the whole graph never has to be held in memory at once; the per-sample gradients simply accumulate in `.grad`."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JxyoPjcu6K1e",
"outputId": "5b3bb7f1-7245-4677-d78f-df9273d45556"
},
"source": [
"# Start from the same initial weights as before.\n",
"w, x, y, z = m.clone().detach(), n.clone().detach(), o.clone().detach(), p.clone().detach()\n",
"w.requires_grad=True\n",
"x.requires_grad=True\n",
"y.requires_grad=True\n",
"z.requires_grad=True\n",
"\n",
"learning_rate = 1e-6\n",
"\n",
"for i in range(2000):\n",
"    # Forward pass: compute predicted y for a single sample.\n",
"    y_pred = w + x * X[i] + y * X[i] ** 2 + z * X[i] ** 3\n",
"\n",
"    # Compute the per-sample loss and call backward() immediately. Since the\n",
"    # gradients are never zeroed, each call adds to w.grad, x.grad, y.grad\n",
"    # and z.grad.\n",
"    loss = (y_pred - Y[i]).pow(2).sum()\n",
"    loss.backward()\n",
"\n",
"# Manually update weights using gradient descent. Wrap in torch.no_grad()\n",
"# because weights have requires_grad=True, but we don't need to track this\n",
"# in autograd.\n",
"with torch.no_grad():\n",
"    w -= learning_rate * w.grad\n",
"    x -= learning_rate * x.grad\n",
"    y -= learning_rate * y.grad\n",
"    z -= learning_rate * z.grad\n",
"\n",
"# The gradients are deliberately NOT zeroed here, so that they can be\n",
"# inspected in the next cell.\n",
"\n",
"print(f'Result: y = {w.item()} + {x.item()} x + {y.item()} x^2 + {z.item()} x^3')"
],
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Result: y = -0.09324248880147934 + -0.176538348197937 x + 0.2124224454164505 x^2 + 0.6993707418441772 x^3\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oTKTRahG69cl",
"outputId": "1b93af5d-b6ee-494f-f932-a730940e9d43"
},
"source": [
"w.grad, x.grad, y.grad, z.grad"
],
"execution_count": 8,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(tensor(2655.7747),\n",
" tensor(113246.5547),\n",
" tensor(16697.1738),\n",
" tensor(812837.2500))"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "zCAIhRs_8sM_"
},
"source": [
"### 4. The same as 3., but accumulating over mini-batches instead of single samples"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "z2V-xUzM7mTb",
"outputId": "8bca2fc5-3e7f-4e00-a0fc-190f65fdfd61"
},
"source": [
"import numpy as np\n",
"\n",
"# Start from the same initial weights as before.\n",
"w, x, y, z = m.clone().detach(), n.clone().detach(), o.clone().detach(), p.clone().detach()\n",
"w.requires_grad=True\n",
"x.requires_grad=True\n",
"y.requires_grad=True\n",
"z.requires_grad=True\n",
"\n",
"learning_rate = 1e-6\n",
"batch_size = 200\n",
"num_batches = int(np.ceil(X.shape[0] / batch_size))\n",
"\n",
"for i in range(num_batches):\n",
"    Xb = X[i * batch_size: (i + 1) * batch_size]\n",
"    Yb = Y[i * batch_size: (i + 1) * batch_size]\n",
"\n",
"    # Forward pass: compute predicted y for the current mini-batch.\n",
"    y_pred = w + x * Xb + y * Xb ** 2 + z * Xb ** 3\n",
"\n",
"    # Compute the mini-batch loss and call backward() immediately; the\n",
"    # gradients accumulate across batches because they are never zeroed.\n",
"    batch_loss = (y_pred - Yb).pow(2).sum()\n",
"    batch_loss.backward()\n",
"\n",
"# Manually update weights using gradient descent. Wrap in torch.no_grad()\n",
"# because weights have requires_grad=True, but we don't need to track this\n",
"# in autograd.\n",
"with torch.no_grad():\n",
"    w -= learning_rate * w.grad\n",
"    x -= learning_rate * x.grad\n",
"    y -= learning_rate * y.grad\n",
"    z -= learning_rate * z.grad\n",
"\n",
"# The gradients are deliberately NOT zeroed here, so that they can be\n",
"# inspected in the next cell.\n",
"\n",
"print(f'Result: y = {w.item()} + {x.item()} x + {y.item()} x^2 + {z.item()} x^3')"
],
"execution_count": 10,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Result: y = -0.09324248135089874 + -0.17653822898864746 x + 0.2124224752187729 x^2 + 0.6993708610534668 x^3\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "4THMzuTA7-EJ",
"outputId": "f45ffa00-2cbf-4ccc-858f-2507cb406347"
},
"source": [
"w.grad, x.grad, y.grad, z.grad"
],
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(tensor(2655.7646),\n",
" tensor(113246.4219),\n",
" tensor(16697.1328),\n",
" tensor(812837.1250))"
]
},
"metadata": {},
"execution_count": 11
}
]
},
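{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (assuming the cells above have been run in order), the accumulated gradients can be compared against the full-batch reference with `torch.allclose`. Note that `w.grad`, `x.grad`, `y.grad` and `z.grad` now hold the gradients from method 4, because the same variable names were reused for methods 3 and 4."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Compare the accumulated gradients against the full-batch reference (method 1).\n",
"for ref, acc in zip((a, b, c, d), (e, f, g, h)):\n",
"    assert torch.allclose(ref.grad, acc.grad, rtol=1e-4), (ref.grad, acc.grad)\n",
"for ref, acc in zip((a, b, c, d), (w, x, y, z)):\n",
"    assert torch.allclose(ref.grad, acc.grad, rtol=1e-4), (ref.grad, acc.grad)\n",
"print(\"Accumulated gradients match the full-batch gradients up to floating-point error.\")"
],
"execution_count": null,
"outputs": []
},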
{
"cell_type": "markdown",
"metadata": {
"id": "_sqYSRsY80Xt"
},
"source": [
"### References\n",
"- https://pytorch.org/tutorials/beginner/pytorch_with_examples.html\n",
"- https://discuss.pytorch.org/t/why-do-we-need-to-set-the-gradients-manually-to-zero-in-pytorch/4903/20"
]
},
{
"cell_type": "code",
"metadata": {
"id": "qWc5UHml85Vj"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
} |