riveSunder · November 5, 2020 04:55
diff --git a/gol_pytorch_benchmarks.ipynb b/gol_pytorch_benchmarks.ipynb
 {
 "cells": [
  {{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import time\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Speeding Up Conway's Game of Life with PyTorch\n",
    "\n",
    "Cellular automata are the basic building blocks of a massively parallel computing paradigm capable of high levels of complexity and Turing completeness, even for exceedingly simple rule formulations[[0](https://en.wikipedia.org/wiki/Rule_110)]. The most well-known CA universe is undoubtedly John Horton Conway's Game of Life. CA were the subject of much interest in the 1980s and 1990s as computational capabilities made studying complexity more interesting*. \n",
    "\n",
    "CA are suddenly very interesting again (counter: they never stopped being cool) thanks to projects training differentiable CA as an abstract model of ontogenesis[[1](https://distill.pub/2020/growing-ca/)], self-classifying pixels[[2](https://distill.pub/2020/selforg/mnist/)], and connecting CA to convolutional neural networks[[3](https://arxiv.org/abs/1809.02942)], among others.\n",
    "\n",
    "Even in the era of modern machine learning, typically considered to be synonomous with deep learning, CA have plenty of potential to shake things up. The CA approach will make even more sense as the next generation of computational hardware acceleration trends toward designs closely reminiscent to the mathematical formulations of CA universes (<em>e.g.</em> systolic arrays). Conveniently, we can use the tools of deep learning to build CA universes. By formulating CA universe updates in terms of convolutions and matrix multiplies, we can implement them in libraries like PyTorch or TensorFlow taking advantage of computational speed-ups from GPU support and optimized code. As an extra bonus, continuous-valued and differentiable CA universes become possible, making for an exciting new world of computational playgrounds.\n",
    "\n",
    "This notebook contains code demonstrating how to formulate Conway's Game of Life in terms of convolutional layers implemented in PyTorch as well as a loop-based formulation, and shows just how much faster a CA universe can run in the former case. It's intended as a starting point to begin experimenting with your own flexible CA models. \n",
    "\n",
    "Note that if you are mostly interested in building machines in existing CA universes, I highly recommend you check out [Golly](http://golly.sourceforge.net/). Golly is an open source program for experimenting with many different types of CA including Wolfram's 1D CA, Langton's Ant, and many others. Golly is fast, and it uses memoization algorithms like [HashLife](https://en.wikipedia.org/wiki/Hashlife) to take advantage of temporally repeated patterns for additional speed-up.\n",
    "https://en.wikipedia.org/wiki/Hashlife\n",
    "\n",
    "*Keep in mind that Conways devised Game of Life on a Go board during coffee breaks and Von Neumann designed his 29-state CA on pen and paper. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PyTorch and naive implementations of GoL\n",
    "\n",
    "def gol_step(grid, n=1, device=\"cpu\"):\n",
    "    \n",
    "\n",
    "    if torch.cuda.is_available():\n",
    "        device = device\n",
    "    else:\n",
    "        device = \"cpu\"\n",
    "    \n",
    "    my_kernel = torch.tensor([[1,1,1], [1,0,1], [1,1,1]]).unsqueeze(0).unsqueeze(0).float().to(device)\n",
    "    old_grid = grid.float().to(device)\n",
    "  \n",
    "    while n > 0:\n",
    "        temp_grid = F.conv2d(old_grid, my_kernel, padding=1)#[:,:,1:-1,1:-1]\n",
    "        new_grid = torch.zeros_like(old_grid)\n",
    "\n",
    "        new_grid[temp_grid == 3] = 1 \n",
    "        new_grid[old_grid*temp_grid == 2] = 1\n",
    "        \n",
    "        old_grid = new_grid.clone()\n",
    "\n",
    "        n -= 1\n",
    "    #if n > 0:\n",
    "    #    new_grid = gol_step(new_grid, n=n)\n",
    "    \n",
    "    return new_grid.to(\"cpu\")\n",
    "\n",
    "\n",
    "def gol_loop(grid, n=1):\n",
    "    \n",
    "    old_grid = grid.squeeze().int()\n",
    "    dim_x, dim_y = old_grid.shape\n",
    "    my_kernel = torch.tensor([[1,1,1], [1,0,1], [1,1,1]]).int()\n",
    "    \n",
    "    while n > 0:\n",
    "        \n",
    "        new_grid = torch.zeros_like(old_grid)\n",
    "        temp_grid = torch.zeros_like(old_grid)\n",
    "        for xx in range(dim_x):\n",
    "            for yy in range(dim_y):\n",
    "                temp_sum = 0\n",
    "                \n",
    "                y_stop = 3 if yy < (dim_y-1) else -1\n",
    "                x_stop = 3 if xx < (dim_x-1) else -1\n",
    "                \n",
    "                temp_sum = torch.sum(my_kernel[\\\n",
    "                                     1*(not(xx>0)):x_stop,\\\n",
    "                                     1*(not(yy>0)):y_stop] * old_grid[\\\n",
    "                                                    max(0, xx-1):min(dim_x, xx+2),\\\n",
    "                                                    max(0, yy-1):min(dim_y, yy+2)])\n",
    "                \n",
    "                temp_grid[xx,yy] = temp_sum\n",
    "\n",
    "        new_grid[temp_grid == 3] = 1 \n",
    "        new_grid[old_grid*temp_grid == 2] = 1\n",
    "        \n",
    "        old_grid = new_grid.clone()\n",
    "\n",
    "        n -= 1\n",
    "    #if n > 0:\n",
    "    #    new_grid = gol_step(new_grid, n=n)\n",
    "    \n",
    "    return new_grid\n",
    "    \n",
    "          "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# run benchmarks\n",
    "\n",
    "if(0): \n",
    "    # draw glider\n",
    "    grid = torch.zeros(1,1,2048,2048) #256,256)\n",
    "\n",
    "    grid[0,0,19,17] = 1\n",
    "    grid[0,0,18,18] = 1\n",
    "    grid[0,0,17,18] = 1\n",
    "    grid[0,0,17,17] = 1\n",
    "    grid[0,0,17,16] = 1\n",
    "\n",
    "grid = 1.0 * (torch.rand(1,1,2048,2048) > 0.50)\n",
    "\n",
    "for num_steps in [1, 6, 60, 600, 6000]:\n",
    "    #grid = 1.0 * (torch.rand(1,1,64,64) > 0.50)\n",
    "    \n",
    "    if num_steps < 601:\n",
    "        t0 = time.time()     \n",
    "        grid = gol_loop(grid, n=num_steps)\n",
    "        t1 = time.time()\n",
    "        print(\"time for {} gol_loop steps = {:.2e}\".format(num_steps, t1-t0))\n",
    "\n",
    "    \n",
    "    grid = 1.0 * (torch.rand(1,1,256,256) > 0.50)\n",
    "\n",
    "    t2 = time.time()     \n",
    "    grid = gol_step(grid, n=num_steps)\n",
    "    t3 = time.time()\n",
    "    print(\"(cpu) time for {} gol steps = {:.2e}\".format(num_steps, t3-t2))\n",
    "    \n",
    "    if num_steps < 601:\n",
    "        print(\"loop/pytorch = {:.4e}\".format((t1-t0) / (t3-t2)))\n",
    "    \n",
    "    grid = 1.0 * (torch.rand(1,1,256,256) > 0.50)\n",
    "\n",
    "    t4 = time.time()     \n",
    "    grid = gol_step(grid, n=num_steps, device=\"cuda\")\n",
    "    t5 = time.time()\n",
    "    print(\"(gpu) time for {} gol steps = {:.2e}\".format(num_steps, t5-t4))\n",
    "    if num_steps < 601:\n",
    "        print(\"loop/pytorch = {:.4e}, loop/gpu_pytorch = {:.4e} pytorch/gpu_pytorch = {:.4e}\"\\\n",
    "              .format((t1-t0) / (t3-t2), (t1-t0) / (t5-t4), (t3-t2) / (t5-t4) ))\n",
    "    else:\n",
    "        print(\"pytorch/gpu_pytorch = {:.4e}\".format((t3-t2) / (t5-t4) ))\n",
    "    \n",
    "    \n",
    "plt.figure()\n",
    "plt.imshow(grid.squeeze())\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import time\n",
    "\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Speeding Up Conway's Game of Life with PyTorch\n",
    "\n",
    "Cellular automata are the basic building blocks of a massively parallel computing paradigm capable of high levels of complexity and Turing completeness, even for exceedingly simple rule formulations[[0](https://en.wikipedia.org/wiki/Rule_110)]. The most well-known CA universe is undoubtedly John Horton Conway's Game of Life. CA were the subject of much interest in the 1980s and 1990s as computational capabilities made studying complexity more interesting*. \n",
    "\n",
    "CA are suddenly very interesting again (counter: they never stopped being cool) thanks to projects training differentiable CA as an abstract model of ontogenesis[[1](https://distill.pub/2020/growing-ca/)], self-classifying pixels[[2](https://distill.pub/2020/selforg/mnist/)], and connecting CA to convolutional neural networks[[3](https://arxiv.org/abs/1809.02942)], among others.\n",
    "\n",
    "Even in the era of modern machine learning, typically considered to be synonomous with deep learning, CA have plenty of potential to shake things up. The CA approach will make even more sense as the next generation of computational hardware acceleration trends toward designs closely reminiscent to the mathematical formulations of CA universes (<em>e.g.</em> systolic arrays). Conveniently, we can use the tools of deep learning to build CA universes. By formulating CA universe updates in terms of convolutions and matrix multiplies, we can implement them in libraries like PyTorch or TensorFlow taking advantage of computational speed-ups from GPU support and optimized code. As an extra bonus, continuous-valued and differentiable CA universes become possible, making for an exciting new world of computational playgrounds.\n",
    "\n",
    "This notebook contains code demonstrating how to formulate Conway's Game of Life in terms of convolutional layers implemented in PyTorch as well as a loop-based formulation, and shows just how much faster a CA universe can run in the former case. It's intended as a starting point to begin experimenting with your own flexible CA models. \n",
    "\n",
    "Note that if you are mostly interested in building machines in existing CA universes, I highly recommend you check out [Golly](http://golly.sourceforge.net/). Golly is an open source program for experimenting with many different types of CA including Wolfram's 1D CA, Langton's Ant, and many others. Golly is fast, and it uses memoization algorithms like [HashLife](https://en.wikipedia.org/wiki/Hashlife) to take advantage of temporally repeated patterns for additional speed-up.\n",
    "https://en.wikipedia.org/wiki/Hashlife\n",
    "\n",
    "*Keep in mind that Conways devised Game of Life on a Go board during coffee breaks and Von Neumann designed his 29-state CA on pen and paper. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# PyTorch and naive implementations of GoL\n",
    "\n",
    "def gol_step(grid, n=1, device=\"cpu\"):\n",
    "    \n",
    "\n",
    "    if torch.cuda.is_available():\n",
    "        device = device\n",
    "    else:\n",
    "        device = \"cpu\"\n",
    "    \n",
    "    my_kernel = torch.tensor([[1,1,1], [1,0,1], [1,1,1]]).unsqueeze(0).unsqueeze(0).float().to(device)\n",
    "    old_grid = grid.float().to(device)\n",
    "  \n",
    "    while n > 0:\n",
    "        temp_grid = F.conv2d(old_grid, my_kernel, padding=1)#[:,:,1:-1,1:-1]\n",
    "        new_grid = torch.zeros_like(old_grid)\n",
    "\n",
    "        new_grid[temp_grid == 3] = 1 \n",
    "        new_grid[old_grid*temp_grid == 2] = 1\n",
    "        \n",
    "        old_grid = new_grid.clone()\n",
    "\n",
    "        n -= 1\n",
    "    #if n > 0:\n",
    "    #    new_grid = gol_step(new_grid, n=n)\n",
    "    \n",
    "    return new_grid.to(\"cpu\")\n",
    "\n",
    "\n",
    "def gol_loop(grid, n=1):\n",
    "    \n",
    "    old_grid = grid.squeeze().int()\n",
    "    dim_x, dim_y = old_grid.shape\n",
    "    my_kernel = torch.tensor([[1,1,1], [1,0,1], [1,1,1]]).int()\n",
    "    \n",
    "    while n > 0:\n",
    "        \n",
    "        new_grid = torch.zeros_like(old_grid)\n",
    "        temp_grid = torch.zeros_like(old_grid)\n",
    "        for xx in range(dim_x):\n",
    "            for yy in range(dim_y):\n",
    "                temp_sum = 0\n",
    "                \n",
    "                y_stop = 3 if yy < (dim_y-1) else -1\n",
    "                x_stop = 3 if xx < (dim_x-1) else -1\n",
    "                \n",
    "                temp_sum = torch.sum(my_kernel[\\\n",
    "                                     1*(not(xx>0)):x_stop,\\\n",
    "                                     1*(not(yy>0)):y_stop] * old_grid[\\\n",
    "                                                    max(0, xx-1):min(dim_x, xx+2),\\\n",
    "                                                    max(0, yy-1):min(dim_y, yy+2)])\n",
    "                \n",
    "                temp_grid[xx,yy] = temp_sum\n",
    "\n",
    "        new_grid[temp_grid == 3] = 1 \n",
    "        new_grid[old_grid*temp_grid == 2] = 1\n",
    "        \n",
    "        old_grid = new_grid.clone()\n",
    "\n",
    "        n -= 1\n",
    "    #if n > 0:\n",
    "    #    new_grid = gol_step(new_grid, n=n)\n",
    "    \n",
    "    return new_grid\n",
    "    \n",
    "          "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# run benchmarks\n",
    "\n",
    "if(0): \n",
    "    # draw glider\n",
    "    grid = torch.zeros(1,1,2048,2048) #256,256)\n",
    "\n",
    "    grid[0,0,19,17] = 1\n",
    "    grid[0,0,18,18] = 1\n",
    "    grid[0,0,17,18] = 1\n",
    "    grid[0,0,17,17] = 1\n",
    "    grid[0,0,17,16] = 1\n",
    "\n",
    "grid = 1.0 * (torch.rand(1,1,2048,2048) > 0.50)\n",
    "\n",
    "for num_steps in [1, 6, 60, 600, 6000]:\n",
    "    #grid = 1.0 * (torch.rand(1,1,64,64) > 0.50)\n",
    "    \n",
    "    if num_steps < 601:\n",
    "        t0 = time.time()     \n",
    "        grid = gol_loop(grid, n=num_steps)\n",
    "        t1 = time.time()\n",
    "        print(\"time for {} gol_loop steps = {:.2e}\".format(num_steps, t1-t0))\n",
    "\n",
    "    \n",
    "    grid = 1.0 * (torch.rand(1,1,256,256) > 0.50)\n",
    "\n",
    "    t2 = time.time()     \n",
    "    grid = gol_step(grid, n=num_steps)\n",
    "    t3 = time.time()\n",
    "    print(\"(cpu) time for {} gol steps = {:.2e}\".format(num_steps, t3-t2))\n",
    "    \n",
    "    if num_steps < 601:\n",
    "        print(\"loop/pytorch = {:.4e}\".format((t1-t0) / (t3-t2)))\n",
    "    \n",
    "    grid = 1.0 * (torch.rand(1,1,256,256) > 0.50)\n",
    "\n",
    "    t4 = time.time()     \n",
    "    grid = gol_step(grid, n=num_steps, device=\"cuda\")\n",
    "    t5 = time.time()\n",
    "    print(\"(gpu) time for {} gol steps = {:.2e}\".format(num_steps, t5-t4))\n",
    "    if num_steps < 601:\n",
    "        print(\"loop/pytorch = {:.4e}, loop/gpu_pytorch = {:.4e} pytorch/gpu_pytorch = {:.4e}\"\\\n",
    "              .format((t1-t0) / (t3-t2), (t1-t0) / (t5-t4), (t3-t2) / (t5-t4) ))\n",
    "    else:\n",
    "        print(\"pytorch/gpu_pytorch = {:.4e}\".format((t3-t2) / (t5-t4) ))\n",
    "    \n",
    "    \n",
    "plt.figure()\n",
    "plt.imshow(grid.squeeze())\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
diff --git a/gol_pytorch_benchmarks.py b/gol_pytorch_benchmarks.py
 import numpy as np
 import matplotlib.pyplot as plt
 import numpy as np
 import time

 import torch
 import torch.nn as nn
 import torch.nn.functional as F

 # PyTorch and naive implementations of GoL

 def gol_step(grid, n=1, device="cpu"):
    

    if torch.cuda.is_available():
        device = device
    else:
        device = "cpu"
    
    my_kernel = torch.tensor([[1,1,1], [1,0,1], [1,1,1]]).unsqueeze(0).unsqueeze(0).float().to(device)
    old_grid = grid.float().to(device)
  
    while n > 0:
        temp_grid = F.conv2d(old_grid, my_kernel, padding=1)#[:,:,1:-1,1:-1]
        new_grid = torch.zeros_like(old_grid)

        new_grid[temp_grid == 3] = 1 
        new_grid[old_grid*temp_grid == 2] = 1
        
        old_grid = new_grid.clone()

        n -= 1
    #if n > 0:
    #    new_grid = gol_step(new_grid, n=n)
    
    return new_grid.to("cpu")


 def gol_loop(grid, n=1):
    
    old_grid = grid.squeeze().int()
    dim_x, dim_y = old_grid.shape
    my_kernel = torch.tensor([[1,1,1], [1,0,1], [1,1,1]]).int()
    
    while n > 0:
        
        new_grid = torch.zeros_like(old_grid)
        temp_grid = torch.zeros_like(old_grid)
        for xx in range(dim_x):
            for yy in range(dim_y):
                temp_sum = 0
                
                y_stop = 3 if yy < (dim_y-1) else -1
                x_stop = 3 if xx < (dim_x-1) else -1
                
                temp_sum = torch.sum(my_kernel[\
                                     1*(not(xx>0)):x_stop,\
                                     1*(not(yy>0)):y_stop] * old_grid[\
                                                    max(0, xx-1):min(dim_x, xx+2),\
                                                    max(0, yy-1):min(dim_y, yy+2)])
                
                temp_grid[xx,yy] = temp_sum

        new_grid[temp_grid == 3] = 1 
        new_grid[old_grid*temp_grid == 2] = 1
        
        old_grid = new_grid.clone()

        n -= 1
    #if n > 0:
    #    new_grid = gol_step(new_grid, n=n)
    
    return new_grid
    
 if __name__ == "__main__":
    # run benchmarks

  if(0): 
      # draw glider
      grid = torch.zeros(1,1,2048,2048) #256,256)

      grid[0,0,19,17] = 1
      grid[0,0,18,18] = 1
      grid[0,0,17,18] = 1
      grid[0,0,17,17] = 1
      grid[0,0,17,16] = 1

  grid = 1.0 * (torch.rand(1,1,2048,2048) > 0.50)

  for num_steps in [1, 6, 60, 600, 6000]:
      #grid = 1.0 * (torch.rand(1,1,64,64) > 0.50)

      if num_steps < 601:
          t0 = time.time()     
          grid = gol_loop(grid, n=num_steps)
          t1 = time.time()
          print("time for {} gol_loop steps = {:.2e}".format(num_steps, t1-t0))


      grid = 1.0 * (torch.rand(1,1,256,256) > 0.50)

      t2 = time.time()     
      grid = gol_step(grid, n=num_steps)
      t3 = time.time()
      print("(cpu) time for {} gol steps = {:.2e}".format(num_steps, t3-t2))

      if num_steps < 601:
          print("loop/pytorch = {:.4e}".format((t1-t0) / (t3-t2)))

      grid = 1.0 * (torch.rand(1,1,256,256) > 0.50)

      t4 = time.time()     
      grid = gol_step(grid, n=num_steps, device="cuda")
      t5 = time.time()
      print("(gpu) time for {} gol steps = {:.2e}".format(num_steps, t5-t4))
      if num_steps < 601:
          print("loop/pytorch = {:.4e}, loop/gpu_pytorch = {:.4e} pytorch/gpu_pytorch = {:.4e}"\
                .format((t1-t0) / (t3-t2), (t1-t0) / (t5-t4), (t3-t2) / (t5-t4) ))
      else:
          print("pytorch/gpu_pytorch = {:.4e}".format((t3-t2) / (t5-t4) ))
	{
	"cells": [
	{{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import matplotlib.pyplot as plt\n",
	"import numpy as np\n",
	"import time\n",
	"\n",
	"import torch\n",
	"import torch.nn as nn\n",
	"import torch.nn.functional as F"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Speeding Up Conway's Game of Life with PyTorch\n",
	"\n",
	"Cellular automata are the basic building blocks of a massively parallel computing paradigm capable of high levels of complexity and Turing completeness, even for exceedingly simple rule formulations[[0](https://en.wikipedia.org/wiki/Rule_110)]. The most well-known CA universe is undoubtedly John Horton Conway's Game of Life. CA were the subject of much interest in the 1980s and 1990s as computational capabilities made studying complexity more interesting*. \n",
	"\n",
	"CA are suddenly very interesting again (counter: they never stopped being cool) thanks to projects training differentiable CA as an abstract model of ontogenesis[[1](https://distill.pub/2020/growing-ca/)], self-classifying pixels[[2](https://distill.pub/2020/selforg/mnist/)], and connecting CA to convolutional neural networks[[3](https://arxiv.org/abs/1809.02942)], among others.\n",
	"\n",
	"Even in the era of modern machine learning, typically considered to be synonomous with deep learning, CA have plenty of potential to shake things up. The CA approach will make even more sense as the next generation of computational hardware acceleration trends toward designs closely reminiscent to the mathematical formulations of CA universes (<em>e.g.</em> systolic arrays). Conveniently, we can use the tools of deep learning to build CA universes. By formulating CA universe updates in terms of convolutions and matrix multiplies, we can implement them in libraries like PyTorch or TensorFlow taking advantage of computational speed-ups from GPU support and optimized code. As an extra bonus, continuous-valued and differentiable CA universes become possible, making for an exciting new world of computational playgrounds.\n",
	"\n",
	"This notebook contains code demonstrating how to formulate Conway's Game of Life in terms of convolutional layers implemented in PyTorch as well as a loop-based formulation, and shows just how much faster a CA universe can run in the former case. It's intended as a starting point to begin experimenting with your own flexible CA models. \n",
	"\n",
	"Note that if you are mostly interested in building machines in existing CA universes, I highly recommend you check out [Golly](http://golly.sourceforge.net/). Golly is an open source program for experimenting with many different types of CA including Wolfram's 1D CA, Langton's Ant, and many others. Golly is fast, and it uses memoization algorithms like [HashLife](https://en.wikipedia.org/wiki/Hashlife) to take advantage of temporally repeated patterns for additional speed-up.\n",
	"https://en.wikipedia.org/wiki/Hashlife\n",
	"\n",
	"*Keep in mind that Conways devised Game of Life on a Go board during coffee breaks and Von Neumann designed his 29-state CA on pen and paper. \n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# PyTorch and naive implementations of GoL\n",
	"\n",
	"def gol_step(grid, n=1, device=\"cpu\"):\n",
	" \n",
	"\n",
	" if torch.cuda.is_available():\n",
	" device = device\n",
	" else:\n",
	" device = \"cpu\"\n",
	" \n",
	" my_kernel = torch.tensor([[1,1,1], [1,0,1], [1,1,1]]).unsqueeze(0).unsqueeze(0).float().to(device)\n",
	" old_grid = grid.float().to(device)\n",
	" \n",
	" while n > 0:\n",
	" temp_grid = F.conv2d(old_grid, my_kernel, padding=1)#[:,:,1:-1,1:-1]\n",
	" new_grid = torch.zeros_like(old_grid)\n",
	"\n",
	" new_grid[temp_grid == 3] = 1 \n",
	" new_grid[old_grid*temp_grid == 2] = 1\n",
	" \n",
	" old_grid = new_grid.clone()\n",
	"\n",
	" n -= 1\n",
	" #if n > 0:\n",
	" # new_grid = gol_step(new_grid, n=n)\n",
	" \n",
	" return new_grid.to(\"cpu\")\n",
	"\n",
	"\n",
	"def gol_loop(grid, n=1):\n",
	" \n",
	" old_grid = grid.squeeze().int()\n",
	" dim_x, dim_y = old_grid.shape\n",
	" my_kernel = torch.tensor([[1,1,1], [1,0,1], [1,1,1]]).int()\n",
	" \n",
	" while n > 0:\n",
	" \n",
	" new_grid = torch.zeros_like(old_grid)\n",
	" temp_grid = torch.zeros_like(old_grid)\n",
	" for xx in range(dim_x):\n",
	" for yy in range(dim_y):\n",
	" temp_sum = 0\n",
	" \n",
	" y_stop = 3 if yy < (dim_y-1) else -1\n",
	" x_stop = 3 if xx < (dim_x-1) else -1\n",
	" \n",
	" temp_sum = torch.sum(my_kernel[\\\n",
	" 1*(not(xx>0)):x_stop,\\\n",
	" 1(not(yy>0)):y_stop] old_grid[\\\n",
	" max(0, xx-1):min(dim_x, xx+2),\\\n",
	" max(0, yy-1):min(dim_y, yy+2)])\n",
	" \n",
	" temp_grid[xx,yy] = temp_sum\n",
	"\n",
	" new_grid[temp_grid == 3] = 1 \n",
	" new_grid[old_grid*temp_grid == 2] = 1\n",
	" \n",
	" old_grid = new_grid.clone()\n",
	"\n",
	" n -= 1\n",
	" #if n > 0:\n",
	" # new_grid = gol_step(new_grid, n=n)\n",
	" \n",
	" return new_grid\n",
	" \n",
	" "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# run benchmarks\n",
	"\n",
	"if(0): \n",
	" # draw glider\n",
	" grid = torch.zeros(1,1,2048,2048) #256,256)\n",
	"\n",
	" grid[0,0,19,17] = 1\n",
	" grid[0,0,18,18] = 1\n",
	" grid[0,0,17,18] = 1\n",
	" grid[0,0,17,17] = 1\n",
	" grid[0,0,17,16] = 1\n",
	"\n",
	"grid = 1.0 * (torch.rand(1,1,2048,2048) > 0.50)\n",
	"\n",
	"for num_steps in [1, 6, 60, 600, 6000]:\n",
	" #grid = 1.0 * (torch.rand(1,1,64,64) > 0.50)\n",
	" \n",
	" if num_steps < 601:\n",
	" t0 = time.time() \n",
	" grid = gol_loop(grid, n=num_steps)\n",
	" t1 = time.time()\n",
	" print(\"time for {} gol_loop steps = {:.2e}\".format(num_steps, t1-t0))\n",
	"\n",
	" \n",
	" grid = 1.0 * (torch.rand(1,1,256,256) > 0.50)\n",
	"\n",
	" t2 = time.time() \n",
	" grid = gol_step(grid, n=num_steps)\n",
	" t3 = time.time()\n",
	" print(\"(cpu) time for {} gol steps = {:.2e}\".format(num_steps, t3-t2))\n",
	" \n",
	" if num_steps < 601:\n",
	" print(\"loop/pytorch = {:.4e}\".format((t1-t0) / (t3-t2)))\n",
	" \n",
	" grid = 1.0 * (torch.rand(1,1,256,256) > 0.50)\n",
	"\n",
	" t4 = time.time() \n",
	" grid = gol_step(grid, n=num_steps, device=\"cuda\")\n",
	" t5 = time.time()\n",
	" print(\"(gpu) time for {} gol steps = {:.2e}\".format(num_steps, t5-t4))\n",
	" if num_steps < 601:\n",
	" print(\"loop/pytorch = {:.4e}, loop/gpu_pytorch = {:.4e} pytorch/gpu_pytorch = {:.4e}\"\\\n",
	" .format((t1-t0) / (t3-t2), (t1-t0) / (t5-t4), (t3-t2) / (t5-t4) ))\n",
	" else:\n",
	" print(\"pytorch/gpu_pytorch = {:.4e}\".format((t3-t2) / (t5-t4) ))\n",
	" \n",
	" \n",
	"plt.figure()\n",
	"plt.imshow(grid.squeeze())\n",
	"plt.show()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import matplotlib.pyplot as plt\n",
	"import numpy as np\n",
	"import time\n",
	"\n",
	"import torch\n",
	"import torch.nn as nn\n",
	"import torch.nn.functional as F"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Speeding Up Conway's Game of Life with PyTorch\n",
	"\n",
	"Cellular automata are the basic building blocks of a massively parallel computing paradigm capable of high levels of complexity and Turing completeness, even for exceedingly simple rule formulations[[0](https://en.wikipedia.org/wiki/Rule_110)]. The most well-known CA universe is undoubtedly John Horton Conway's Game of Life. CA were the subject of much interest in the 1980s and 1990s as computational capabilities made studying complexity more interesting*. \n",
	"\n",
	"CA are suddenly very interesting again (counter: they never stopped being cool) thanks to projects training differentiable CA as an abstract model of ontogenesis[[1](https://distill.pub/2020/growing-ca/)], self-classifying pixels[[2](https://distill.pub/2020/selforg/mnist/)], and connecting CA to convolutional neural networks[[3](https://arxiv.org/abs/1809.02942)], among others.\n",
	"\n",
	"Even in the era of modern machine learning, typically considered to be synonomous with deep learning, CA have plenty of potential to shake things up. The CA approach will make even more sense as the next generation of computational hardware acceleration trends toward designs closely reminiscent to the mathematical formulations of CA universes (<em>e.g.</em> systolic arrays). Conveniently, we can use the tools of deep learning to build CA universes. By formulating CA universe updates in terms of convolutions and matrix multiplies, we can implement them in libraries like PyTorch or TensorFlow taking advantage of computational speed-ups from GPU support and optimized code. As an extra bonus, continuous-valued and differentiable CA universes become possible, making for an exciting new world of computational playgrounds.\n",
	"\n",
	"This notebook contains code demonstrating how to formulate Conway's Game of Life in terms of convolutional layers implemented in PyTorch as well as a loop-based formulation, and shows just how much faster a CA universe can run in the former case. It's intended as a starting point to begin experimenting with your own flexible CA models. \n",
	"\n",
	"Note that if you are mostly interested in building machines in existing CA universes, I highly recommend you check out [Golly](http://golly.sourceforge.net/). Golly is an open source program for experimenting with many different types of CA including Wolfram's 1D CA, Langton's Ant, and many others. Golly is fast, and it uses memoization algorithms like [HashLife](https://en.wikipedia.org/wiki/Hashlife) to take advantage of temporally repeated patterns for additional speed-up.\n",
	"https://en.wikipedia.org/wiki/Hashlife\n",
	"\n",
	"*Keep in mind that Conways devised Game of Life on a Go board during coffee breaks and Von Neumann designed his 29-state CA on pen and paper. \n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# PyTorch and naive implementations of GoL\n",
	"\n",
	"def gol_step(grid, n=1, device=\"cpu\"):\n",
	" \n",
	"\n",
	" if torch.cuda.is_available():\n",
	" device = device\n",
	" else:\n",
	" device = \"cpu\"\n",
	" \n",
	" my_kernel = torch.tensor([[1,1,1], [1,0,1], [1,1,1]]).unsqueeze(0).unsqueeze(0).float().to(device)\n",
	" old_grid = grid.float().to(device)\n",
	" \n",
	" while n > 0:\n",
	" temp_grid = F.conv2d(old_grid, my_kernel, padding=1)#[:,:,1:-1,1:-1]\n",
	" new_grid = torch.zeros_like(old_grid)\n",
	"\n",
	" new_grid[temp_grid == 3] = 1 \n",
	" new_grid[old_grid*temp_grid == 2] = 1\n",
	" \n",
	" old_grid = new_grid.clone()\n",
	"\n",
	" n -= 1\n",
	" #if n > 0:\n",
	" # new_grid = gol_step(new_grid, n=n)\n",
	" \n",
	" return new_grid.to(\"cpu\")\n",
	"\n",
	"\n",
	"def gol_loop(grid, n=1):\n",
	" \n",
	" old_grid = grid.squeeze().int()\n",
	" dim_x, dim_y = old_grid.shape\n",
	" my_kernel = torch.tensor([[1,1,1], [1,0,1], [1,1,1]]).int()\n",
	" \n",
	" while n > 0:\n",
	" \n",
	" new_grid = torch.zeros_like(old_grid)\n",
	" temp_grid = torch.zeros_like(old_grid)\n",
	" for xx in range(dim_x):\n",
	" for yy in range(dim_y):\n",
	" temp_sum = 0\n",
	" \n",
	" y_stop = 3 if yy < (dim_y-1) else -1\n",
	" x_stop = 3 if xx < (dim_x-1) else -1\n",
	" \n",
	" temp_sum = torch.sum(my_kernel[\\\n",
	" 1*(not(xx>0)):x_stop,\\\n",
	" 1(not(yy>0)):y_stop] old_grid[\\\n",
	" max(0, xx-1):min(dim_x, xx+2),\\\n",
	" max(0, yy-1):min(dim_y, yy+2)])\n",
	" \n",
	" temp_grid[xx,yy] = temp_sum\n",
	"\n",
	" new_grid[temp_grid == 3] = 1 \n",
	" new_grid[old_grid*temp_grid == 2] = 1\n",
	" \n",
	" old_grid = new_grid.clone()\n",
	"\n",
	" n -= 1\n",
	" #if n > 0:\n",
	" # new_grid = gol_step(new_grid, n=n)\n",
	" \n",
	" return new_grid\n",
	" \n",
	" "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# run benchmarks\n",
	"\n",
	"if(0): \n",
	" # draw glider\n",
	" grid = torch.zeros(1,1,2048,2048) #256,256)\n",
	"\n",
	" grid[0,0,19,17] = 1\n",
	" grid[0,0,18,18] = 1\n",
	" grid[0,0,17,18] = 1\n",
	" grid[0,0,17,17] = 1\n",
	" grid[0,0,17,16] = 1\n",
	"\n",
	"grid = 1.0 * (torch.rand(1,1,2048,2048) > 0.50)\n",
	"\n",
	"for num_steps in [1, 6, 60, 600, 6000]:\n",
	" #grid = 1.0 * (torch.rand(1,1,64,64) > 0.50)\n",
	" \n",
	" if num_steps < 601:\n",
	" t0 = time.time() \n",
	" grid = gol_loop(grid, n=num_steps)\n",
	" t1 = time.time()\n",
	" print(\"time for {} gol_loop steps = {:.2e}\".format(num_steps, t1-t0))\n",
	"\n",
	" \n",
	" grid = 1.0 * (torch.rand(1,1,256,256) > 0.50)\n",
	"\n",
	" t2 = time.time() \n",
	" grid = gol_step(grid, n=num_steps)\n",
	" t3 = time.time()\n",
	" print(\"(cpu) time for {} gol steps = {:.2e}\".format(num_steps, t3-t2))\n",
	" \n",
	" if num_steps < 601:\n",
	" print(\"loop/pytorch = {:.4e}\".format((t1-t0) / (t3-t2)))\n",
	" \n",
	" grid = 1.0 * (torch.rand(1,1,256,256) > 0.50)\n",
	"\n",
	" t4 = time.time() \n",
	" grid = gol_step(grid, n=num_steps, device=\"cuda\")\n",
	" t5 = time.time()\n",
	" print(\"(gpu) time for {} gol steps = {:.2e}\".format(num_steps, t5-t4))\n",
	" if num_steps < 601:\n",
	" print(\"loop/pytorch = {:.4e}, loop/gpu_pytorch = {:.4e} pytorch/gpu_pytorch = {:.4e}\"\\\n",
	" .format((t1-t0) / (t3-t2), (t1-t0) / (t5-t4), (t3-t2) / (t5-t4) ))\n",
	" else:\n",
	" print(\"pytorch/gpu_pytorch = {:.4e}\".format((t3-t2) / (t5-t4) ))\n",
	" \n",
	" \n",
	"plt.figure()\n",
	"plt.imshow(grid.squeeze())\n",
	"plt.show()"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}
	import numpy as np
	import matplotlib.pyplot as plt
	import numpy as np
	import time

	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	# PyTorch and naive implementations of GoL

	def gol_step(grid, n=1, device="cpu"):


	if torch.cuda.is_available():
	device = device
	else:
	device = "cpu"

	my_kernel = torch.tensor([[1,1,1], [1,0,1], [1,1,1]]).unsqueeze(0).unsqueeze(0).float().to(device)
	old_grid = grid.float().to(device)

	while n > 0:
	temp_grid = F.conv2d(old_grid, my_kernel, padding=1)#[:,:,1:-1,1:-1]
	new_grid = torch.zeros_like(old_grid)

	new_grid[temp_grid == 3] = 1
	new_grid[old_grid*temp_grid == 2] = 1

	old_grid = new_grid.clone()

	n -= 1
	#if n > 0:
	# new_grid = gol_step(new_grid, n=n)

	return new_grid.to("cpu")


	def gol_loop(grid, n=1):

	old_grid = grid.squeeze().int()
	dim_x, dim_y = old_grid.shape
	my_kernel = torch.tensor([[1,1,1], [1,0,1], [1,1,1]]).int()

	while n > 0:

	new_grid = torch.zeros_like(old_grid)
	temp_grid = torch.zeros_like(old_grid)
	for xx in range(dim_x):
	for yy in range(dim_y):
	temp_sum = 0

	y_stop = 3 if yy < (dim_y-1) else -1
	x_stop = 3 if xx < (dim_x-1) else -1

	temp_sum = torch.sum(my_kernel[\
	1*(not(xx>0)):x_stop,\
	1(not(yy>0)):y_stop] old_grid[\
	max(0, xx-1):min(dim_x, xx+2),\
	max(0, yy-1):min(dim_y, yy+2)])

	temp_grid[xx,yy] = temp_sum

	new_grid[temp_grid == 3] = 1
	new_grid[old_grid*temp_grid == 2] = 1

	old_grid = new_grid.clone()

	n -= 1
	#if n > 0:
	# new_grid = gol_step(new_grid, n=n)

	return new_grid

	if __name__ == "__main__":
	# run benchmarks

	if(0):
	# draw glider
	grid = torch.zeros(1,1,2048,2048) #256,256)

	grid[0,0,19,17] = 1
	grid[0,0,18,18] = 1
	grid[0,0,17,18] = 1
	grid[0,0,17,17] = 1
	grid[0,0,17,16] = 1

	grid = 1.0 * (torch.rand(1,1,2048,2048) > 0.50)

	for num_steps in [1, 6, 60, 600, 6000]:
	#grid = 1.0 * (torch.rand(1,1,64,64) > 0.50)

	if num_steps < 601:
	t0 = time.time()
	grid = gol_loop(grid, n=num_steps)
	t1 = time.time()
	print("time for {} gol_loop steps = {:.2e}".format(num_steps, t1-t0))


	grid = 1.0 * (torch.rand(1,1,256,256) > 0.50)

	t2 = time.time()
	grid = gol_step(grid, n=num_steps)
	t3 = time.time()
	print("(cpu) time for {} gol steps = {:.2e}".format(num_steps, t3-t2))

	if num_steps < 601:
	print("loop/pytorch = {:.4e}".format((t1-t0) / (t3-t2)))

	grid = 1.0 * (torch.rand(1,1,256,256) > 0.50)

	t4 = time.time()
	grid = gol_step(grid, n=num_steps, device="cuda")
	t5 = time.time()
	print("(gpu) time for {} gol steps = {:.2e}".format(num_steps, t5-t4))
	if num_steps < 601:
	print("loop/pytorch = {:.4e}, loop/gpu_pytorch = {:.4e} pytorch/gpu_pytorch = {:.4e}"\
	.format((t1-t0) / (t3-t2), (t1-t0) / (t5-t4), (t3-t2) / (t5-t4) ))
	else:
	print("pytorch/gpu_pytorch = {:.4e}".format((t3-t2) / (t5-t4) ))