Created
October 10, 2021 00:15
-
-
Save bwasti/29197f4931e99ff431a0433515b58123 to your computer and use it in GitHub Desktop.
loop_tool_lowlevel_sweep.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "loop_tool_lowlevel_sweep.ipynb", | |
"provenance": [], | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/bwasti/29197f4931e99ff431a0433515b58123/loop_tool_lowlevel_sweep.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "DsWTNf6MT4vz", | |
"outputId": "9e6cd04e-f5e3-4775-f783-e414db7a9555" | |
}, | |
"source": [ | |
"!pip install loop_tool_py" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Collecting loop_tool_py\n", | |
" Downloading loop_tool_py-0.0.8-3-cp37-cp37m-manylinux_2_24_x86_64.whl (378 kB)\n", | |
"\u001b[?25l\r\u001b[K |▉ | 10 kB 27.2 MB/s eta 0:00:01\r\u001b[K |█▊ | 20 kB 28.8 MB/s eta 0:00:01\r\u001b[K |██▋ | 30 kB 33.2 MB/s eta 0:00:01\r\u001b[K |███▌ | 40 kB 23.6 MB/s eta 0:00:01\r\u001b[K |████▎ | 51 kB 18.8 MB/s eta 0:00:01\r\u001b[K |█████▏ | 61 kB 18.7 MB/s eta 0:00:01\r\u001b[K |██████ | 71 kB 19.0 MB/s eta 0:00:01\r\u001b[K |███████ | 81 kB 20.0 MB/s eta 0:00:01\r\u001b[K |███████▉ | 92 kB 20.8 MB/s eta 0:00:01\r\u001b[K |████████▋ | 102 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████▌ | 112 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████▍ | 122 kB 21.7 MB/s eta 0:00:01\r\u001b[K |███████████▎ | 133 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████ | 143 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████████ | 153 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████████▉ | 163 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████████▊ | 174 kB 21.7 MB/s eta 0:00:01\r\u001b[K |███████████████▋ | 184 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████████▍ | 194 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████████████▎ | 204 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████████████▏ | 215 kB 21.7 MB/s eta 0:00:01\r\u001b[K |███████████████████ | 225 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████████████ | 235 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████████████▊ | 245 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████████████████▋ | 256 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████████████████▌ | 266 kB 21.7 MB/s eta 0:00:01\r\u001b[K |███████████████████████▍ | 276 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████████████████▏ | 286 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 296 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████████████████████ | 307 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████████████████████▉ | 317 kB 21.7 MB/s eta 0:00:01\r\u001b[K |███████████████████████████▊ | 327 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████████████████████▌ | 337 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▍ | 348 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▎ | 358 kB 21.7 MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▏| 368 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 378 kB 21.7 MB/s \n", | |
"\u001b[?25hCollecting pybind11\n", | |
" Downloading pybind11-2.8.0-py2.py3-none-any.whl (207 kB)\n", | |
"\u001b[K |████████████████████████████████| 207 kB 42.2 MB/s \n", | |
"\u001b[?25hInstalling collected packages: pybind11, loop-tool-py\n", | |
"Successfully installed loop-tool-py-0.0.8 pybind11-2.8.0\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 248 | |
}, | |
"id": "zJe5zowCT3c8", | |
"outputId": "ff029bd9-9c1a-47f9-c4e7-e17d52e02b0e" | |
}, | |
"source": [ | |
"import loop_tool_py as lt\n", | |
"import numpy as np\n", | |
"import time\n", | |
"\n", | |
"assert \"cuda\" in lt.backends(), \"this needs to be run on a GPU enabled notebook\"\n", | |
"lt.set_default_hardware(\"cuda\")\n", | |
"\n", | |
"def gen_pw_add():\n", | |
" ir = lt.IR()\n", | |
" a = ir.create_var(\"a\")\n", | |
" r0 = ir.create_node(lt.read, [], [a])\n", | |
" r1 = ir.create_node(lt.read, [], [a])\n", | |
" add = ir.create_node(lt.add, [r0, r1], [a])\n", | |
" w = ir.create_node(lt.write, [add], [a])\n", | |
" ir.set_inputs([r0, r1])\n", | |
" ir.set_outputs([w])\n", | |
" return ir, a\n", | |
"\n", | |
"\n", | |
"def test_pw(size, inner_size, vec_size):\n", | |
" assert size >= (inner_size * vec_size)\n", | |
" ir, v = gen_pw_add() # v = pointwise var\n", | |
" size_map = {}\n", | |
" size_map[v] = size\n", | |
" for n in ir.nodes:\n", | |
" outer = size // (inner_size * vec_size)\n", | |
" outer_rem = size % (inner_size * vec_size)\n", | |
"\n", | |
" ir.set_order(\n", | |
" n, [(v, (outer, outer_rem)), (v, (inner_size, 0)), (v, (vec_size, 0))]\n", | |
" )\n", | |
" ir.disable_reuse(n, 2)\n", | |
" loop_tree = lt.LoopTree(ir)\n", | |
" A = lt.RawTensor(size)\n", | |
" B = lt.RawTensor(size)\n", | |
" C = lt.RawTensor(size)\n", | |
" Ap = np.random.randn(size)\n", | |
" Bp = np.random.randn(size)\n", | |
" A.set(Ap)\n", | |
" B.set(Bp)\n", | |
" C_ref = Ap + Bp\n", | |
" C.set(1337.0)\n", | |
" parallel = set(loop_tree.children(loop_tree.roots[0]))\n", | |
" c = lt.cuda(loop_tree, parallel)\n", | |
" c([A, B, C])\n", | |
" C_test = C.to_numpy()\n", | |
" max_diff = np.max(np.abs(C_test - C_ref))\n", | |
" mean_val = np.mean(np.abs(C_ref))\n", | |
" assert max_diff < 1e-3 * mean_val\n", | |
" iters = 10000\n", | |
" # warmup\n", | |
" for i in range(50):\n", | |
" c([A, B, C])\n", | |
" t = time.time()\n", | |
" for i in range(iters - 1):\n", | |
" c([A, B, C], False)\n", | |
" c([A, B, C])\n", | |
" t_ = time.time()\n", | |
" # 2 read 1 write, 4 bytes per float\n", | |
" bytes_moved = (2 + 1) * 4 * size * iters / (t_ - t) / 1e9\n", | |
" pct = bytes_moved / c.bandwidth\n", | |
" usec = (t_ - t) / iters * 1e6\n", | |
" # print(f\"peak: {c.bandwidth} GB/sec\")\n", | |
" print(\n", | |
" f\"{bytes_moved:.2f} GB/sec\",\n", | |
" f\"({100 * pct:.2f}% of peak, {usec:.2f} usec per iter)\",\n", | |
" )\n", | |
" return (\n", | |
" bytes_moved,\n", | |
" c.code,\n", | |
" loop_tree.dump(lambda x: \"// Threaded\" if x in parallel else \"\"),\n", | |
" )\n", | |
"\n", | |
"\n", | |
"s = 1024 * 1024\n", | |
"best = 0\n", | |
"code = \"\"\n", | |
"loop_tree = \"\"\n", | |
"inner_scale = 512 * 8\n", | |
"for i in range(1, s // inner_scale):\n", | |
" inner = i * inner_scale\n", | |
" for vec_pow in range(0, 3):\n", | |
" vec = 2 ** vec_pow\n", | |
" inner = inner // vec\n", | |
" b, c, l = test_pw(s, inner, vec)\n", | |
" if b > best:\n", | |
" best = b\n", | |
" code = c\n", | |
" loop_tree = l\n", | |
"print(f\"Best kernel found ({best:.2f} GB/sec):\")\n", | |
"print(loop_tree)\n", | |
"print(code)" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "error", | |
"ename": "AssertionError", | |
"evalue": "ignored", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", | |
"\u001b[0;32m<ipython-input-2-6393985f1619>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0;34m\"cuda\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mlt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackends\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"this needs to be run on a GPU enabled notebook\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mlt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_default_hardware\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"cuda\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
"\u001b[0;31mAssertionError\u001b[0m: this needs to be run on a GPU enabled notebook" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "EnFO-MYYUZmA" | |
}, | |
"source": [ | |
"" | |
], | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment