Last active
December 9, 2022 14:22
-
-
Save PiotrCzapla/90c446929d2d699a8fdce65bc8a95f38 to your computer and use it in GitHub Desktop.
Clean up a memory leak after unhandled exception in jupyter notebooks.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "65ec7d4d-f3e0-4d08-8258-02d70e79befc", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"!pip install -Uqq fastai fastcore kaggle\n", | |
"!pip install -Uqq 'timm>=0.6.2.dev0' pynvml \n", | |
"!pip install -Uq objgraph\n", | |
"!pip install -Uq fastkaggle" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"id": "a67420a1", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"interactive=False" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"id": "c0539d9a-0007-4073-908e-88b0163dbaf8", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from fastkaggle import *" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"id": "52c62e9f-81e2-4212-9e75-bc9666586040", | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Fri Dec 9 13:47:56 2022 \n", | |
"+-----------------------------------------------------------------------------+\n", | |
"| NVIDIA-SMI 515.65.01 Driver Version: 515.65.01 CUDA Version: 11.7 |\n", | |
"|-------------------------------+----------------------+----------------------+\n", | |
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", | |
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", | |
"| | | MIG M. |\n", | |
"|===============================+======================+======================|\n", | |
"| 0 NVIDIA GeForce ... Off | 00000000:06:00.0 Off | N/A |\n", | |
"| 18% 41C P0 43W / 250W | 0MiB / 11264MiB | 0% Default |\n", | |
"| | | N/A |\n", | |
"+-------------------------------+----------------------+----------------------+\n", | |
" \n", | |
"+-----------------------------------------------------------------------------+\n", | |
"| Processes: |\n", | |
"| GPU GI CI PID Type Process name GPU Memory |\n", | |
"| ID ID Usage |\n", | |
"|=============================================================================|\n", | |
"| No running processes found |\n", | |
"+-----------------------------------------------------------------------------+\n" | |
] | |
} | |
], | |
"source": [ | |
"!nvidia-smi" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"id": "473d8626-0d50-4acd-bada-25dcd2c6ba2d", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import kaggle\n", | |
"import timm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "00ebb94d-c325-4772-814a-6deb01a30626", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"comp = 'paddy-disease-classification'\n", | |
"path = setup_comp(comp, install='fastai \"timm>=0.6.2.dev0\"')\n", | |
"from fastai.vision.all import *\n", | |
"set_seed(42)\n", | |
"\n", | |
"tst_files = get_image_files(path/'test_images').sorted()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"id": "56b32cf8-82e1-4214-b9ee-abf0f912d0ba", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"trn_path = path/'train_images_min'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"id": "79d01af4", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [], | |
"source": [ | |
"for v in (path/'train_images').glob('*'):\n", | |
" dst = trn_path.absolute()/v.name\n", | |
" dst.mkdir(exist_ok=True, parents=True)\n", | |
" for p in list(sorted(v.glob('*')))[:20]:\n", | |
" f = dst/p.name\n", | |
" if not f.exists(): \n", | |
" f.symlink_to(p.absolute())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "e39a463d-ead3-4698-b5c4-8975d867c7cd", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def broken_error_rate(inp, targ, axis=-1):\n", | |
" return 1/0\n", | |
"\n", | |
"def train(arch, accum=1, raise_exception=False, cbs=[]):\n", | |
" epochs=1; bs=96; size=192; val_seed=42\n", | |
" dls = ImageDataLoaders.from_folder(trn_path, valid_pct=0.3, item_tfms=Resize(480, method='squish'),\n", | |
" splitter=RandomSplitter(0.3, seed=val_seed),\n", | |
" batch_tfms=aug_transforms(size=size, min_scale=0.75), bs=bs//accum)\n", | |
" cbs = cbs + [GradientAccumulation(bs)]\n", | |
" metrics = broken_error_rate if raise_exception else error_rate\n", | |
" learn = vision_learner(dls, arch, metrics=metrics, cbs=cbs).to_fp16()\n", | |
" lr = 0.01\n", | |
" learn.unfreeze()\n", | |
" learn.fit_one_cycle(epochs, lr)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "06857628", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Counter()\n", | |
"Reserved mem in use: 0 MB\n" | |
] | |
} | |
], | |
"source": [ | |
"import sys,gc\n", | |
"import objgraph\n", | |
"def print_tensor_count(message=\"\"):\n", | |
" gc.collect()\n", | |
" torch.cuda.empty_cache()\n", | |
" if message:\n", | |
" print(message)\n", | |
" print(collections.Counter(v.device.type if v.numel() else 'empty' for v in objgraph.by_type('torch.Tensor')))\n", | |
" print(\"Reserved mem in use:\", torch.cuda.memory_stats().get('reserved_bytes.all.current',0)//1024**2, \"MB\")\n", | |
"print_tensor_count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"id": "8664bf5b", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def train_safe(arch='convnext_large', accum=4,**kwargs): \n", | |
" def execute():\n", | |
" print(f'Using {arch}, with accum={accum}')\n", | |
" try: \n", | |
" train(arch, accum=accum, **kwargs)\n", | |
" return True\n", | |
" except Exception as e:\n", | |
" print('Exception', e)\n", | |
" if not execute():\n", | |
" gc.collect()\n", | |
" torch.cuda.empty_cache()\n", | |
" gc.collect()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"id": "7826a643", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"to ensure we don't have any tensors yet\n", | |
"Counter()\n", | |
"Reserved mem in use: 0 MB\n" | |
] | |
} | |
], | |
"source": [ | |
"print_tensor_count(\"to ensure we don't have any tensors yet\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"id": "1fc4c2ff", | |
"metadata": { | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Using convnext_large, with accum=4\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<style>\n", | |
" /* Turns off some styling */\n", | |
" progress {\n", | |
" /* gets rid of default border in Firefox and Opera. */\n", | |
" border: none;\n", | |
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n", | |
" background-size: auto;\n", | |
" }\n", | |
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n", | |
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n", | |
" }\n", | |
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n", | |
" background: #F44336;\n", | |
" }\n", | |
"</style>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: left;\">\n", | |
" <th>epoch</th>\n", | |
" <th>train_loss</th>\n", | |
" <th>valid_loss</th>\n", | |
" <th>error_rate</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>0</td>\n", | |
" <td>3.980382</td>\n", | |
" <td>11.090388</td>\n", | |
" <td>0.900000</td>\n", | |
" <td>00:02</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"train_safe() # this should pass" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "d590a5cb", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Do we have any tensors hanging around?\n", | |
"Counter({'cuda': 9})\n", | |
"Reserved mem in use: 4 MB\n" | |
] | |
} | |
], | |
"source": [ | |
"print_tensor_count(\"Do we have any tensors hanging around?\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"id": "d0a71a8e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Using convnext_large, with accum=4\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<style>\n", | |
" /* Turns off some styling */\n", | |
" progress {\n", | |
" /* gets rid of default border in Firefox and Opera. */\n", | |
" border: none;\n", | |
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n", | |
" background-size: auto;\n", | |
" }\n", | |
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n", | |
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n", | |
" }\n", | |
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n", | |
" background: #F44336;\n", | |
" }\n", | |
"</style>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: left;\">\n", | |
" <th>epoch</th>\n", | |
" <th>train_loss</th>\n", | |
" <th>valid_loss</th>\n", | |
" <th>error_rate</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>0</td>\n", | |
" <td>3.679147</td>\n", | |
" <td>3.564103</td>\n", | |
" <td>0.933333</td>\n", | |
" <td>00:02</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"train_safe() # and after another pass on large model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"id": "eebde136", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Is it still 7 after another pass?\n", | |
"Counter({'cuda': 9})\n", | |
"Reserved mem in use: 6 MB\n" | |
] | |
} | |
], | |
"source": [ | |
"print_tensor_count(\"Is it still 7 after another pass?\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"id": "887b66fb", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# lets raise some exceptions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"id": "a83908ff", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Using convnext_large, with accum=4\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<style>\n", | |
" /* Turns off some styling */\n", | |
" progress {\n", | |
" /* gets rid of default border in Firefox and Opera. */\n", | |
" border: none;\n", | |
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n", | |
" background-size: auto;\n", | |
" }\n", | |
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n", | |
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n", | |
" }\n", | |
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n", | |
" background: #F44336;\n", | |
" }\n", | |
"</style>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
" <div>\n", | |
" <progress value='0' class='' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", | |
" 0.00% [0/1 00:00<?]\n", | |
" </div>\n", | |
" \n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: left;\">\n", | |
" <th>epoch</th>\n", | |
" <th>train_loss</th>\n", | |
" <th>valid_loss</th>\n", | |
" <th>broken_error_rate</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" </tbody>\n", | |
"</table><p>\n", | |
"\n", | |
" <div>\n", | |
" <progress value='0' class='' max='3' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", | |
" 0.00% [0/3 00:00<?]\n", | |
" </div>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Exception Exception occured in `Recorder` when calling event `after_batch`:\n", | |
"\tdivision by zero\n" | |
] | |
} | |
], | |
"source": [ | |
"train_safe(raise_exception=True) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"id": "c5ef5e48", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Is it still 7 after another pass?\n", | |
"Counter({'cuda': 9})\n", | |
"Reserved mem in use: 4 MB\n" | |
] | |
} | |
], | |
"source": [ | |
"print_tensor_count(\"Is it still 7 after another pass?\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"id": "75d51d6d", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"806" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sum([v.numel() for v in objgraph.by_type('torch.Tensor')]) # are they still small" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"id": "e671584c", | |
"metadata": { | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Using convnext_large, with accum=4\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<style>\n", | |
" /* Turns off some styling */\n", | |
" progress {\n", | |
" /* gets rid of default border in Firefox and Opera. */\n", | |
" border: none;\n", | |
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n", | |
" background-size: auto;\n", | |
" }\n", | |
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n", | |
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n", | |
" }\n", | |
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n", | |
" background: #F44336;\n", | |
" }\n", | |
"</style>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: left;\">\n", | |
" <th>epoch</th>\n", | |
" <th>train_loss</th>\n", | |
" <th>valid_loss</th>\n", | |
" <th>error_rate</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>0</td>\n", | |
" <td>3.905663</td>\n", | |
" <td>3.984396</td>\n", | |
" <td>0.850000</td>\n", | |
" <td>00:02</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"train_safe() # let see if we can run a model that bearly fits in our memory" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"id": "7c2bcd40", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# And here is what happens if we don't handle exceptions " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"id": "a9653192", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Is it still 7 after another pass?\n", | |
"Counter({'cuda': 9})\n", | |
"Reserved mem in use: 6 MB\n", | |
"|===========================================================================|\n", | |
"| PyTorch CUDA memory summary, device ID 0 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| CUDA OOMs: 0 | cudaMalloc retries: 0 |\n", | |
"|===========================================================================|\n", | |
"| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Allocated memory | 3584 B | 6745 MB | 414539 MB | 414539 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Active memory | 3584 B | 6745 MB | 414539 MB | 414539 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| GPU reserved memory | 6144 KB | 7338 MB | 26026 MB | 26020 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Non-releasable memory | 6140 KB | 545942 KB | 174387 MB | 174381 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Allocations | 7 | 1957 | 59399 | 59392 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Active allocs | 7 | 1957 | 59399 | 59392 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| GPU reserved segments | 3 | 349 | 1203 | 1200 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Non-releasable allocs | 8 | 289 | 38714 | 38706 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Oversize allocations | 0 | 0 | 0 | 0 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Oversize GPU segments | 0 | 0 | 0 | 0 |\n", | |
"|===========================================================================|\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"print_tensor_count(\"Is it still 7 after another pass?\")\n", | |
"print(torch.cuda.memory_summary(abbreviated=True))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"id": "1dc9fe4c", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from contextlib import contextmanager\n", | |
"\n", | |
"@contextmanager\n", | |
"def nbdev_allow_exception(*args, **kwds):\n", | |
" try:\n", | |
" yield None\n", | |
" except Exception as e:\n", | |
" if interactive: raise\n", | |
" else:\n", | |
" print(\"Simulating unhandled exception!\")\n", | |
" sys.last_value=e\n", | |
" sys.last_traceback=e.__traceback__\n", | |
" sys.last_type=type(e)\n", | |
" sys.excepthook(sys.last_type, e, e.__traceback__)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"id": "44a4770e", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/pczapla/.local/share/miniconda/lib/python3.9/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.\n", | |
" warnings.warn(\n", | |
"/home/pczapla/.local/share/miniconda/lib/python3.9/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=ConvNeXt_Large_Weights.IMAGENET1K_V1`. You can also use `weights=ConvNeXt_Large_Weights.DEFAULT` to get the most up-to-date weights.\n", | |
" warnings.warn(msg)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<style>\n", | |
" /* Turns off some styling */\n", | |
" progress {\n", | |
" /* gets rid of default border in Firefox and Opera. */\n", | |
" border: none;\n", | |
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n", | |
" background-size: auto;\n", | |
" }\n", | |
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n", | |
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n", | |
" }\n", | |
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n", | |
" background: #F44336;\n", | |
" }\n", | |
"</style>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
" <div>\n", | |
" <progress value='0' class='' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", | |
" 0.00% [0/1 00:00<?]\n", | |
" </div>\n", | |
" \n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: left;\">\n", | |
" <th>epoch</th>\n", | |
" <th>train_loss</th>\n", | |
" <th>valid_loss</th>\n", | |
" <th>broken_error_rate</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" </tbody>\n", | |
"</table><p>\n", | |
"\n", | |
" <div>\n", | |
" <progress value='0' class='' max='3' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", | |
" 0.00% [0/3 00:00<?]\n", | |
" </div>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Simulating unhandled exception!\n" | |
] | |
}, | |
{ | |
"ename": "ZeroDivisionError", | |
"evalue": "Exception occured in `Recorder` when calling event `after_batch`:\n\tdivision by zero", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mZeroDivisionError\u001b[0m Traceback (most recent call last)", | |
"Cell \u001b[0;32mIn [24], line 6\u001b[0m, in \u001b[0;36mnbdev_allow_exception\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;129m@contextmanager\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mnbdev_allow_exception\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m----> 6\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m interactive: \u001b[38;5;28;01mraise\u001b[39;00m\n", | |
"Cell \u001b[0;32mIn [25], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m nbdev_allow_exception(): train(convnext_large, accum\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m, raise_exception\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", | |
"Cell \u001b[0;32mIn [9], line 14\u001b[0m, in \u001b[0;36mtrain\u001b[0;34m(arch, accum, raise_exception, cbs)\u001b[0m\n\u001b[1;32m 12\u001b[0m lr \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.01\u001b[39m\n\u001b[1;32m 13\u001b[0m learn\u001b[38;5;241m.\u001b[39munfreeze()\n\u001b[0;32m---> 14\u001b[0m \u001b[43mlearn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_one_cycle\u001b[49m\u001b[43m(\u001b[49m\u001b[43mepochs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlr\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/callback/schedule.py:119\u001b[0m, in \u001b[0;36mfit_one_cycle\u001b[0;34m(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt, start_epoch)\u001b[0m\n\u001b[1;32m 116\u001b[0m lr_max \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray([h[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m h \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mopt\u001b[38;5;241m.\u001b[39mhypers])\n\u001b[1;32m 117\u001b[0m scheds \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m'\u001b[39m: combined_cos(pct_start, lr_max\u001b[38;5;241m/\u001b[39mdiv, lr_max, lr_max\u001b[38;5;241m/\u001b[39mdiv_final),\n\u001b[1;32m 118\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmom\u001b[39m\u001b[38;5;124m'\u001b[39m: combined_cos(pct_start, \u001b[38;5;241m*\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmoms \u001b[38;5;28;01mif\u001b[39;00m moms \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m moms))}\n\u001b[0;32m--> 119\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn_epoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcbs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mParamScheduler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mscheds\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43mL\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcbs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreset_opt\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreset_opt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwd\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwd\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart_epoch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstart_epoch\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:256\u001b[0m, in \u001b[0;36mLearner.fit\u001b[0;34m(self, n_epoch, lr, wd, cbs, reset_opt, start_epoch)\u001b[0m\n\u001b[1;32m 254\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mopt\u001b[38;5;241m.\u001b[39mset_hypers(lr\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlr \u001b[38;5;28;01mif\u001b[39;00m lr \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m lr)\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_epoch \u001b[38;5;241m=\u001b[39m n_epoch\n\u001b[0;32m--> 256\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_with_events\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_do_fit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mfit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mCancelFitException\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_end_cleanup\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:193\u001b[0m, in \u001b[0;36mLearner._with_events\u001b[0;34m(self, f, event_type, ex, final)\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_with_events\u001b[39m(\u001b[38;5;28mself\u001b[39m, f, event_type, ex, final\u001b[38;5;241m=\u001b[39mnoop):\n\u001b[0;32m--> 193\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbefore_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ex: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_cancel_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 195\u001b[0m \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); final()\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:245\u001b[0m, in \u001b[0;36mLearner._do_fit\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m epoch \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_epoch):\n\u001b[1;32m 244\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mepoch\u001b[38;5;241m=\u001b[39mepoch\n\u001b[0;32m--> 245\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_with_events\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_do_epoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mepoch\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mCancelEpochException\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:193\u001b[0m, in \u001b[0;36mLearner._with_events\u001b[0;34m(self, f, event_type, ex, final)\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_with_events\u001b[39m(\u001b[38;5;28mself\u001b[39m, f, event_type, ex, final\u001b[38;5;241m=\u001b[39mnoop):\n\u001b[0;32m--> 193\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbefore_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ex: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_cancel_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 195\u001b[0m \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); final()\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:240\u001b[0m, in \u001b[0;36mLearner._do_epoch\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_do_epoch\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 239\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_do_epoch_train()\n\u001b[0;32m--> 240\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_do_epoch_validate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:236\u001b[0m, in \u001b[0;36mLearner._do_epoch_validate\u001b[0;34m(self, ds_idx, dl)\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dl \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m: dl \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdls[ds_idx]\n\u001b[1;32m 235\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdl \u001b[38;5;241m=\u001b[39m dl\n\u001b[0;32m--> 236\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad(): \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_with_events\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mall_batches\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mvalidate\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mCancelValidException\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:193\u001b[0m, in \u001b[0;36mLearner._with_events\u001b[0;34m(self, f, event_type, ex, final)\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_with_events\u001b[39m(\u001b[38;5;28mself\u001b[39m, f, event_type, ex, final\u001b[38;5;241m=\u001b[39mnoop):\n\u001b[0;32m--> 193\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbefore_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ex: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_cancel_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 195\u001b[0m \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); final()\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:199\u001b[0m, in \u001b[0;36mLearner.all_batches\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mall_batches\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 198\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_iter \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdl)\n\u001b[0;32m--> 199\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m o \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdl): \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mone_batch\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mo\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:227\u001b[0m, in \u001b[0;36mLearner.one_batch\u001b[0;34m(self, i, b)\u001b[0m\n\u001b[1;32m 225\u001b[0m b \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_set_device(b)\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_split(b)\n\u001b[0;32m--> 227\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_with_events\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_do_one_batch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbatch\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mCancelBatchException\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:195\u001b[0m, in \u001b[0;36mLearner._with_events\u001b[0;34m(self, f, event_type, ex, final)\u001b[0m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbefore_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); f()\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ex: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_cancel_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m--> 195\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mafter_\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mevent_type\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m; final()\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:171\u001b[0m, in \u001b[0;36mLearner.__call__\u001b[0;34m(self, event_name)\u001b[0m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, event_name): \u001b[43mL\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent_name\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_one\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/.local/share/miniconda/lib/python3.9/site-packages/fastcore/foundation.py:156\u001b[0m, in \u001b[0;36mL.map\u001b[0;34m(self, f, gen, *args, **kwargs)\u001b[0m\n\u001b[0;32m--> 156\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mmap\u001b[39m(\u001b[38;5;28mself\u001b[39m, f, \u001b[38;5;241m*\u001b[39margs, gen\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs): \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_new(\u001b[43mmap_ex\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgen\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgen\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m)\n", | |
"File \u001b[0;32m~/.local/share/miniconda/lib/python3.9/site-packages/fastcore/basics.py:840\u001b[0m, in \u001b[0;36mmap_ex\u001b[0;34m(iterable, f, gen, *args, **kwargs)\u001b[0m\n\u001b[1;32m 838\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmap\u001b[39m(g, iterable)\n\u001b[1;32m 839\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m gen: \u001b[38;5;28;01mreturn\u001b[39;00m res\n\u001b[0;32m--> 840\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mres\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/.local/share/miniconda/lib/python3.9/site-packages/fastcore/basics.py:825\u001b[0m, in \u001b[0;36mbind.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 823\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(v,_Arg): kwargs[k] \u001b[38;5;241m=\u001b[39m args\u001b[38;5;241m.\u001b[39mpop(v\u001b[38;5;241m.\u001b[39mi)\n\u001b[1;32m 824\u001b[0m fargs \u001b[38;5;241m=\u001b[39m [args[x\u001b[38;5;241m.\u001b[39mi] \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(x, _Arg) \u001b[38;5;28;01melse\u001b[39;00m x \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpargs] \u001b[38;5;241m+\u001b[39m args[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmaxi\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m1\u001b[39m:]\n\u001b[0;32m--> 825\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:175\u001b[0m, in \u001b[0;36mLearner._call_one\u001b[0;34m(self, event_name)\u001b[0m\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_call_one\u001b[39m(\u001b[38;5;28mself\u001b[39m, event_name):\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(event, event_name): \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmissing \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m--> 175\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m cb \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcbs\u001b[38;5;241m.\u001b[39msorted(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124morder\u001b[39m\u001b[38;5;124m'\u001b[39m): \u001b[43mcb\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent_name\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/callback/core.py:62\u001b[0m, in \u001b[0;36mCallback.__call__\u001b[0;34m(self, event_name)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: res \u001b[38;5;241m=\u001b[39m getcallable(\u001b[38;5;28mself\u001b[39m, event_name)()\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (CancelBatchException, CancelBackwardException, CancelEpochException, CancelFitException, CancelStepException, CancelTrainException, CancelValidException): \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m---> 62\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e: \u001b[38;5;28;01mraise\u001b[39;00m modify_exception(e, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mException occured in `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m` when calling event `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m`:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;241m.\u001b[39margs[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m, replace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m event_name\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_fit\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrun\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m \u001b[38;5;66;03m#Reset self.run to True at each end of fit\u001b[39;00m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/callback/core.py:60\u001b[0m, in \u001b[0;36mCallback.__call__\u001b[0;34m(self, event_name)\u001b[0m\n\u001b[1;32m 58\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrun \u001b[38;5;129;01mand\u001b[39;00m _run: \n\u001b[0;32m---> 60\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: res \u001b[38;5;241m=\u001b[39m \u001b[43mgetcallable\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevent_name\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (CancelBatchException, CancelBackwardException, CancelEpochException, CancelFitException, CancelStepException, CancelTrainException, CancelValidException): \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e: \u001b[38;5;28;01mraise\u001b[39;00m modify_exception(e, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mException occured in `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m` when calling event `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m`:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;241m.\u001b[39margs[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m, replace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:551\u001b[0m, in \u001b[0;36mRecorder.after_batch\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39myb) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m: \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 550\u001b[0m mets \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_mets \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_valid_mets\n\u001b[0;32m--> 551\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m met \u001b[38;5;129;01min\u001b[39;00m mets: \u001b[43mmet\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maccumulate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlearn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 552\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining: \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 553\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlrs\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mopt\u001b[38;5;241m.\u001b[39mhypers[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m][\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m'\u001b[39m])\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:473\u001b[0m, in \u001b[0;36mAvgMetric.accumulate\u001b[0;34m(self, learn)\u001b[0m\n\u001b[1;32m 471\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21maccumulate\u001b[39m(\u001b[38;5;28mself\u001b[39m, learn):\n\u001b[1;32m 472\u001b[0m bs \u001b[38;5;241m=\u001b[39m find_bs(learn\u001b[38;5;241m.\u001b[39myb)\n\u001b[0;32m--> 473\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtotal \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m learn\u001b[38;5;241m.\u001b[39mto_detach(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlearn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpred\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mlearn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43myb\u001b[49m\u001b[43m)\u001b[49m)\u001b[38;5;241m*\u001b[39mbs\n\u001b[1;32m 474\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcount \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m bs\n", | |
"Cell \u001b[0;32mIn [9], line 2\u001b[0m, in \u001b[0;36mbroken_error_rate\u001b[0;34m(inp, targ, axis)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mbroken_error_rate\u001b[39m(inp, targ, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m):\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;241;43m1\u001b[39;49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\n", | |
"\u001b[0;31mZeroDivisionError\u001b[0m: Exception occured in `Recorder` when calling event `after_batch`:\n\tdivision by zero" | |
] | |
} | |
], | |
"source": [ | |
"with nbdev_allow_exception(): train(convnext_large, accum=4, raise_exception=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"id": "2ad597f5", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"This time it won't be just 7 tensors left alive:\n", | |
"Counter({'cuda': 370, 'cpu': 10})\n", | |
"Reserved mem in use: 1832 MB\n", | |
"|===========================================================================|\n", | |
"| PyTorch CUDA memory summary, device ID 0 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| CUDA OOMs: 0 | cudaMalloc retries: 0 |\n", | |
"|===========================================================================|\n", | |
"| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Allocated memory | 1525 MB | 6745 MB | 510936 MB | 509411 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Active memory | 1525 MB | 6745 MB | 510936 MB | 509411 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| GPU reserved memory | 1832 MB | 7338 MB | 31738 MB | 29906 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Non-releasable memory | 313710 KB | 563634 KB | 215766 MB | 215460 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Allocations | 712 | 1957 | 72524 | 71812 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Active allocs | 712 | 1957 | 72524 | 71812 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| GPU reserved segments | 111 | 349 | 1469 | 1358 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Non-releasable allocs | 124 | 289 | 47634 | 47510 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Oversize allocations | 0 | 0 | 0 | 0 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Oversize GPU segments | 0 | 0 | 0 | 0 |\n", | |
"|===========================================================================|\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"print_tensor_count(\"This time it won't be just 7 tensors left alive:\")\n", | |
"print(torch.cuda.memory_summary(abbreviated=True))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"id": "b01524d2", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# let's try our clean up function\n", | |
"\n", | |
"sys.last_traceback.tb_next = None\n", | |
"gc.collect()\n", | |
"torch.cuda.empty_cache()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"id": "09e866cc", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Are we back to low number?\n", | |
"Counter({'cuda': 9})\n", | |
"Reserved mem in use: 10 MB\n", | |
"|===========================================================================|\n", | |
"| PyTorch CUDA memory summary, device ID 0 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| CUDA OOMs: 0 | cudaMalloc retries: 0 |\n", | |
"|===========================================================================|\n", | |
"| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Allocated memory | 4608 B | 6745 MB | 510936 MB | 510936 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Active memory | 4608 B | 6745 MB | 510936 MB | 510936 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| GPU reserved memory | 10240 KB | 7338 MB | 31738 MB | 31728 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Non-releasable memory | 10235 KB | 563634 KB | 216108 MB | 216098 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Allocations | 7 | 1957 | 72524 | 72517 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Active allocs | 7 | 1957 | 72524 | 72517 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| GPU reserved segments | 5 | 349 | 1469 | 1464 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Non-releasable allocs | 10 | 289 | 47742 | 47732 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Oversize allocations | 0 | 0 | 0 | 0 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Oversize GPU segments | 0 | 0 | 0 | 0 |\n", | |
"|===========================================================================|\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"print_tensor_count(\"Are we back to low number?\")\n", | |
"print(torch.cuda.memory_summary(abbreviated=True))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"id": "b250a522", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# ok so this seems useful, let's see if we can make a callback out of it" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"id": "e183e632", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import sys\n", | |
"class CleanUpUnhandledException(Callback):\n", | |
" \"Clean up traceback stored in sys.last_traceback freeing memory\"\n", | |
" order=MixedPrecision.order\n", | |
" def _mem(self): \n", | |
" return torch.cuda.memory_stats().get('reserved_bytes.all.current',0)//1024**2\n", | |
" def before_fit(self):\n", | |
" if hasattr(sys, 'last_traceback'):\n", | |
" print(\"sys.last_traceback detected recovering.\")\n", | |
" pre=self._mem()\n", | |
" sys.last_traceback.tb_next = None\n", | |
" gc.collect()\n", | |
" if torch.cuda.is_available(): \n", | |
" torch.cuda.empty_cache()\n", | |
" print(\"Recovered:\", pre - self._mem(), \"MB\")\n", | |
" del sys.last_traceback\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"id": "11dac031", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<style>\n", | |
" /* Turns off some styling */\n", | |
" progress {\n", | |
" /* gets rid of default border in Firefox and Opera. */\n", | |
" border: none;\n", | |
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n", | |
" background-size: auto;\n", | |
" }\n", | |
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n", | |
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n", | |
" }\n", | |
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n", | |
" background: #F44336;\n", | |
" }\n", | |
"</style>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
" <div>\n", | |
" <progress value='0' class='' max='1' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", | |
" 0.00% [0/1 00:00<?]\n", | |
" </div>\n", | |
" \n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: left;\">\n", | |
" <th>epoch</th>\n", | |
" <th>train_loss</th>\n", | |
" <th>valid_loss</th>\n", | |
" <th>broken_error_rate</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" </tbody>\n", | |
"</table><p>\n", | |
"\n", | |
" <div>\n", | |
" <progress value='0' class='' max='3' style='width:300px; height:20px; vertical-align: middle;'></progress>\n", | |
" 0.00% [0/3 00:00<?]\n", | |
" </div>\n", | |
" " | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Simulating unhandled exception!\n" | |
] | |
}, | |
{ | |
"ename": "ZeroDivisionError", | |
"evalue": "Exception occured in `Recorder` when calling event `after_batch`:\n\tdivision by zero", | |
"output_type": "error", | |
"traceback": [ | |
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
"\u001b[0;31mZeroDivisionError\u001b[0m Traceback (most recent call last)", | |
"Cell \u001b[0;32mIn [24], line 6\u001b[0m, in \u001b[0;36mnbdev_allow_exception\u001b[0;34m(*args, **kwds)\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;129m@contextmanager\u001b[39m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mnbdev_allow_exception\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwds):\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m----> 6\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 8\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m interactive: \u001b[38;5;28;01mraise\u001b[39;00m\n", | |
"Cell \u001b[0;32mIn [31], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m nbdev_allow_exception(): train(convnext_large, accum\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m, raise_exception\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", | |
"Cell \u001b[0;32mIn [9], line 14\u001b[0m, in \u001b[0;36mtrain\u001b[0;34m(arch, accum, raise_exception, cbs)\u001b[0m\n\u001b[1;32m 12\u001b[0m lr \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.01\u001b[39m\n\u001b[1;32m 13\u001b[0m learn\u001b[38;5;241m.\u001b[39munfreeze()\n\u001b[0;32m---> 14\u001b[0m \u001b[43mlearn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit_one_cycle\u001b[49m\u001b[43m(\u001b[49m\u001b[43mepochs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlr\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/callback/schedule.py:119\u001b[0m, in \u001b[0;36mfit_one_cycle\u001b[0;34m(self, n_epoch, lr_max, div, div_final, pct_start, wd, moms, cbs, reset_opt, start_epoch)\u001b[0m\n\u001b[1;32m 116\u001b[0m lr_max \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39marray([h[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;28;01mfor\u001b[39;00m h \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mopt\u001b[38;5;241m.\u001b[39mhypers])\n\u001b[1;32m 117\u001b[0m scheds \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m'\u001b[39m: combined_cos(pct_start, lr_max\u001b[38;5;241m/\u001b[39mdiv, lr_max, lr_max\u001b[38;5;241m/\u001b[39mdiv_final),\n\u001b[1;32m 118\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmom\u001b[39m\u001b[38;5;124m'\u001b[39m: combined_cos(pct_start, \u001b[38;5;241m*\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmoms \u001b[38;5;28;01mif\u001b[39;00m moms \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m moms))}\n\u001b[0;32m--> 119\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn_epoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcbs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mParamScheduler\u001b[49m\u001b[43m(\u001b[49m\u001b[43mscheds\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43mL\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcbs\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreset_opt\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreset_opt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwd\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwd\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart_epoch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstart_epoch\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:256\u001b[0m, in \u001b[0;36mLearner.fit\u001b[0;34m(self, n_epoch, lr, wd, cbs, reset_opt, start_epoch)\u001b[0m\n\u001b[1;32m 254\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mopt\u001b[38;5;241m.\u001b[39mset_hypers(lr\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlr \u001b[38;5;28;01mif\u001b[39;00m lr \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m lr)\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_epoch \u001b[38;5;241m=\u001b[39m n_epoch\n\u001b[0;32m--> 256\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_with_events\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_do_fit\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mfit\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mCancelFitException\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_end_cleanup\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:193\u001b[0m, in \u001b[0;36mLearner._with_events\u001b[0;34m(self, f, event_type, ex, final)\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_with_events\u001b[39m(\u001b[38;5;28mself\u001b[39m, f, event_type, ex, final\u001b[38;5;241m=\u001b[39mnoop):\n\u001b[0;32m--> 193\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbefore_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ex: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_cancel_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 195\u001b[0m \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); final()\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:245\u001b[0m, in \u001b[0;36mLearner._do_fit\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m epoch \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mrange\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_epoch):\n\u001b[1;32m 244\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mepoch\u001b[38;5;241m=\u001b[39mepoch\n\u001b[0;32m--> 245\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_with_events\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_do_epoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mepoch\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mCancelEpochException\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:193\u001b[0m, in \u001b[0;36mLearner._with_events\u001b[0;34m(self, f, event_type, ex, final)\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_with_events\u001b[39m(\u001b[38;5;28mself\u001b[39m, f, event_type, ex, final\u001b[38;5;241m=\u001b[39mnoop):\n\u001b[0;32m--> 193\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbefore_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ex: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_cancel_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 195\u001b[0m \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); final()\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:240\u001b[0m, in \u001b[0;36mLearner._do_epoch\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 238\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_do_epoch\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 239\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_do_epoch_train()\n\u001b[0;32m--> 240\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_do_epoch_validate\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:236\u001b[0m, in \u001b[0;36mLearner._do_epoch_validate\u001b[0;34m(self, ds_idx, dl)\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m dl \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m: dl \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdls[ds_idx]\n\u001b[1;32m 235\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdl \u001b[38;5;241m=\u001b[39m dl\n\u001b[0;32m--> 236\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mno_grad(): \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_with_events\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mall_batches\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mvalidate\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mCancelValidException\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:193\u001b[0m, in \u001b[0;36mLearner._with_events\u001b[0;34m(self, f, event_type, ex, final)\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_with_events\u001b[39m(\u001b[38;5;28mself\u001b[39m, f, event_type, ex, final\u001b[38;5;241m=\u001b[39mnoop):\n\u001b[0;32m--> 193\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbefore_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); \u001b[43mf\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ex: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_cancel_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 195\u001b[0m \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); final()\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:199\u001b[0m, in \u001b[0;36mLearner.all_batches\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 197\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mall_batches\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 198\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_iter \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdl)\n\u001b[0;32m--> 199\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m o \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdl): \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mone_batch\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mo\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:227\u001b[0m, in \u001b[0;36mLearner.one_batch\u001b[0;34m(self, i, b)\u001b[0m\n\u001b[1;32m 225\u001b[0m b \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_set_device(b)\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_split(b)\n\u001b[0;32m--> 227\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_with_events\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_do_one_batch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mbatch\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mCancelBatchException\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:195\u001b[0m, in \u001b[0;36mLearner._with_events\u001b[0;34m(self, f, event_type, ex, final)\u001b[0m\n\u001b[1;32m 193\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbefore_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m); f()\n\u001b[1;32m 194\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ex: \u001b[38;5;28mself\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_cancel_\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_type\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m--> 195\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mafter_\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mevent_type\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m; final()\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:171\u001b[0m, in \u001b[0;36mLearner.__call__\u001b[0;34m(self, event_name)\u001b[0m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__call__\u001b[39m(\u001b[38;5;28mself\u001b[39m, event_name): \u001b[43mL\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent_name\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_one\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/.local/share/miniconda/lib/python3.9/site-packages/fastcore/foundation.py:156\u001b[0m, in \u001b[0;36mL.map\u001b[0;34m(self, f, gen, *args, **kwargs)\u001b[0m\n\u001b[0;32m--> 156\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mmap\u001b[39m(\u001b[38;5;28mself\u001b[39m, f, \u001b[38;5;241m*\u001b[39margs, gen\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs): \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_new(\u001b[43mmap_ex\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgen\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgen\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m)\n", | |
"File \u001b[0;32m~/.local/share/miniconda/lib/python3.9/site-packages/fastcore/basics.py:840\u001b[0m, in \u001b[0;36mmap_ex\u001b[0;34m(iterable, f, gen, *args, **kwargs)\u001b[0m\n\u001b[1;32m 838\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmap\u001b[39m(g, iterable)\n\u001b[1;32m 839\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m gen: \u001b[38;5;28;01mreturn\u001b[39;00m res\n\u001b[0;32m--> 840\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mlist\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mres\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/.local/share/miniconda/lib/python3.9/site-packages/fastcore/basics.py:825\u001b[0m, in \u001b[0;36mbind.__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 823\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(v,_Arg): kwargs[k] \u001b[38;5;241m=\u001b[39m args\u001b[38;5;241m.\u001b[39mpop(v\u001b[38;5;241m.\u001b[39mi)\n\u001b[1;32m 824\u001b[0m fargs \u001b[38;5;241m=\u001b[39m [args[x\u001b[38;5;241m.\u001b[39mi] \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(x, _Arg) \u001b[38;5;28;01melse\u001b[39;00m x \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpargs] \u001b[38;5;241m+\u001b[39m args[\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmaxi\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m1\u001b[39m:]\n\u001b[0;32m--> 825\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:175\u001b[0m, in \u001b[0;36mLearner._call_one\u001b[0;34m(self, event_name)\u001b[0m\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_call_one\u001b[39m(\u001b[38;5;28mself\u001b[39m, event_name):\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(event, event_name): \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmissing \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m--> 175\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m cb \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcbs\u001b[38;5;241m.\u001b[39msorted(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124morder\u001b[39m\u001b[38;5;124m'\u001b[39m): \u001b[43mcb\u001b[49m\u001b[43m(\u001b[49m\u001b[43mevent_name\u001b[49m\u001b[43m)\u001b[49m\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/callback/core.py:62\u001b[0m, in \u001b[0;36mCallback.__call__\u001b[0;34m(self, event_name)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: res \u001b[38;5;241m=\u001b[39m getcallable(\u001b[38;5;28mself\u001b[39m, event_name)()\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (CancelBatchException, CancelBackwardException, CancelEpochException, CancelFitException, CancelStepException, CancelTrainException, CancelValidException): \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[0;32m---> 62\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e: \u001b[38;5;28;01mraise\u001b[39;00m modify_exception(e, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mException occured in `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m` when calling event `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m`:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;241m.\u001b[39margs[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m, replace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 63\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m event_name\u001b[38;5;241m==\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mafter_fit\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrun\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m \u001b[38;5;66;03m#Reset self.run to True at each end of fit\u001b[39;00m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m res\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/callback/core.py:60\u001b[0m, in \u001b[0;36mCallback.__call__\u001b[0;34m(self, event_name)\u001b[0m\n\u001b[1;32m 58\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrun \u001b[38;5;129;01mand\u001b[39;00m _run: \n\u001b[0;32m---> 60\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: res \u001b[38;5;241m=\u001b[39m \u001b[43mgetcallable\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mevent_name\u001b[49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 61\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (CancelBatchException, CancelBackwardException, CancelEpochException, CancelFitException, CancelStepException, CancelTrainException, CancelValidException): \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e: \u001b[38;5;28;01mraise\u001b[39;00m modify_exception(e, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mException occured in `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m` when calling event `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mevent_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m`:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;241m.\u001b[39margs[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m, replace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:551\u001b[0m, in \u001b[0;36mRecorder.after_batch\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 549\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39myb) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m: \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 550\u001b[0m mets \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_mets \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_valid_mets\n\u001b[0;32m--> 551\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m met \u001b[38;5;129;01min\u001b[39;00m mets: \u001b[43mmet\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43maccumulate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlearn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 552\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtraining: \u001b[38;5;28;01mreturn\u001b[39;00m\n\u001b[1;32m 553\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlrs\u001b[38;5;241m.\u001b[39mappend(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mopt\u001b[38;5;241m.\u001b[39mhypers[\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m][\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mlr\u001b[39m\u001b[38;5;124m'\u001b[39m])\n", | |
"File \u001b[0;32m~/workspace/fastai/fastai/learner.py:473\u001b[0m, in \u001b[0;36mAvgMetric.accumulate\u001b[0;34m(self, learn)\u001b[0m\n\u001b[1;32m 471\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21maccumulate\u001b[39m(\u001b[38;5;28mself\u001b[39m, learn):\n\u001b[1;32m 472\u001b[0m bs \u001b[38;5;241m=\u001b[39m find_bs(learn\u001b[38;5;241m.\u001b[39myb)\n\u001b[0;32m--> 473\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtotal \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m learn\u001b[38;5;241m.\u001b[39mto_detach(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlearn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpred\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mlearn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43myb\u001b[49m\u001b[43m)\u001b[49m)\u001b[38;5;241m*\u001b[39mbs\n\u001b[1;32m 474\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcount \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m bs\n", | |
"Cell \u001b[0;32mIn [9], line 2\u001b[0m, in \u001b[0;36mbroken_error_rate\u001b[0;34m(inp, targ, axis)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mbroken_error_rate\u001b[39m(inp, targ, axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m):\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;241;43m1\u001b[39;49m\u001b[38;5;241;43m/\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\n", | |
"\u001b[0;31mZeroDivisionError\u001b[0m: Exception occured in `Recorder` when calling event `after_batch`:\n\tdivision by zero" | |
] | |
} | |
], | |
"source": [ | |
"with nbdev_allow_exception(): train(convnext_large, accum=4, raise_exception=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"id": "a00f4e19", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"sys.last_traceback detected recovering.\n", | |
"Recovered: 4712 MB\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"\n", | |
"<style>\n", | |
" /* Turns off some styling */\n", | |
" progress {\n", | |
" /* gets rid of default border in Firefox and Opera. */\n", | |
" border: none;\n", | |
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n", | |
" background-size: auto;\n", | |
" }\n", | |
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n", | |
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n", | |
" }\n", | |
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n", | |
" background: #F44336;\n", | |
" }\n", | |
"</style>\n" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: left;\">\n", | |
" <th>epoch</th>\n", | |
" <th>train_loss</th>\n", | |
" <th>valid_loss</th>\n", | |
" <th>error_rate</th>\n", | |
" <th>time</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <td>0</td>\n", | |
" <td>3.974687</td>\n", | |
" <td>3.260266</td>\n", | |
" <td>0.950000</td>\n", | |
" <td>00:02</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>" | |
], | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"train(convnext_large, accum=4, cbs=[CleanUpUnhandledException] )" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"id": "343bf9c2", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Are already on 7?\n", | |
"Counter({'cuda': 9})\n", | |
"Reserved mem in use: 8 MB\n", | |
"|===========================================================================|\n", | |
"| PyTorch CUDA memory summary, device ID 0 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| CUDA OOMs: 0 | cudaMalloc retries: 0 |\n", | |
"|===========================================================================|\n", | |
"| Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Allocated memory | 3584 B | 6745 MB | 711922 MB | 711922 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Active memory | 3584 B | 6745 MB | 711922 MB | 711922 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| GPU reserved memory | 8192 KB | 7338 MB | 42198 MB | 42190 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Non-releasable memory | 8188 KB | 594380 KB | 303215 MB | 303207 MB |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Allocations | 7 | 1957 | 100235 | 100228 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Active allocs | 7 | 1957 | 100235 | 100228 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| GPU reserved segments | 4 | 349 | 1950 | 1946 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Non-releasable allocs | 11 | 289 | 67200 | 67189 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Oversize allocations | 0 | 0 | 0 | 0 |\n", | |
"|---------------------------------------------------------------------------|\n", | |
"| Oversize GPU segments | 0 | 0 | 0 | 0 |\n", | |
"|===========================================================================|\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"print_tensor_count(\"Are already on 7?\")\n", | |
"print(torch.cuda.memory_summary(abbreviated=True))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "864f1ca4", | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3.9.6 64-bit", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.9.6" | |
}, | |
"vscode": { | |
"interpreter": { | |
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" | |
} | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment