pmineiro · February 17, 2022 17:39
diff --git a/mnistactiondependence.ipynb b/mnistactiondependence.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "287318ac",
   "metadata": {},
   "source": [
    "# Supervised"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "7fb42e91",
   "metadata": {
    "code_folding": [
     0,
     6,
     35,
     36,
     58,
     63,
     70,
     84,
     114,
     121
    ],
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n    \tmean    \tsince   \tacc     \tsince   \n",
      "1    \t2.30257 \t2.30257 \t0.15625 \t0.15625 \n",
      "2    \t2.40082 \t2.49907 \t0.21875 \t0.28125 \n",
      "3    \t2.21948 \t1.85680 \t0.29167 \t0.43750 \n",
      "5    \t1.95383 \t1.55536 \t0.40000 \t0.56250 \n",
      "9    \t1.56551 \t1.08011 \t0.50694 \t0.64062 \n",
      "17   \t1.18324 \t0.75318 \t0.63419 \t0.77734 \n",
      "33   \t0.87001 \t0.53720 \t0.73059 \t0.83301 \n",
      "65   \t0.65744 \t0.43823 \t0.79615 \t0.86377 \n",
      "129  \t0.49248 \t0.32494 \t0.84726 \t0.89917 \n",
      "257  \t0.39149 \t0.28972 \t0.87840 \t0.90979 \n",
      "513  \t0.31768 \t0.24357 \t0.90244 \t0.92657 \n",
      "938  \t0.27401 \t0.22131 \t0.91613 \t0.93265 \n",
      "testacc 0.9558735489845276 testloss 0.14295095205307007\n"
     ]
    }
   ],
   "source": [
    "def supervisedLearn():\n",
    "    import itertools\n",
    "    import numpy\n",
    "    import torch\n",
    "    import torchvision\n",
    "    \n",
    "    class EasyAcc:\n",
    "        def __init__(self):\n",
    "            self.n = 0\n",
    "            self.sum = 0\n",
    "            self.sumsq = 0\n",
    "\n",
    "        def __iadd__(self, other):\n",
    "            self.n += 1\n",
    "            self.sum += other\n",
    "            self.sumsq += other*other\n",
    "            return self\n",
    "\n",
    "        def __isub__(self, other):\n",
    "            self.n += 1\n",
    "            self.sum -= other\n",
    "            self.sumsq += other*other\n",
    "            return self\n",
    "\n",
    "        def mean(self):\n",
    "            return self.sum / max(self.n, 1)\n",
    "\n",
    "        def var(self):\n",
    "            from math import sqrt\n",
    "            return sqrt(self.sumsq / max(self.n, 1) - self.mean()**2)\n",
    "\n",
    "        def semean(self):\n",
    "            from math import sqrt\n",
    "            return self.var() / sqrt(max(self.n, 1))\n",
    "\n",
    "    class RFFSoftmax(torch.nn.Module):\n",
    "        def __init__(self, hilo, naction, numrff, sigma, seed):\n",
    "            from math import pi\n",
    "            import numpy as np\n",
    "\n",
    "            super(RFFSoftmax, self).__init__()\n",
    "\n",
    "            torch.manual_seed(seed)\n",
    "            nobs = hilo.shape[1]\n",
    "            high = hilo[1, :]\n",
    "            low = hilo[0, :]\n",
    "                    \n",
    "            self.rff = torch.nn.Linear(nobs, numrff)\n",
    "            self.rff.weight.data = torch.matmul(torch.empty(numrff, nobs).cauchy_(sigma = sigma), \n",
    "                                                torch.diag(torch.tensor([ 1.0/v if v > 1e-6 else 0. for v in high - low ])).float())\n",
    "            self.rff.weight.requires_grad = False\n",
    "            self.rff.bias.data = 2 * pi * torch.rand(numrff)\n",
    "            self.rff.bias.requires_grad = False\n",
    "            self.sqrtrff = np.sqrt(numrff)\n",
    "            self.final = torch.nn.Linear(numrff, naction)\n",
    "            self.final.weight.data *= 0.01\n",
    "            self.final.bias.data *= 0.01\n",
    "\n",
    "        def logits(self, x):\n",
    "            with torch.no_grad():\n",
    "                rff = self.rff(x).cos() / self.sqrtrff\n",
    "            return self.final(rff)\n",
    "\n",
    "    transform = torchvision.transforms.Compose([\n",
    "        torchvision.transforms.ToTensor(),\n",
    "        torchvision.transforms.Normalize((0.1307,), (0.3081,))\n",
    "    ])\n",
    "    mnist_train = torchvision.datasets.MNIST('/tmp/mnist', train=True, download=True, transform=transform)\n",
    "    \n",
    "    quantile_loader = torch.utils.data.DataLoader(mnist_train, batch_size=10000, shuffle=True)\n",
    "    for bno, (images, labels) in enumerate(quantile_loader):\n",
    "        flat = images.reshape(images.shape[0], -1)\n",
    "        hilo = numpy.quantile(flat.numpy(), [ 0.01, 0.99 ], axis=0)\n",
    "        pi = RFFSoftmax(hilo, 10, 2000, 0.01, 45)\n",
    "        break\n",
    "    \n",
    "    train_loader = torch.utils.data.DataLoader(mnist_train, batch_size=64, shuffle=True)\n",
    "    mnist_test = torchvision.datasets.MNIST('/tmp/mnist', train=False, download=True, transform=transform)\n",
    "    test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=1000, shuffle=True)\n",
    "    \n",
    "    opt = torch.optim.Adam(( p for p in pi.parameters() if p.requires_grad ), lr=0.1)\n",
    "    loss = torch.nn.CrossEntropyLoss()\n",
    "    acc, accsincelast, avloss, avlosssincelast = [ EasyAcc() for _ in range(4) ]\n",
    "    \n",
    "    print('{:<5s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}'.format(\n",
    "            'n', 'mean', 'since',\n",
    "            'acc', 'since',\n",
    "            ),\n",
    "           flush=True)\n",
    "    \n",
    "    for bno, (images, labels) in enumerate(itertools.chain(*[ train_loader for _ in range(1) ])):\n",
    "        flat = images.reshape(images.shape[0], -1)\n",
    "        \n",
    "        opt.zero_grad()\n",
    "        ld = pi.logits(flat)\n",
    "        output = loss(ld, labels)\n",
    "        output.backward()\n",
    "        opt.step()\n",
    "        \n",
    "        with torch.no_grad():\n",
    "            pred = ld.argmax(dim=1)\n",
    "            acc += torch.mean((labels == pred).float())\n",
    "            accsincelast += torch.mean((labels == pred).float())\n",
    "            avloss += output\n",
    "            avlosssincelast += output\n",
    "\n",
    "        if (bno & (bno - 1) == 0):\n",
    "            print('{:<5d}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}'.format(\n",
    "                             avloss.n, avloss.mean(), avlosssincelast.mean(), \n",
    "                             acc.mean(), accsincelast.mean(), \n",
    "                        ),\n",
    "                       flush=True)\n",
    "            accsincelast, avlosssincelast = EasyAcc(), EasyAcc()\n",
    "                         \n",
    "    print('{:<5d}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}'.format(\n",
    "                     avloss.n, avloss.mean(), avlosssincelast.mean(), \n",
    "                     acc.mean(), accsincelast.mean(), \n",
    "                ),\n",
    "               flush=True)\n",
    "    accsincelast, avlosssincelast = EasyAcc(), EasyAcc()\n",
    "    testacc, testloss = EasyAcc(), EasyAcc()\n",
    "    # Evaluate on the held-out MNIST test split (test_loader). Iterating\n",
    "    # train_loader here would report training-set numbers under the 'test' labels.\n",
    "    with torch.no_grad():\n",
    "        for ti, tl in test_loader:\n",
    "            flat = ti.reshape(ti.shape[0], -1)\n",
    "            ld = pi.logits(flat)\n",
    "            output = loss(ld, tl)\n",
    "            testloss += output\n",
    "            testpred = ld.argmax(dim=1)\n",
    "            testacc += torch.mean((tl == testpred).float())\n",
    "\n",
    "        print(f'testacc {testacc.mean()} testloss {testloss.mean()}')\n",
    "    \n",
    "supervisedLearn()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "55cda05e",
   "metadata": {},
   "source": [
    "# CB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "de5fa0ff",
   "metadata": {
    "code_folding": [
     0,
     6,
     7,
     10,
     20,
     49,
     50,
     73,
     78
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n    \tloss    \tsince   \tacc     \tsince   \treward  \tsince   \n",
      "1    \t0.69313 \t0.69313 \t0.15625 \t0.15625 \t0.06250 \t0.06250 \n",
      "2    \t0.55508 \t0.41703 \t0.10156 \t0.04688 \t0.05469 \t0.04688 \n",
      "3    \t0.46854 \t0.29545 \t0.09896 \t0.09375 \t0.05729 \t0.06250 \n",
      "5    \t0.46672 \t0.46399 \t0.11563 \t0.14062 \t0.10000 \t0.16406 \n",
      "9    \t0.42025 \t0.36216 \t0.18229 \t0.26562 \t0.16493 \t0.24609 \n",
      "17   \t0.37235 \t0.31846 \t0.22151 \t0.26562 \t0.20956 \t0.25977 \n",
      "33   \t0.34175 \t0.30924 \t0.27415 \t0.33008 \t0.26089 \t0.31543 \n",
      "65   \t0.34546 \t0.34929 \t0.38726 \t0.50391 \t0.36250 \t0.46729 \n",
      "129  \t0.31504 \t0.28415 \t0.61216 \t0.84058 \t0.56468 \t0.77002 \n",
      "257  \t0.26828 \t0.22116 \t0.75298 \t0.89490 \t0.69127 \t0.81885 \n",
      "513  \t0.22252 \t0.17659 \t0.83458 \t0.91650 \t0.76459 \t0.83820 \n",
      "938  \t0.19493 \t0.16162 \t0.87785 \t0.93007 \t0.80360 \t0.85070 \n",
      "testacc 0.9445962309837341\n"
     ]
    }
   ],
   "source": [
    "def cbLearn():\n",
    "    import itertools\n",
    "    import numpy\n",
    "    import torch\n",
    "    import torchvision\n",
    "    \n",
    "    class FastCB:\n",
    "        def __init__(self, gamma):\n",
    "            self.gamma = gamma\n",
    "\n",
    "        def sample(self, fhat):\n",
    "            N, K = fhat.shape\n",
    "            rando = torch.randint(high=K, size=(N, 1), device=fhat.device)\n",
    "            fhatstar, ahatstar = torch.max(fhat, dim=1, keepdim=True)\n",
    "            fhatrando = torch.gather(input=fhat, dim=1, index=rando)\n",
    "            probs = K / (K + self.gamma * (1 - fhatrando / fhatstar))\n",
    "            unif = torch.rand(size=(N, 1), device=fhat.device)\n",
    "            shouldexplore = (unif <= probs).long()\n",
    "            return (ahatstar + shouldexplore * (rando - ahatstar)).squeeze(1)\n",
    "\n",
    "    class EasyAcc:\n",
    "        def __init__(self):\n",
    "            self.n = 0\n",
    "            self.sum = 0\n",
    "            self.sumsq = 0\n",
    "\n",
    "        def __iadd__(self, other):\n",
    "            self.n += 1\n",
    "            self.sum += other\n",
    "            self.sumsq += other*other\n",
    "            return self\n",
    "\n",
    "        def __isub__(self, other):\n",
    "            self.n += 1\n",
    "            self.sum -= other\n",
    "            self.sumsq += other*other\n",
    "            return self\n",
    "\n",
    "        def mean(self):\n",
    "            return self.sum / max(self.n, 1)\n",
    "\n",
    "        def var(self):\n",
    "            from math import sqrt\n",
    "            return sqrt(self.sumsq / max(self.n, 1) - self.mean()**2)\n",
    "\n",
    "        def semean(self):\n",
    "            from math import sqrt\n",
    "            return self.var() / sqrt(max(self.n, 1))\n",
    "\n",
    "    class RFFSoftmax(torch.nn.Module):\n",
    "        def __init__(self, hilo, naction, numrff, sigma, seed):\n",
    "            from math import pi\n",
    "            import numpy as np\n",
    "\n",
    "            super(RFFSoftmax, self).__init__()\n",
    "\n",
    "            torch.manual_seed(seed)\n",
    "            nobs = hilo.shape[1]\n",
    "            high = hilo[1, :]\n",
    "            low = hilo[0, :]\n",
    "                    \n",
    "            self.rff = torch.nn.Linear(nobs, numrff)\n",
    "            self.rff.weight.data = torch.matmul(torch.empty(numrff, nobs).cauchy_(sigma = sigma), \n",
    "                                                torch.diag(torch.tensor([ 1.0/v if v > 1e-6 else 0. for v in high - low ])).float())\n",
    "            self.rff.weight.requires_grad = False\n",
    "            self.rff.bias.data = 2 * pi * torch.rand(numrff)\n",
    "            self.rff.bias.requires_grad = False\n",
    "            self.sqrtrff = np.sqrt(numrff)\n",
    "            self.final = torch.nn.Linear(numrff, naction)\n",
    "            self.final.weight.data *= 0.01\n",
    "            self.final.bias.data *= 0.01\n",
    "            self.sigmoid = torch.nn.Sigmoid()\n",
    "\n",
    "        def forward(self, x):\n",
    "            with torch.no_grad():\n",
    "                rff = self.rff(x).cos() / self.sqrtrff\n",
    "            return self.final(rff)\n",
    "        \n",
    "        def density(self, logits):\n",
    "            return self.sigmoid(logits)\n",
    "\n",
    "    transform = torchvision.transforms.Compose([\n",
    "        torchvision.transforms.ToTensor(),\n",
    "        torchvision.transforms.Normalize((0.1307,), (0.3081,))\n",
    "    ])\n",
    "    mnist_train = torchvision.datasets.MNIST('/tmp/mnist', train=True, download=True, transform=transform)\n",
    "    \n",
    "    quantile_loader = torch.utils.data.DataLoader(mnist_train, batch_size=10000, shuffle=True)\n",
    "    for bno, (images, labels) in enumerate(quantile_loader):\n",
    "        flat = images.reshape(images.shape[0], -1)\n",
    "        hilo = numpy.quantile(flat.numpy(), [ 0.01, 0.99 ], axis=0)\n",
    "        pi = RFFSoftmax(hilo, 10, 2000, 0.01, 45)\n",
    "        break\n",
    "        \n",
    "    train_loader = torch.utils.data.DataLoader(mnist_train, batch_size=64, shuffle=True)\n",
    "    mnist_test = torchvision.datasets.MNIST('/tmp/mnist', train=False, download=True, transform=transform)\n",
    "    test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=1000, shuffle=True)\n",
    "    sampler = FastCB(gamma=100)\n",
    "    \n",
    "    opt = torch.optim.Adam(( p for p in pi.parameters() if p.requires_grad ), lr=1e-1)\n",
    "    log_loss = torch.nn.BCEWithLogitsLoss()\n",
    "    acc, accsincelast, avloss, avlosssincelast, avreward, avrewardsincelast = [ EasyAcc() for _ in range(6) ]\n",
    "    \n",
    "    print('{:<5s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}'.format(\n",
    "                     'n', 'loss', 'since', \n",
    "                     'acc', 'since',\n",
    "                     'reward', 'since',\n",
    "                ),\n",
    "               flush=True)\n",
    "    \n",
    "    for bno, (images, labels) in enumerate(itertools.chain(*[ train_loader for _ in range(1) ])):\n",
    "        flat = images.reshape(images.shape[0], -1)\n",
    "        \n",
    "        opt.zero_grad()\n",
    "        logit = pi(flat)\n",
    "        with torch.no_grad():\n",
    "            fhat = pi.density(logit)\n",
    "            sample = sampler.sample(fhat)\n",
    "            reward = (sample == labels).unsqueeze(1).float()\n",
    "        \n",
    "        samplelogit = torch.gather(input=logit, index=sample.unsqueeze(1), dim=1)\n",
    "        loss = log_loss(samplelogit, reward)\n",
    "        loss.backward()\n",
    "        opt.step()\n",
    "        \n",
    "        with torch.no_grad():\n",
    "            pred = logit.argmax(dim=1)\n",
    "            acc += torch.mean((labels == pred).float())\n",
    "            accsincelast += torch.mean((labels == pred).float())\n",
    "            avloss += loss\n",
    "            avlosssincelast += loss\n",
    "            avreward += torch.mean(reward)\n",
    "            avrewardsincelast += torch.mean(reward)\n",
    "            \n",
    "        if (bno & (bno - 1) == 0):\n",
    "            print('{:<5d}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}'.format(\n",
    "                             avloss.n, avloss.mean(), avlosssincelast.mean(), \n",
    "                             acc.mean(), accsincelast.mean(), \n",
    "                             avreward.mean(), avrewardsincelast.mean(),\n",
    "                        ),\n",
    "                       flush=True)\n",
    "            accsincelast, avlosssincelast, avrewardsincelast = EasyAcc(), EasyAcc(), EasyAcc()\n",
    "                \n",
    "    print('{:<5d}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}'.format(\n",
    "                     avloss.n, avloss.mean(), avlosssincelast.mean(), \n",
    "                     acc.mean(), accsincelast.mean(), \n",
    "                     avreward.mean(), avrewardsincelast.mean(),\n",
    "                ),\n",
    "               flush=True)\n",
    "    accsincelast, avlosssincelast, avrewardsincelast = EasyAcc(), EasyAcc(), EasyAcc()\n",
    "    testacc = EasyAcc()\n",
    "    # Evaluate on the held-out MNIST test split (test_loader). Iterating\n",
    "    # train_loader here would report training-set accuracy as 'testacc'.\n",
    "    with torch.no_grad():\n",
    "        for ti, tl in test_loader:\n",
    "            flat = ti.reshape(ti.shape[0], -1)\n",
    "            logit = pi(flat)\n",
    "            testpred = logit.argmax(dim=1)\n",
    "            testacc += torch.mean((tl == testpred).float())\n",
    "\n",
    "        print(f'testacc {testacc.mean()}')\n",
    "\n",
    "cbLearn()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09186826",
   "metadata": {},
   "source": [
    "# IGL ($y_a \\perp x, a|r_a$)\n",
    "$y_a$ is a (randomly selected) \"zero\" image or a (randomly selected) \"one\" image depending only upon $r_a$."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "id": "67459e03",
   "metadata": {
    "code_folding": [
     0,
     6,
     22,
     51,
     83,
     90,
     100,
     113,
     139,
     188,
     197
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n    \tloss    \tsince   \tacc     \tsince   \treward  \tsince   \tfake    \tsince   \n",
      "1    \t1.38627 \t1.38627 \t0.32812 \t0.32812 \t0.09375 \t0.09375 \t0.49997 \t0.49997 \n",
      "2    \t1.37295 \t1.35963 \t0.18750 \t0.04688 \t0.07812 \t0.06250 \t0.48236 \t0.46476 \n",
      "3    \t1.35572 \t1.32125 \t0.17188 \t0.14062 \t0.08333 \t0.09375 \t0.46612 \t0.43364 \n",
      "5    \t1.33542 \t1.30497 \t0.14687 \t0.10938 \t0.09375 \t0.10938 \t0.43557 \t0.38975 \n",
      "9    \t1.28511 \t1.22223 \t0.14583 \t0.14453 \t0.11111 \t0.13281 \t0.38302 \t0.31733 \n",
      "17   \t1.19374 \t1.09095 \t0.15533 \t0.16602 \t0.11949 \t0.12891 \t0.31242 \t0.23300 \n",
      "33   \t1.19390 \t1.19407 \t0.16714 \t0.17969 \t0.12689 \t0.13477 \t0.26854 \t0.22192 \n",
      "65   \t1.25434 \t1.31667 \t0.25024 \t0.33594 \t0.18438 \t0.24365 \t0.43038 \t0.59728 \n",
      "129  \t1.12546 \t0.99457 \t0.45094 \t0.65479 \t0.33285 \t0.48364 \t0.54036 \t0.65205 \n",
      "257  \t0.88065 \t0.63392 \t0.65394 \t0.85852 \t0.51283 \t0.69421 \t0.65974 \t0.78006 \n",
      "513  \t0.68728 \t0.49315 \t0.77656 \t0.89966 \t0.63411 \t0.75586 \t0.74226 \t0.82510 \n",
      "938  \t0.56352 \t0.41413 \t0.84097 \t0.91871 \t0.70281 \t0.78574 \t0.78930 \t0.84607 \n",
      "testacc 0.928521454334259\n"
     ]
    }
   ],
   "source": [
    "def iglLearn():\n",
    "    import itertools\n",
    "    import numpy\n",
    "    import torch\n",
    "    import torchvision\n",
    "    \n",
    "    class SquareCB(object):\n",
    "        def __init__(self, gamma):\n",
    "            super(SquareCB, self).__init__()\n",
    "\n",
    "            self.gamma = gamma\n",
    "\n",
    "        def sample(self, fhat):\n",
    "            N, K = fhat.shape\n",
    "            rando = torch.randint(high=K, size=(N, 1), device=fhat.device)\n",
    "            fhatstar, ahatstar = torch.max(fhat, dim=1, keepdim=True)\n",
    "            fhatrando = torch.gather(input=fhat, dim=1, index=rando)\n",
    "            probs = K / (K + self.gamma * (fhatstar - fhatrando))\n",
    "            unif = torch.rand(size=(N, 1), device=fhat.device)\n",
    "            shouldexplore = (unif <= probs).long()\n",
    "            return (ahatstar + shouldexplore * (rando - ahatstar)).squeeze(1)\n",
    "    \n",
    "    class EasyAcc:\n",
    "        def __init__(self):\n",
    "            self.n = 0\n",
    "            self.sum = 0\n",
    "            self.sumsq = 0\n",
    "\n",
    "        def __iadd__(self, other):\n",
    "            self.n += 1\n",
    "            self.sum += other\n",
    "            self.sumsq += other*other\n",
    "            return self\n",
    "\n",
    "        def __isub__(self, other):\n",
    "            self.n += 1\n",
    "            self.sum -= other\n",
    "            self.sumsq += other*other\n",
    "            return self\n",
    "\n",
    "        def mean(self):\n",
    "            return self.sum / max(self.n, 1)\n",
    "\n",
    "        def var(self):\n",
    "            from math import sqrt\n",
    "            return sqrt(self.sumsq / max(self.n, 1) - self.mean()**2)\n",
    "\n",
    "        def semean(self):\n",
    "            from math import sqrt\n",
    "            return self.var() / sqrt(max(self.n, 1))\n",
    "\n",
    "    class RFFSoftmax(torch.nn.Module):\n",
    "        def __init__(self, hilo, naction, numrff, sigma, seed):\n",
    "            from math import pi\n",
    "            import numpy as np\n",
    "\n",
    "            super(RFFSoftmax, self).__init__()\n",
    "\n",
    "            torch.manual_seed(seed)\n",
    "            nobs = hilo.shape[1]\n",
    "            high = hilo[1, :]\n",
    "            low = hilo[0, :]\n",
    "                    \n",
    "            self.rff = torch.nn.Linear(nobs, numrff)\n",
    "            self.rff.weight.data = torch.matmul(torch.empty(numrff, nobs).cauchy_(sigma = sigma), \n",
    "                                                torch.diag(torch.tensor([ 1.0/v if v > 1e-6 else 0. for v in high - low ])).float())\n",
    "            self.rff.weight.requires_grad = False\n",
    "            self.rff.bias.data = 2 * pi * torch.rand(numrff)\n",
    "            self.rff.bias.requires_grad = False\n",
    "            self.sqrtrff = np.sqrt(numrff)\n",
    "            self.final = torch.nn.Linear(numrff, naction)\n",
    "            self.final.weight.data *= 0.01\n",
    "            self.final.bias.data *= 0.01\n",
    "            self.sigmoid = torch.nn.Sigmoid()\n",
    "\n",
    "        def forward(self, x):\n",
    "            with torch.no_grad():\n",
    "                rff = self.rff(x).cos() / self.sqrtrff\n",
    "            return self.final(rff)\n",
    "        \n",
    "        def density(self, logits):\n",
    "            return self.sigmoid(logits)\n",
    "\n",
    "    transform = torchvision.transforms.Compose([\n",
    "        torchvision.transforms.ToTensor(),\n",
    "        torchvision.transforms.Normalize((0.1307,), (0.3081,))\n",
    "    ])\n",
    "    mnist_train = torchvision.datasets.MNIST('/tmp/mnist', train=True, download=True, transform=transform)\n",
    "    \n",
    "    quantile_loader = torch.utils.data.DataLoader(mnist_train, batch_size=10000, shuffle=True)\n",
    "    for bno, (images, labels) in enumerate(quantile_loader):\n",
    "        flat = images.reshape(images.shape[0], -1)\n",
    "        hilo = numpy.quantile(flat.numpy(), [ 0.01, 0.99 ], axis=0)\n",
    "        pi = RFFSoftmax(hilo, 10, 2000, 0.01, 45)\n",
    "        decoder = RFFSoftmax(hilo, 1, 2000, 0.01, 2112)\n",
    "        break\n",
    "        \n",
    "    zero_one_loader = torch.utils.data.DataLoader(mnist_train, batch_size=1, shuffle=True)\n",
    "    zeros = []\n",
    "    ones = []\n",
    "    for bno, (images, labels) in enumerate(zero_one_loader):\n",
    "        flat = images.reshape(images.shape[0], -1)\n",
    "        if labels[0] == 0:\n",
    "            zeros.append(flat)\n",
    "        elif labels[0] == 1:\n",
    "            ones.append(flat)\n",
    "            \n",
    "        if len(zeros) > 100 and len(ones) > 100:\n",
    "            break      \n",
    "    zeros = torch.cat(zeros, dim=0)\n",
    "    ones = torch.cat(ones, dim=0)\n",
    "        \n",
    "    # pre-train to get policy \"better than random\"\n",
    "    if True:\n",
    "        preopt = torch.optim.Adam(( p for p in pi.parameters() if p.requires_grad ), lr=1e-3) # 0.1\n",
    "        preloss = torch.nn.CrossEntropyLoss()\n",
    "        pretrain_loader = torch.utils.data.DataLoader(mnist_train, batch_size=64, shuffle=True)\n",
    "        for bno, (images, labels) in enumerate(itertools.chain(*[ pretrain_loader for _ in range(1) ])):\n",
    "            flat = images.reshape(images.shape[0], -1)\n",
    "\n",
    "            preopt.zero_grad()\n",
    "            ld = pi.forward(flat)\n",
    "            output = preloss(ld, labels)\n",
    "            output.backward()\n",
    "            preopt.step()\n",
    "\n",
    "            if bno > 0:\n",
    "                break\n",
    "        \n",
    "    train_loader = torch.utils.data.DataLoader(mnist_train, batch_size=64, shuffle=True)\n",
    "    mnist_test = torchvision.datasets.MNIST('/tmp/mnist', train=False, download=True, transform=transform)\n",
    "    test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=1000, shuffle=True)\n",
    "    \n",
    "    # One optimizer drives the trainable parameters of both the policy and the decoder.\n",
    "    opt = torch.optim.Adam(( p for p in itertools.chain(pi.parameters(), decoder.parameters()) if p.requires_grad ), lr=1e-2)\n",
    "    # Per-example losses are wanted here; they are summed and averaged explicitly\n",
    "    # with torch.mean when the loss is formed. The legacy `reduce` argument is a\n",
    "    # deprecated boolean, so the string 'none' was treated as truthy and resolved\n",
    "    # to mean reduction (with a deprecation warning). `reduction` is the supported name.\n",
    "    log_loss = torch.nn.BCEWithLogitsLoss(reduction='none')\n",
    "    sampler = SquareCB(gamma=100)\n",
    "    acc, accsincelast, avloss, avlosssincelast = [ EasyAcc() for _ in range(4) ]\n",
    "    avreward, avrewardsincelast, avfake, avfakesincelast = [ EasyAcc() for _ in range(4) ]\n",
    "    \n",
    "    print('{:<5s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}'.format(\n",
    "                     'n', 'loss', 'since', \n",
    "                     'acc', 'since',\n",
    "                     'reward', 'since',\n",
    "                     'fake', 'since',\n",
    "                ),\n",
    "               flush=True)\n",
    "    \n",
    "    for bno, (images, labels) in enumerate(itertools.chain(*[ train_loader for _ in range(1) ])):\n",
    "        flat = images.reshape(images.shape[0], -1)\n",
    "        \n",
    "        opt.zero_grad()\n",
    "        logit = pi(flat)\n",
    "        with torch.no_grad():\n",
    "            fhat = pi.density(logit)\n",
    "            sample = sampler.sample(fhat)\n",
    "            reward = (sample == labels).unsqueeze(1).float()\n",
    "            pred = logit.argmax(dim=1)\n",
    "            ispred = (sample == pred).unsqueeze(1).float()\n",
    "            antipred = logit.argmin(dim=1)\n",
    "            isantipred = (sample == antipred).unsqueeze(1).float()\n",
    "            zerossample = torch.randint(low=0, high=zeros.shape[0], size=(fhat.shape[0], 1))\n",
    "            zerofeedback = torch.gather(input=zeros, index=zerossample.expand(-1, zeros.shape[1]), dim=0)\n",
    "            onessample = torch.randint(low=0, high=ones.shape[0], size=(fhat.shape[0], 1))\n",
    "            onefeedback = torch.gather(input=ones, index=onessample.expand(-1, ones.shape[1]), dim=0)\n",
    "            feedback = zerofeedback + reward * (onefeedback - zerofeedback)         \n",
    "        \n",
    "        samplelogit = torch.gather(input=logit, index=sample.unsqueeze(1), dim=1)\n",
    "        fakelogit = decoder(feedback)\n",
    "        fakereward = decoder.density(fakelogit)\n",
    "        predloss = torch.mean(log_loss(fakelogit, ispred) + log_loss(samplelogit, fakereward.detach()))\n",
    "        antipredloss = torch.mean(log_loss(1 - fakelogit, isantipred) + log_loss(1 - samplelogit, fakereward.detach()))\n",
    "        loss = torch.min(predloss, antipredloss)\n",
    "\n",
    "        loss.backward()\n",
    "        opt.step()\n",
    "        \n",
    "        with torch.no_grad():\n",
    "            pred = logit.argmax(dim=1)\n",
    "            acc += torch.mean((labels == pred).float())\n",
    "            accsincelast += torch.mean((labels == pred).float())\n",
    "            avloss += loss\n",
    "            avlosssincelast += loss\n",
    "            avreward += torch.mean(reward)\n",
    "            avrewardsincelast += torch.mean(reward)\n",
    "            avfake += torch.mean(fakereward)\n",
    "            avfakesincelast += torch.mean(fakereward)\n",
    "            \n",
    "        if (bno & (bno - 1) == 0):\n",
    "            print('{:<5d}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}'.format(\n",
    "                             avloss.n, avloss.mean(), avlosssincelast.mean(), \n",
    "                             acc.mean(), accsincelast.mean(), \n",
    "                             avreward.mean(), avrewardsincelast.mean(),\n",
    "                             avfake.mean(), avfakesincelast.mean(),\n",
    "                        ),\n",
    "                       flush=True)\n",
    "            accsincelast, avlosssincelast, avrewardsincelast, avfakesincelast = [ EasyAcc() for _ in range(4) ]\n",
    "                \n",
    "    print('{:<5d}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}'.format(\n",
    "                     avloss.n, avloss.mean(), avlosssincelast.mean(), \n",
    "                     acc.mean(), accsincelast.mean(), \n",
    "                     avreward.mean(), avrewardsincelast.mean(),\n",
    "                     avfake.mean(), avfakesincelast.mean(),\n",
    "                ),\n",
    "               flush=True)\n",
    "    accsincelast, avlosssincelast, avrewardsincelast = EasyAcc(), EasyAcc(), EasyAcc()\n",
    "    testacc = EasyAcc()\n",
    "    # Evaluate on the held-out MNIST test split (test_loader). Iterating\n",
    "    # train_loader here would report training-set accuracy as 'testacc'.\n",
    "    with torch.no_grad():\n",
    "        for ti, tl in test_loader:\n",
    "            flat = ti.reshape(ti.shape[0], -1)\n",
    "            logit = pi(flat)\n",
    "            testpred = logit.argmax(dim=1)\n",
    "            testacc += torch.mean((tl == testpred).float())\n",
    "\n",
    "        print(f'testacc {testacc.mean()}')\n",
    "\n",
    "iglLearn()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "90994504",
   "metadata": {},
   "source": [
    "# IGL ($y_a \\perp x|r_a$)\n",
    "$y_a$ is an image of the action taken if $r_a = 1$, e.g., if $a=3$, a \"three\" image; otherwise if $r_a = 0$, an image of $(9-a)$, e.g., if $a=3$, a \"six\" image."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 288,
   "id": "4d4e6631",
   "metadata": {
    "code_folding": [
     6,
     16,
     26,
     31,
     55,
     84,
     85,
     108,
     113,
     116,
     117,
     140,
     145,
     148,
     155,
     164,
     172,
     199,
     213,
     233,
     271,
     305,
     314
    ],
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n    \tloss    \tsince   \tacc     \tsince   \treward  \tsince   \tfake    \tsince   \n",
      "1    \t1.38891 \t1.38891 \t0.28125 \t0.28125 \t0.23438 \t0.23438 \t0.49940 \t0.49940 \n",
      "2    \t1.38223 \t1.37554 \t0.29688 \t0.31250 \t0.19531 \t0.15625 \t0.49962 \t0.49983 \n",
      "3    \t1.38101 \t1.37858 \t0.31771 \t0.35938 \t0.18750 \t0.17188 \t0.49995 \t0.50062 \n",
      "5    \t1.38767 \t1.39765 \t0.29063 \t0.25000 \t0.14687 \t0.08594 \t0.49908 \t0.49777 \n",
      "9    \t1.38487 \t1.38138 \t0.19965 \t0.08594 \t0.13368 \t0.11719 \t0.50189 \t0.50540 \n",
      "17   \t1.35851 \t1.32885 \t0.18199 \t0.16211 \t0.13787 \t0.14258 \t0.50388 \t0.50613 \n",
      "33   \t1.34694 \t1.33465 \t0.20028 \t0.21973 \t0.14725 \t0.15723 \t0.50259 \t0.50121 \n",
      "65   \t1.33522 \t1.32314 \t0.27428 \t0.35059 \t0.19447 \t0.24316 \t0.50141 \t0.50020 \n",
      "129  \t1.26931 \t1.20236 \t0.35913 \t0.44531 \t0.26211 \t0.33081 \t0.50427 \t0.50716 \n",
      "257  \t1.14736 \t1.02446 \t0.52420 \t0.69055 \t0.40096 \t0.54089 \t0.53432 \t0.56461 \n",
      "513  \t0.99472 \t0.84148 \t0.65068 \t0.77765 \t0.52537 \t0.65027 \t0.57622 \t0.61829 \n",
      "938  \t0.86970 \t0.71879 \t0.73391 \t0.83438 \t0.61042 \t0.71309 \t0.61330 \t0.65806 \n",
      "testacc 0.8389525413513184\n"
     ]
    }
   ],
   "source": [
    "def iglADepLearn():\n",
    "    '''Run one pass of policy learning on MNIST where the learner is driven by image feedback.\n",
    "\n",
    "    A policy (RFFSoftmax) scores the ten digits for each image and an action is drawn with\n",
    "    SquareCB. The true reward (1 when the action equals the label) is used only to choose the\n",
    "    feedback image and for the printed statistics; the training losses never read it. The\n",
    "    feedback is an exemplar image of the chosen digit `a` when the reward is 1 and of digit\n",
    "    `9 - a` when it is 0. A decoder (RFFBilinearSoftmax) is trained jointly with the policy to\n",
    "    map (action, feedback image) pairs to a surrogate reward.\n",
    "\n",
    "    Takes no arguments and returns None. Side effects: reads (and downloads if necessary)\n",
    "    MNIST under /tmp/mnist, and prints a progress table followed by a final `testacc` line.\n",
    "    '''\n",
    "    import itertools\n",
    "    import numpy\n",
    "    import torch\n",
    "    import torchvision\n",
    "    \n",
    "    class WeightedReservoir(object):\n",
    "        '''Fixed-capacity pool of items that is refreshed at random, favouring heavier offers.\n",
    "\n",
    "        Every positive-weight offer is admitted with probability\n",
    "        weight / (sum of all positive weights offered so far, this one included).\n",
    "        An admitted item is appended while the pool is below capacity and otherwise\n",
    "        overwrites a uniformly chosen resident.\n",
    "        '''\n",
    "\n",
    "        def __init__(self, n, seed):\n",
    "            '''Create an empty pool of capacity `n` with a private random.Random(seed) generator.'''\n",
    "            import random\n",
    "\n",
    "            super().__init__()\n",
    "            self.gen = random.Random(seed)\n",
    "            self.items = []\n",
    "            self.wsum = 0\n",
    "            self.n = n\n",
    "\n",
    "        def insert(self, item, weight):\n",
    "            '''Offer `item` with the given `weight`; offers whose weight is not positive are ignored.'''\n",
    "            if not weight > 0:\n",
    "                return\n",
    "\n",
    "            self.wsum += weight\n",
    "            admitted = self.wsum * self.gen.random() < weight\n",
    "            if not admitted:\n",
    "                return\n",
    "\n",
    "            if len(self.items) >= self.n:\n",
    "                # pool is full: evict a uniformly chosen resident\n",
    "                slot = self.gen.randrange(0, self.n)\n",
    "                self.items[slot] = item\n",
    "            else:\n",
    "                self.items.append(item)\n",
    "\n",
    "        def sample(self):\n",
    "            '''Return one stored item chosen uniformly at random; asserts that the pool is non-empty.'''\n",
    "            assert len(self.items) > 0\n",
    "            return self.items[self.gen.randrange(0, len(self.items))]\n",
    "    \n",
    "    class SquareCB(object):\n",
    "        '''Inverse-gap-weighted action sampler.\n",
    "\n",
    "        For a row of scores `fhat` over K actions, every action a gets\n",
    "        p(a) = 1 / (K + gamma * (max(fhat) - fhat[a])) except the greedy (argmax) action,\n",
    "        which receives all of the remaining probability mass.\n",
    "        '''\n",
    "\n",
    "        def __init__(self, gamma):\n",
    "            '''`gamma` scales the score gap; larger values concentrate mass on the greedy action.'''\n",
    "            super().__init__()\n",
    "\n",
    "            self.gamma = gamma\n",
    "\n",
    "        def sample(self, fhat, *, keepdim=False):\n",
    "            '''Draw one action per row of `fhat` (N x K) and report its sampling probability.\n",
    "\n",
    "            Returns (actions, pactions): the chosen action indices and the probability with\n",
    "            which each was chosen. Both have shape (N, 1) when keepdim is true, else (N,).\n",
    "            '''\n",
    "            N, K = fhat.shape\n",
    "            fhatstar, ahatstar = torch.max(fhat, dim=1, keepdim=True)\n",
    "            probs = 1 / (K + self.gamma * (fhatstar - fhat))\n",
    "            psum = torch.sum(probs, dim=1, keepdim=True)\n",
    "            # psum includes the greedy entry (1/K), so the mass of the other actions is\n",
    "            # psum - probs[greedy]; the greedy action gets whatever is left of 1\n",
    "            phatstar = 1 - psum + torch.gather(input=probs, dim=1, index=ahatstar)\n",
    "\n",
    "            # rejection-style draw: propose a uniform action and accept it with probability\n",
    "            # K * p(proposal), which yields p(proposal) overall; otherwise play greedy\n",
    "            rando = torch.randint(high=K, size=(N, 1), device=fhat.device)\n",
    "            prando = torch.gather(input=probs, dim=1, index=rando)\n",
    "            unif = torch.rand(size=(N, 1), device=fhat.device)\n",
    "            shouldexplore = (unif <= K * prando).long()\n",
    "            actions = ahatstar + shouldexplore * (rando - ahatstar)\n",
    "            # report the probability of the action actually played: a proposal that happens\n",
    "            # to be the greedy action is still the greedy action\n",
    "            pactions = torch.where(actions == ahatstar, phatstar, prando)\n",
    "            if not keepdim:\n",
    "                actions = actions.squeeze(1)\n",
    "                pactions = pactions.squeeze(1)\n",
    "            return actions, pactions\n",
    "    \n",
    "    class EasyAcc:\n",
    "        '''Running tally of a stream of numbers: count, signed sum and sum of squares.\n",
    "\n",
    "        `acc += x` records x and `acc -= x` records -x; either way the count grows by one.\n",
    "        '''\n",
    "\n",
    "        def __init__(self):\n",
    "            self.n = 0\n",
    "            self.sum = 0\n",
    "            self.sumsq = 0\n",
    "\n",
    "        def __iadd__(self, other):\n",
    "            self.n += 1\n",
    "            self.sum += other\n",
    "            self.sumsq += other*other\n",
    "            return self\n",
    "\n",
    "        def __isub__(self, other):\n",
    "            self.n += 1\n",
    "            self.sum -= other\n",
    "            self.sumsq += other*other\n",
    "            return self\n",
    "\n",
    "        def mean(self):\n",
    "            '''Return the average of the recorded values (0 when nothing was recorded).'''\n",
    "            return self.sum / max(self.n, 1)\n",
    "\n",
    "        def var(self):\n",
    "            '''Return the square root of the population variance, i.e. the standard deviation.\n",
    "\n",
    "            Despite the name, the square root is applied here; semean() relies on that.\n",
    "            '''\n",
    "            from math import sqrt\n",
    "            # E[x^2] - E[x]^2 can round to a tiny negative number for (near-)constant data,\n",
    "            # which would make sqrt raise ValueError; clamp at zero\n",
    "            spread = self.sumsq / max(self.n, 1) - self.mean()**2\n",
    "            return sqrt(max(spread, 0))\n",
    "\n",
    "        def semean(self):\n",
    "            '''Return the standard error of the mean: var() / sqrt(count).'''\n",
    "            from math import sqrt\n",
    "            return self.var() / sqrt(max(self.n, 1))\n",
    "\n",
    "    class RFFBilinearSoftmax(torch.nn.Module):\n",
    "        '''Bilinear scorer of an (action vector, observation) pair on frozen random cosine features.\n",
    "\n",
    "        hilo    -- 2 x nobs array; row 0 / row 1 hold per-coordinate low / high values whose\n",
    "                   difference rescales each coordinate (ranges not above 1e-6 are zeroed out)\n",
    "        naction -- width of the action vector fed to the bilinear head\n",
    "        numrff  -- number of random features\n",
    "        sigma   -- scale of the Cauchy draws that form the random projection\n",
    "        seed    -- handed to torch.manual_seed, which reseeds torch's global generator\n",
    "        '''\n",
    "\n",
    "        def __init__(self, hilo, naction, numrff, sigma, seed):\n",
    "            import numpy as np\n",
    "            from math import pi\n",
    "\n",
    "            super().__init__()\n",
    "\n",
    "            torch.manual_seed(seed)\n",
    "            low, high = hilo[0, :], hilo[1, :]\n",
    "            nobs = hilo.shape[1]\n",
    "            # reciprocal of each coordinate's range; degenerate coordinates contribute nothing\n",
    "            invrange = torch.tensor([ 1.0/v if v > 1e-6 else 0. for v in high - low ])\n",
    "\n",
    "            # random projection: created as a Linear layer, then overwritten and frozen\n",
    "            self.rff = torch.nn.Linear(nobs, numrff)\n",
    "            directions = torch.empty(numrff, nobs).cauchy_(sigma = sigma)\n",
    "            self.rff.weight.data = torch.matmul(directions, torch.diag(invrange).float())\n",
    "            self.rff.bias.data = 2 * pi * torch.rand(numrff)\n",
    "            for frozen in (self.rff.weight, self.rff.bias):\n",
    "                frozen.requires_grad = False\n",
    "            self.sqrtrff = np.sqrt(numrff)\n",
    "\n",
    "            # trainable head, started at 1% of its default initialisation\n",
    "            self.final = torch.nn.Bilinear(naction, numrff, 1)\n",
    "            for shrunk in (self.final.weight, self.final.bias):\n",
    "                shrunk.data.mul_(0.01)\n",
    "            self.sigmoid = torch.nn.Sigmoid()\n",
    "\n",
    "        def forward(self, a, y):\n",
    "            '''Return one logit per row for action vectors `a` paired with observations `y`.'''\n",
    "            with torch.no_grad():\n",
    "                features = torch.cos(self.rff(y)) / self.sqrtrff\n",
    "            return self.final(a, features)\n",
    "\n",
    "        def density(self, logits):\n",
    "            '''Squash logits through a sigmoid.'''\n",
    "            return self.sigmoid(logits)\n",
    "\n",
    "    class RFFSoftmax(torch.nn.Module):\n",
    "        '''Per-action linear scorer of an observation on frozen random cosine features.\n",
    "\n",
    "        hilo    -- 2 x nobs array; row 0 / row 1 hold per-coordinate low / high values whose\n",
    "                   difference rescales each coordinate (ranges not above 1e-6 are zeroed out)\n",
    "        naction -- number of outputs of the trainable linear head\n",
    "        numrff  -- number of random features\n",
    "        sigma   -- scale of the Cauchy draws that form the random projection\n",
    "        seed    -- handed to torch.manual_seed, which reseeds torch's global generator\n",
    "        '''\n",
    "\n",
    "        def __init__(self, hilo, naction, numrff, sigma, seed):\n",
    "            import numpy as np\n",
    "            from math import pi\n",
    "\n",
    "            super().__init__()\n",
    "\n",
    "            torch.manual_seed(seed)\n",
    "            low, high = hilo[0, :], hilo[1, :]\n",
    "            nobs = hilo.shape[1]\n",
    "            # reciprocal of each coordinate's range; degenerate coordinates contribute nothing\n",
    "            invrange = torch.tensor([ 1.0/v if v > 1e-6 else 0. for v in high - low ])\n",
    "\n",
    "            # random projection: created as a Linear layer, then overwritten and frozen\n",
    "            self.rff = torch.nn.Linear(nobs, numrff)\n",
    "            directions = torch.empty(numrff, nobs).cauchy_(sigma = sigma)\n",
    "            self.rff.weight.data = torch.matmul(directions, torch.diag(invrange).float())\n",
    "            self.rff.bias.data = 2 * pi * torch.rand(numrff)\n",
    "            for frozen in (self.rff.weight, self.rff.bias):\n",
    "                frozen.requires_grad = False\n",
    "            self.sqrtrff = np.sqrt(numrff)\n",
    "\n",
    "            # trainable head, started at 1% of its default initialisation\n",
    "            self.final = torch.nn.Linear(numrff, naction)\n",
    "            for shrunk in (self.final.weight, self.final.bias):\n",
    "                shrunk.data.mul_(0.01)\n",
    "            self.sigmoid = torch.nn.Sigmoid()\n",
    "\n",
    "        def forward(self, x):\n",
    "            '''Return `naction` logits for each row of `x`.'''\n",
    "            with torch.no_grad():\n",
    "                features = torch.cos(self.rff(x)) / self.sqrtrff\n",
    "            return self.final(features)\n",
    "\n",
    "        def preq1(self, logits):\n",
    "            '''Squash logits through a sigmoid.'''\n",
    "            return self.sigmoid(logits)\n",
    "\n",
    "    # images become tensors and are then shifted/scaled by the constants below\n",
    "    # (presumably the MNIST pixel mean and standard deviation -- confirm)\n",
    "    transform = torchvision.transforms.Compose([\n",
    "        torchvision.transforms.ToTensor(),\n",
    "        torchvision.transforms.Normalize((0.1307,), (0.3081,))\n",
    "    ])\n",
    "    mnist_train = torchvision.datasets.MNIST('/tmp/mnist', train=True, download=True, transform=transform)\n",
    "    \n",
    "    # only the first shuffled batch of 10000 images is used (note the break): its per-pixel\n",
    "    # 1% / 99% quantiles give the low / high rows that both models use to rescale their inputs\n",
    "    quantile_loader = torch.utils.data.DataLoader(mnist_train, batch_size=10000, shuffle=True)\n",
    "    for bno, (images, labels) in enumerate(quantile_loader):\n",
    "        flat = images.reshape(images.shape[0], -1)\n",
    "        hilo = numpy.quantile(flat.numpy(), [ 0.01, 0.99 ], axis=0)\n",
    "        # pi: policy over 10 actions; decoder: scores (one-hot action, feedback image) pairs\n",
    "        pi = RFFSoftmax(hilo, 10, 2000, 0.01, 45)\n",
    "        decoder = RFFBilinearSoftmax(hilo, 10, 2000, 0.01, 2112)\n",
    "        break\n",
    "        \n",
    "    # gather exemplar images per digit, one example at a time, stopping once every digit\n",
    "    # has more than 100; the first 100 of each digit are kept\n",
    "    feedback_loader = torch.utils.data.DataLoader(mnist_train, batch_size=1, shuffle=True)\n",
    "    feedbacks = [ [] for _ in range(10) ]\n",
    "    for bno, (images, labels) in enumerate(feedback_loader):\n",
    "        flat = images.reshape(images.shape[0], -1)\n",
    "        feedbacks[labels[0]].append(flat)\n",
    "        if all(len(x) > 100 for x in feedbacks):\n",
    "            break      \n",
    "    # feedbacks becomes a single tensor: digit (10) x exemplar (100) x flattened pixels\n",
    "    feedbacks = torch.cat([ torch.cat(x[:100], dim=0).unsqueeze(0) for x in feedbacks ], dim=0)\n",
    "        \n",
    "    # pre-train to get policy \"better than random\"\n",
    "    # (supervised cross-entropy on the true labels; the loop stops after the batches with\n",
    "    # bno 0 and 1, i.e. after exactly two optimizer steps on 64 images each)\n",
    "    if True:\n",
    "        # only parameters that still require grad are optimised; the random projection is frozen\n",
    "        preopt = torch.optim.Adam(( p for p in pi.parameters() if p.requires_grad ), lr=1e-2) # 0.1\n",
    "        preloss = torch.nn.CrossEntropyLoss()\n",
    "        pretrain_loader = torch.utils.data.DataLoader(mnist_train, batch_size=64, shuffle=True)\n",
    "        for bno, (images, labels) in enumerate(itertools.chain(*[ pretrain_loader for _ in range(1) ])):\n",
    "            flat = images.reshape(images.shape[0], -1)\n",
    "\n",
    "            preopt.zero_grad()\n",
    "            ld = pi.forward(flat)\n",
    "            output = preloss(ld, labels)\n",
    "            output.backward()\n",
    "            preopt.step()\n",
    "\n",
    "            # the check runs after the step, so the batch with bno == 1 is still trained on\n",
    "            if bno > 0:\n",
    "                break\n",
    "        \n",
    "    # data sources for the bandit-style training pass and for held-out evaluation\n",
    "    train_loader = torch.utils.data.DataLoader(mnist_train, batch_size=64, shuffle=True)\n",
    "    mnist_test = torchvision.datasets.MNIST('/tmp/mnist', train=False, download=True, transform=transform)\n",
    "    test_loader = torch.utils.data.DataLoader(mnist_test, batch_size=1000, shuffle=True)\n",
    "    \n",
    "    # a single optimizer drives every trainable parameter of both the policy and the decoder\n",
    "    opt = torch.optim.Adam(( p for p in itertools.chain(pi.parameters(), decoder.parameters()) if p.requires_grad ), lr=1e-2)\n",
    "    # per-element losses. `reduction` is the supported keyword; the legacy `reduce` argument is a\n",
    "    # deprecated boolean, so a non-empty string passed there is merely truthy and selects mean\n",
    "    # reduction (with a deprecation warning) instead of 'none'\n",
    "    log_loss = torch.nn.BCEWithLogitsLoss(reduction='none')\n",
    "    sampler = SquareCB(gamma=100)\n",
    "    # each statistic is tracked twice: over the whole run and since the last printed row\n",
    "    acc, accsincelast, avloss, avlosssincelast = [ EasyAcc() for _ in range(4) ]\n",
    "    avreward, avrewardsincelast, avfake, avfakesincelast = [ EasyAcc() for _ in range(4) ]\n",
    "    # one reservoir of past (feedback, reward) pairs per action, each with its own seed\n",
    "    reservoirs = [ WeightedReservoir(20, 1973+a) for a in range(10) ]\n",
    "    \n",
    "    print('{:<5s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}\\t{:<8s}'.format(\n",
    "                     'n', 'loss', 'since', \n",
    "                     'acc', 'since',\n",
    "                     'reward', 'since',\n",
    "                     'fake', 'since',\n",
    "                ),\n",
    "               flush=True)\n",
    "    \n",
    "    # main pass over the training data: sample an action per image, synthesise its image\n",
    "    # feedback, and take one joint gradient step on the policy and the decoder\n",
    "    for bno, (images, labels) in enumerate(itertools.chain(*[ train_loader for _ in range(1) ])):\n",
    "        flatimage = images.reshape(images.shape[0], -1)\n",
    "        \n",
    "        opt.zero_grad()\n",
    "        # one logit per action for every image in the batch\n",
    "        logit = pi(flatimage)\n",
    "        \n",
    "        # action selection, feedback synthesis and reservoir bookkeeping are not differentiated\n",
    "        with torch.no_grad():\n",
    "            fhat = pi.preq1(logit)\n",
    "            sample, probs = sampler.sample(fhat, keepdim=True)\n",
    "            \n",
    "            # reward is 1 where the sampled action equals the label; ispred / isantipred flag\n",
    "            # whether the sampled action is the policy's argmax / argmin action\n",
    "            reward = (sample == labels.unsqueeze(1)).float()\n",
    "            pred = logit.argmax(dim=1, keepdim=True)\n",
    "            ispred = (sample == pred).float()\n",
    "            antipred = logit.argmin(dim=1, keepdim=True)\n",
    "            isantipred = (sample == antipred).float()\n",
    "            \n",
    "            # this assumes a particular majorization (Torch tensors are row-major)\n",
    "            bigfeedbacks = feedbacks.unsqueeze(0).expand(fhat.shape[0], -1, -1, -1).reshape(fhat.shape[0], -1, flatimage.shape[1]) # Batch x (A x Rep) x Pixels\n",
    "            # NOTE(review): nreps is assigned but never read; the lines below use feedbacks.shape[1] directly\n",
    "            nreps = feedbacks.shape[1]\n",
    "            # index of a random exemplar of digit `sample` within the flattened (A x Rep) axis\n",
    "            goodwhich = feedbacks.shape[1] * sample.squeeze(1) + torch.randint(low=0, high=feedbacks.shape[1], size=(fhat.shape[0],))\n",
    "            goodwhich = goodwhich.unsqueeze(1).unsqueeze(2).expand(-1, -1, flatimage.shape[1])\n",
    "            goodfeedbacks = torch.gather(input=bigfeedbacks, index=goodwhich, dim=1).squeeze(1)\n",
    "            # same, but for a random exemplar of digit `9 - sample`\n",
    "            badwhich = feedbacks.shape[1] * (9-sample).squeeze(1) + torch.randint(low=0, high=feedbacks.shape[1], size=(fhat.shape[0],))\n",
    "            badwhich = badwhich.unsqueeze(1).unsqueeze(2).expand(-1, -1, flatimage.shape[1])\n",
    "            badfeedbacks = torch.gather(input=bigfeedbacks, index=badwhich, dim=1).squeeze(1)\n",
    "            \n",
    "            # disabled debugging aid: shows the first ten good and bad feedback images, then halts\n",
    "            if False:\n",
    "                import matplotlib.pyplot as plt\n",
    "\n",
    "                fig, axs = plt.subplots(1, 10)\n",
    "                for n, (s, f) in enumerate(zip(sample, goodfeedbacks)):\n",
    "                    if n > 9:\n",
    "                        break\n",
    "                    axs[n].imshow(f.reshape(28, 28))\n",
    "                    axs[n].set_title(f'{s.item()}')\n",
    "            \n",
    "                plt.show()\n",
    "                \n",
    "                fig, axs = plt.subplots(1, 10)\n",
    "                for n, (s, f) in enumerate(zip(sample, badfeedbacks)):\n",
    "                    if n > 9:\n",
    "                        break\n",
    "                    axs[n].imshow(f.reshape(28, 28))\n",
    "                    axs[n].set_title(f'{s.item()}')\n",
    "            \n",
    "                plt.show()\n",
    "                assert False\n",
    "                \n",
    "            # reward is 0 or 1, so this picks the good image when rewarded and the bad one otherwise\n",
    "            feedback = badfeedbacks + reward * (goodfeedbacks - badfeedbacks)\n",
    "            onehotsample = torch.nn.functional.one_hot(sample.squeeze(1), num_classes=fhat.shape[1]).float()\n",
    "            \n",
    "            # insert then sample ... means the first time we play an action there will be no update, that's ok\n",
    "            # each (feedback, reward) pair is offered to its action's reservoir with weight 1/p,\n",
    "            # where p is the probability reported by the sampler for that action\n",
    "            for s, p, r, f in zip(sample, probs, reward, feedback):\n",
    "                reservoirs[s.item()].insert((f, r), 1/p)\n",
    "            \n",
    "            # comparison feedback: for every sampled action, a stored pair drawn from that action's reservoir\n",
    "            compfeedback = []\n",
    "            compreward = []\n",
    "            for s in sample:\n",
    "                f, r = reservoirs[s.item()].sample()\n",
    "                compfeedback.append(f.unsqueeze(0))\n",
    "                compreward.append(r.unsqueeze(0))\n",
    "            compfeedback = torch.cat(compfeedback, dim=0)\n",
    "            compreward = torch.cat(compreward, dim=0)\n",
    "            \n",
    "            # disabled debugging aid: shows the first ten comparison images with their rewards, then halts\n",
    "            if False:\n",
    "                import matplotlib.pyplot as plt\n",
    "\n",
    "                fig, axs = plt.subplots(1, 10)\n",
    "                for n, (s, f, r) in enumerate(zip(sample, compfeedback, compreward)):\n",
    "                    if n > 9:\n",
    "                        break\n",
    "                    axs[n].imshow(f.reshape(28, 28))\n",
    "                    axs[n].set_title(f'{s.item()} {r.long().item()}')\n",
    "            \n",
    "                plt.show()\n",
    "                assert False\n",
    "\n",
    "        # policy logit of the action that was played, and decoder logits for the fresh and\n",
    "        # the comparison feedback under that same action\n",
    "        samplelogit = torch.gather(input=logit, index=sample, dim=1)\n",
    "        fakelogit = decoder(onehotsample, feedback)\n",
    "        fakereward = decoder.density(fakelogit)\n",
    "        fakecomplogit = decoder(onehotsample, compfeedback)\n",
    "        # two competing hypotheses, each with a decoder ranking term plus a policy regression term\n",
    "        # towards the (detached) decoder reward; the step is taken on whichever has the lower loss\n",
    "        predloss = torch.mean(log_loss(fakelogit - fakecomplogit, ispred) + log_loss(samplelogit, fakereward.detach()))\n",
    "        # NOTE(review): `1 - samplelogit` is applied to a logit, not a probability; if the intent is the\n",
    "        # complementary probability, the logit would be `-samplelogit` -- confirm with the author\n",
    "        antipredloss = torch.mean(log_loss(fakecomplogit - fakelogit, isantipred) + log_loss(1 - samplelogit, fakereward.detach()))\n",
    "        loss = torch.min(predloss, antipredloss)\n",
    "        loss.backward()\n",
    "        opt.step()\n",
    "        \n",
    "        # bookkeeping only: acc compares the argmax action (not the sampled one) with the label\n",
    "        with torch.no_grad():\n",
    "            acc += torch.mean((labels.unsqueeze(1) == pred).float())\n",
    "            accsincelast += torch.mean((labels.unsqueeze(1) == pred).float())\n",
    "            avloss += loss\n",
    "            avlosssincelast += loss\n",
    "            avreward += torch.mean(reward)\n",
    "            avrewardsincelast += torch.mean(reward)\n",
    "            avfake += torch.mean(fakereward)\n",
    "            avfakesincelast += torch.mean(fakereward)\n",
    "            \n",
    "        # true for bno == 0 and for powers of two, so rows are printed at exponentially growing intervals\n",
    "        if (bno & (bno - 1) == 0):\n",
    "            print('{:<5d}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}'.format(\n",
    "                             avloss.n, avloss.mean(), avlosssincelast.mean(), \n",
    "                             acc.mean(), accsincelast.mean(), \n",
    "                             avreward.mean(), avrewardsincelast.mean(),\n",
    "                             avfake.mean(), avfakesincelast.mean(),\n",
    "                        ),\n",
    "                       flush=True)\n",
    "            # start fresh since-last accumulators for the next printed row\n",
    "            accsincelast, avlosssincelast, avrewardsincelast, avfakesincelast = [ EasyAcc() for _ in range(4) ]\n",
    "                \n",
    "    # final row: statistics over the whole run and over the batches since the last printed row\n",
    "    print('{:<5d}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}\\t{:<8.5f}'.format(\n",
    "                     avloss.n, avloss.mean(), avlosssincelast.mean(), \n",
    "                     acc.mean(), accsincelast.mean(), \n",
    "                     avreward.mean(), avrewardsincelast.mean(),\n",
    "                     avfake.mean(), avfakesincelast.mean(),\n",
    "                ),\n",
    "               flush=True)\n",
    "\n",
    "    # held-out evaluation of the greedy (argmax) policy. This iterates test_loader: the value is\n",
    "    # printed as `testacc`, so it must not be measured on the data the policy was trained on\n",
    "    testacc = EasyAcc()\n",
    "    with torch.no_grad():\n",
    "        for ti, tl in test_loader:\n",
    "            flat = ti.reshape(ti.shape[0], -1)\n",
    "            logit = pi(flat)\n",
    "            testpred = logit.argmax(dim=1)\n",
    "            # mean of per-batch accuracies\n",
    "            testacc += torch.mean((tl == testpred).float())\n",
    "\n",
    "        print(f'testacc {testacc.mean()}')\n",
    "\n",
    "iglADepLearn()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef8c19c9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }