Created
May 22, 2018 12:51
-
-
Save jorisvandenbossche/9aa3ff441cb72ff1a6cb11ccc6a1e3ea to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "%matplotlib inline\n", | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "import matplotlib.pyplot as plt" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## The raw dataset has a large number of missing values." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "DATA = \"/storage/store/data/UKBB/\"\n", | |
| "DATA2 = \"/storage/store/data/ramp/ukbb-missing-data/\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_data = pd.read_csv(DATA + \"ukb9543.csv\", nrows=10000, low_memory=False)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df = df_data.copy()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(10000, 1984)" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df.shape" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>eid</th>\n", | |
| " <th>31-0.0</th>\n", | |
| " <th>34-0.0</th>\n", | |
| " <th>52-0.0</th>\n", | |
| " <th>53-0.0</th>\n", | |
| " <th>53-1.0</th>\n", | |
| " <th>53-2.0</th>\n", | |
| " <th>54-0.0</th>\n", | |
| " <th>54-1.0</th>\n", | |
| " <th>54-2.0</th>\n", | |
| " <th>...</th>\n", | |
| " <th>41216-0.1</th>\n", | |
| " <th>41217-0.0</th>\n", | |
| " <th>41217-0.1</th>\n", | |
| " <th>41217-0.2</th>\n", | |
| " <th>41218-0.0</th>\n", | |
| " <th>41218-0.1</th>\n", | |
| " <th>41218-0.2</th>\n", | |
| " <th>41218-0.3</th>\n", | |
| " <th>41218-0.4</th>\n", | |
| " <th>41252-0.0</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1000013</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1947</td>\n", | |
| " <td>1</td>\n", | |
| " <td>2008-03-14</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>11003</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>14.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1000024</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1940</td>\n", | |
| " <td>3</td>\n", | |
| " <td>2009-11-23</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>11014</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>8.0</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>3.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1000036</td>\n", | |
| " <td>0</td>\n", | |
| " <td>1956</td>\n", | |
| " <td>10</td>\n", | |
| " <td>2009-10-24</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>11018</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>8.0</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>4.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1000048</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1946</td>\n", | |
| " <td>7</td>\n", | |
| " <td>2008-07-15</td>\n", | |
| " <td>2013-01-03</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>11006</td>\n", | |
| " <td>11024.0</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1000055</td>\n", | |
| " <td>1</td>\n", | |
| " <td>1964</td>\n", | |
| " <td>3</td>\n", | |
| " <td>2008-08-18</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>11008</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>...</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>8.0</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>8.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>5 rows × 1984 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " eid 31-0.0 34-0.0 52-0.0 53-0.0 53-1.0 53-2.0 54-0.0 \\\n", | |
| "0 1000013 0 1947 1 2008-03-14 NaN NaN 11003 \n", | |
| "1 1000024 1 1940 3 2009-11-23 NaN NaN 11014 \n", | |
| "2 1000036 0 1956 10 2009-10-24 NaN NaN 11018 \n", | |
| "3 1000048 1 1946 7 2008-07-15 2013-01-03 NaN 11006 \n", | |
| "4 1000055 1 1964 3 2008-08-18 NaN NaN 11008 \n", | |
| "\n", | |
| " 54-1.0 54-2.0 ... 41216-0.1 41217-0.0 41217-0.1 41217-0.2 \\\n", | |
| "0 NaN NaN ... NaN NaN NaN NaN \n", | |
| "1 NaN NaN ... NaN NaN NaN NaN \n", | |
| "2 NaN NaN ... NaN NaN NaN NaN \n", | |
| "3 11024.0 NaN ... NaN NaN NaN NaN \n", | |
| "4 NaN NaN ... NaN NaN NaN NaN \n", | |
| "\n", | |
| " 41218-0.0 41218-0.1 41218-0.2 41218-0.3 41218-0.4 41252-0.0 \n", | |
| "0 NaN NaN NaN NaN NaN 14.0 \n", | |
| "1 8.0 NaN NaN NaN NaN 3.0 \n", | |
| "2 8.0 NaN NaN NaN NaN 4.0 \n", | |
| "3 NaN NaN NaN NaN NaN NaN \n", | |
| "4 8.0 NaN NaN NaN NaN 8.0 \n", | |
| "\n", | |
| "[5 rows x 1984 columns]" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "iVBORw0KGgoAAAANSUhEUgAAAs8AAAHiCAYAAAAXqCHCAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzt3X24ZWV9H/zvT4io4GvQKW8KJmgV\nbYhMgcQmPcREB00K9qoNPE0ENddooom2tA0mtppY+mpiYs2DD0aKxhTk0RipYAwhHmlaSQQlvIiW\nAVEGEFQUGUlI0bt/rDV1ezgvN3Nm9j5zzudzXfs6e93r3mvde//2nvmede61drXWAgAArOxhsx4A\nAADsLYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIzkCSpquuram7W45ilqnpxVd1aVTuq6gdn\nPR7Wjqo6vKpaVe0767EAsyU8wwZQVbdU1Y8vaDu9qv5s53Jr7ajW2vwK21nvAeItSV7TWjugtfbp\nhSvH535tVT1sou3fVNV5C/rtPwbwS/b8kPe8qnpTVb131uPYW1TVXFVtXy/7Ab6b8AysGWsglD8l\nyfUr9Dk4ySkr9PlHSe5P8vyqOmh3DGwla+C1m4mq2mfWYwA2FuEZSPLdR6er6tiqurKqvlFVd1bV\nb47dLh9/fn08svpDVfWwqnpDVX2hqu6qqvdU1WMntvvScd1Xq+pfLdjPm6rq/VX13qr6RpLTx31/\noqq+XlV3VNXbq+rhE9trVfULVXVjVd1bVW+uqu8bH/ONqrpwsv+C57joWKtqv6rakWSfJH9ZVTct\n81L9xyS/tkJYPS3JO5Jck+SfrPC6t6r6paq6uaq+UlX/acGR7ZdX1Q1V9bWq+mhVPWXBY19dVTcm\nuXFsO6qqLq2qu8fa/crEcz+zqm4aa3FhVT1hXLfzLwqnVdUXx3H86rhuS5JfSfLTY83/cmx/2Tiu\ne8exv3LB8/qXY/1ur6qfG7f//eO6/arqLeO+7qyqd1TVI5d7nSa2e15VnV1Vl1TVN5OcUFUvqqpP\nj/W/tareNNH/3VV1xnj/kJ3vn3H5+8fXqRbZzz7jGL9SVTcnedGC9Ys+/6raP8lHkhw8vl47qurg\n5d7XNXjr+J68p6quqapnLfdaLbWfntcQWKXWmpub2zq/JbklyY8vaDs9yZ8t1ifJJ5L87Hj/gCTH\nj/cPT9KS7DvxuJcn2ZbkqWPfP0jye+O6ZybZkeTvJXl4hmkR/3tiP28al0/O8Mv8I5Mck+T4JPuO\n+7shyesm9teSXJTkMUmOynCE97Jx/49N8pkkpy3xOiw51oltf/8yr2NLcmSSq5L83Nj2b5KcN9Hn\nyUm+PT73M5Jcs0JtWpKPJXnC+Nj/NbHtk8fxPmN8Pd6Q5H8ueOyl42MfmeTRSe4Y9/uIcfm4se/r\nklyR5NAk+yX5/5Kcv6Cu7xy38wPj6/qMiTq9d8G4X5Tk+5JUkr+f5L4kzxnXbUnypbE+j0rye5Ov\nbZLfGmv4hHGM/y3Jv+t8L5+X5J4kzx3fM49IMpfk2ePy30lyZ5KTJ2r+38b7/0+Sm5K8b2Ldh5bY\nz6uSfDbJYeM4P5aJ9/4Kz38uyfYF21vyfZ3kBRneU48bt/eMJAet9Fotth83N7c9f5v5ANzc3Pb8\nLUMw3pHk6xO3+7J0eL48ya8lOXDBdnaGrMnwfFmSX5hYfnqGQLxvkn+dMaCN6x6V5G/y3eH58hXG\n/rokH5xYbkmeO7F8VZJfnlj+jSS/tcS2lhzrxLZXCs/fn+SFSb6YIYQuDM9vSHL1eP/gJN9K8oMr\nbHPLxPIvJLlsvP+RJK+YWPewsW5PmXjsj02sPzXJp5fYzw1JnjexfNBEnXbW9dCJ9X+R5JSJOr13\nqecw9vnDJK8d75+biTA8vmY7X7tK8s0k3z
ex/oeSfL7zvXxekves0Oe3krx1vP99Gd7vD8vw14BX\nZgycSd6d5J8tsY0/TfKqieXnZ8F7f5nnP5cVQu3k+zrJj2X4pen4JA+b6LPsa9WzHzc3t91/M20D\nNo6TW2uP23nLENKW8ookT0vy2ar6ZFX95DJ9D07yhYnlL2QIZJvGdbfuXNFauy/JVxc8/tbJhap6\nWlV9uKq+VMNUjn+b5MAFj7lz4v5fLbJ8wC6MtVtr7ZIM4XnrIqtfmuT3x363J/l4hmkcy5l8Db4w\njjMZ5mD/9vin/q8nuTtDoDpkicceluHI6mKekuSDE9u6IUOwn3zuX5q4f1+Wfh1TVSdW1RXjtIev\nZ/iFYmedvqvuC+4/McMvUVdNjOWPxvZeC98zx1XVx6rqy1V1T4ajxgcmSWvtpgy/OB6d5EeSfDjJ\n7VX19AxHjD++xD4WPofJ981Kz/9Blntft9b+NMnbk/xOkjur6pyqekx2z2sF7GbCM/AgrbUbW2un\nJnlSkv+Q5P3jHMu2SPfbMwSznZ6c5IEMgfaODNMEkiTjvNbvXbi7BctnZ/hz+ZGttcdkmG/7oDmp\nu2i5sT5Ub0jyqxnCTZKkqn44w7SO148h6UtJjktyai0/R/qwBWO6fbx/a5JXTv7S01p7ZGvtf070\nn3z9bs1wpHUxtyY5ccG2HtFau63juX5XjapqvyQfyDANZ9P4y9gl+U6dvqvuC57fVzL8gnPUxDge\n21pbMqivNJ4k/zXD1IbDWmuPzXCEefI98/EMJ3E+fHy+H8/wS87jk1y9xD7uyIPrkqTr+S/2OVn2\nfd1ae1tr7ZgMU12eluRfZOXXarH9AHuY8Aw8SFX9TFU9sbX27Qx/8k6Go5RfzjCf96kT3c9P8k+r\n6oiqOiDDEbX3tdYeSPL+JD9VVT88nhz1a1k5CD86yTeS7Kiqv53k53fbE1t+rA9JGy7rd22++6jy\naRnmID8zw5HOo5M8K0PAPnGZzf2Lqnp8VR2W5LVJ3je2vyNDED8qSWo4ufEly2znw0n+VlW9bjzR\n7NFVddzEts6q8YTDqnpiVZ3U+XTvTHJ4fedExodnmLLy5SQPVNWJGaY17HRhkpdV1TOq6lEZpu8k\nScb31DuTvLWqnjSO5ZCqesHOPuNJfXOdY0uG98zdrbW/rqpjM8xtnvTxJK/Jd054nU/yixmmLX1r\niW1emOSXqurQqnp8kjMn1q30/O9M8r01ceJslnlfV9XfHY+ef0+GaRp/neRbHa/VYvsB9jDhGVjM\nliTX13AFit/OMPf1r8dpF2cl+R/jn5GPzzC/9fcyBJPPZ/iP/xeTpLV2/Xj/ggxH8u5NcleGk9GW\n8s8zhJ97MwSH9y3T96Facqy76A0ZTuRKVT0iyT9O8p9ba1+auH1+3OdyUzc+lGHu9tVJLk7yriRp\nrX0ww5H/C8Y/9V+XZUJ4a+3eJD+R5KcyTMG4MckJ4+rfznB09o+r6t4MJw8et9h2FvH/jz+/WlWf\nGvfzSxkC5tcy1OuiiXF8JMnbMpxkty3DCajJd+r+y2P7FePz+pMM889TVYdmmGZxbefYkmEK0q+P\nz+tfj+Oa9PEM4XVneP6zDL/QXJ6lvTPJR5P8ZZJPZTi5dOfzW+n5fzbDL2o3j5+Tg7P8+/oxY9vX\nMkwP+WqGo9rJMq/VEvsB9rBqzV99gOkYj/Z+PcOfrj8/6/GsBVXVMrwe22Y9lj2lqp6RIfjvt9JR\n/qr6mQzTFF4/lcEBPETCM7BHVdVPZbjKRWW4EsZxGS7p5R+frN/wXFUvznAUff8MV7X4dmvt5NmO\nCmD1TNsA9rSTMpwAd3uGk+lOEZw3hFdmmBN8U4b58rtz7jrAzDjyDAAAnRx5BgCATsIzAAB0Wu6i\n/WvCgQce2A4//PCp7/eb3/xm9t9//6nvl+lS541Bndc/Nd4Y1HljmFWdr7rqqq+01lb8Bs81H54P\nP/zwXH
nllVPf7/z8fObm5qa+X6ZLnTcGdV7/1HhjUOeNYVZ1rqov9PQzbQMAADoJzwAA0El4BgCA\nTsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AM\nAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBO+856AAAALO7wMy+e9RCm7rwt+896CMta8chz\nVZ1bVXdV1XUTbe+rqqvH2y1VdfXYfnhV/dXEundMPOaYqrq2qrZV1duqqvbMUwIAgD2j58jzeUne\nnuQ9Oxtaaz+9835V/UaSeyb639RaO3qR7ZydZGuSK5JckmRLko889CEDAMBsrHjkubV2eZK7F1s3\nHj3+x0nOX24bVXVQkse01j7RWmsZgvjJD324AAAwO6s9YfBHktzZWrtxou2Iqvp0VX28qn5kbDsk\nyfaJPtvHNgAA2Gus9oTBU/PdR53vSPLk1tpXq+qYJH9YVUclWWx+c1tqo1W1NcMUj2zatCnz8/Or\nHOZDt2PHjpnsl+lS541Bndc/Nd4YNmKdz3j2A7MewtSt9Trvcniuqn2T/MMkx+xsa63dn+T+8f5V\nVXVTkqdlONJ86MTDD01y+1Lbbq2dk+ScJNm8eXObm5vb1WHusvn5+cxiv0yXOm8M6rz+qfHGsBHr\nfPoGvdrGWq7zaqZt/HiSz7bW/u90jKp6YlXtM95/apIjk9zcWrsjyb1Vdfw4T/qlST60in0DAMDU\n9Vyq7vwkn0jy9KraXlWvGFedkgefKPijSa6pqr9M8v4kr2qt7TzZ8OeT/G6SbUluiittAACwl1lx\n2kZr7dQl2k9fpO0DST6wRP8rkzzrIY4PAADWDF/PDQAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4\nBgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCg\nk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwD\nAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJ\neAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEA\noJPwDAAAnYRnAADotGJ4rqpzq+quqrpuou1NVXVbVV093l44se71VbWtqj5XVS+YaN8ytm2rqjN3\n/1MBAIA9q+fI83lJtizS/tbW2tHj7ZIkqapnJjklyVHjY/7fqtqnqvZJ8jtJTkzyzCSnjn0BAGCv\nse9KHVprl1fV4Z3bOynJBa21+5N8vqq2JTl2XLettXZzklTVBWPfzzzkEQMAwIysGJ6X8ZqqemmS\nK5Oc0Vr7WpJDklwx0Wf72JYkty5oP26pDVfV1iRbk2TTpk2Zn59fxTB3zY4dO2ayX6ZLnTcGdV7/\n1Hhj2Ih1PuPZD8x6CFO31uu8q+H57CRvTtLGn7+R5OVJapG+LYtPD2lLbby1dk6Sc5Jk8+bNbW5u\nbheHuevm5+czi/0yXeq8Majz+qfGG8NGrPPpZ1486yFM3Xlb9l/Tdd6l8Nxau3Pn/ap6Z5IPj4vb\nkxw20fXQJLeP95dqBwCAvcIuXaquqg6aWHxxkp1X4rgoySlVtV9VHZHkyCR/keSTSY6sqiOq6uEZ\nTiq8aNeHDQAA07fikeeqOj/JXJIDq2p7kjcmmauqozNMvbglySuTpLV2fVVdmOFEwAeSvLq19q1x\nO69J8tEk+yQ5t7V2/W5/NgAAsAf1XG3j1EWa37VM/7OSnLVI+yVJLnlIowMAgDXENwwCAEAn4RkA\nADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZ
AAA6Cc8AANBJeAYAgE7C\nMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAA\nnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZ\nAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBO\nwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAECnFcNzVZ1bVXdV1XUTbf+pqj5bVddU1Qer6nFj++FV\n9VdVdfV4e8fEY46pqmuraltVva2qas88JQAA2DN6jjyfl2TLgrZLkzyrtfZ3kvyvJK+fWHdTa+3o\n8faqifazk2xNcuR4W7hNAABY01YMz621y5PcvaDtj1trD4yLVyQ5dLltVNVBSR7TWvtEa60leU+S\nk3dtyAAAMBu7Y87zy5N8ZGL5iKr6dFV9vKp+ZGw7JMn2iT7bxzYAANhr7LuaB1fVryZ5IMnvj013\nJHlya+2rVXVMkj+sqqOSLDa/uS2z3a0Zpnhk06ZNmZ+fX80wd8mOHTtmsl+mS503BnVe/9R4Y9iI\ndT7j2Q+s3GmdWet13uXwXFWnJfnJJM8bp2KktXZ/kvvH+1dV1U1JnpbhSPPk1I5Dk9y+1LZba+ck\nOSdJNm/e3Obm5nZ1mLtsfn4+s9gv06XOG4M6r39qvDFsxDqffubFsx7C1J23Zf81XeddmrZRVVuS\n/HKSf9Bau2+i/YlVtc94/6kZTgy8ubV2R5J7q+r48SobL03yoVWPHgAApmjFI89VdX6SuSQHVtX2\nJG/McHWN/ZJcOl5x7orxyho/muTXq+qBJN9K8qrW2s6TDX8+w5U7HplhjvTkPGkAAFjzVgzPrbVT\nF2l+1xJ9P5DkA0usuzLJsx7S6AAAYA3xDYMAANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgk\nPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA\n0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKe\nAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADo\nJDwDAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8A\nANCpKzxX1blVdVdVXTfR9oSqurSqbhx/Pn5sr6p6W1Vtq6prquo5E485bex/Y1WdtvufDgAA7Dm9\nR57PS7JlQduZSS5rrR2Z5LJxOUlOTHLkeNua5OxkCNtJ3pjkuCTHJnnjzsANAAB7g67w3Fq7PMnd\nC5pPSvLu8f67k5w80f6eNrgiyeOq6qAkL0hyaWvt7tba15JcmgcHcgAAWLNWM+d5U2vtjiQZfz5p\nbD8kya0T/baPbUu1AwDAXmHfPbDNWqStLdP+4A1Ubc0w5SObNm3K/Pz8bhtcrx07dsxkv0yXOm8M\n6rz+qfHGsBHrfMazH5j1EKZurdd5NeH5zqo6qLV2xzgt466xfXuSwyb6HZrk9rF9bkH7/GIbbq2d\nk+ScJNm8eXObm5tbrNseNT8/n1nsl+lS541Bndc/Nd4YNmKdTz/z4lkPYerO27L/mq7zaqZtXJRk\n5xUzTkvyoYn2l45X3Tg+yT3jtI6PJnl+VT1+PFHw+WMbAADsFbqOPFfV+RmOGh9YVdszXDXj3ye5\nsKpekeSLSV4ydr8kyQuTbEtyX5KXJUlr7e6qenOST479fr21tvAkRAAAWLO6wnNr7dQlVj1vkb4t\nyauX2M65Sc7tHh0AAKwhvmEQAAA6
Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMA\nQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4\nBgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCg\nk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwD\nAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBp\nl8NzVT29qq6euH2jql5XVW+qqtsm2l848ZjXV9W2qvpcVb1g9zwFAACYjn139YGttc8lOTpJqmqf\nJLcl+WCSlyV5a2vtLZP9q+qZSU5JclSSg5P8SVU9rbX2rV0dAwAATNPumrbxvCQ3tda+sEyfk5Jc\n0Fq7v7X2+STbkhy7m/YPAAB73O4Kz6ckOX9i+TVVdU1VnVtVjx/bDkly60Sf7WMbAADsFaq1troN\nVD08ye1Jjmqt3VlVm5J8JUlL8uYkB7XWXl5Vv5PkE621946Pe1eSS1prH1hkm1uTbE2STZs2HXPB\nBResaoy7YseOHTnggAOmvl+mS503BnVe/9R4Y9iIdb72tntmPYSpO+Kx+8ykzieccMJVrbXNK/Xb\n5TnPE05M8qnW2p1JsvNnklTVO5N8eFzcnuSwiccdmiF0P0hr7Zwk5yTJ5s2b29zc3G4Y5kMzPz+f\nWeyX6VLnjUGd1z813hg2Yp1PP/PiWQ9h6s7bsv+arvPumLZxaiambFTVQRPrXpzkuvH+RUlOqar9\nquqIJEcm+YvdsH8AAJiKVR15rqpHJfmJJK+caP6PVXV0hmkbt+xc11q7vqouTPKZJA8kebUrbQAA\nsDdZVXhurd2X5HsXtP3sMv3PSnLWavYJAACz4hsGAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnP\nAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIzAAB0\nEp4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcA\nAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJ\nzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAA\ndBKeAQCg06rDc1XdUlXXVtXVVXXl2PaEqrq0qm4cfz5+bK+qeltVbauqa6rqOavdPwAATMvuOvJ8\nQmvt6Nba5nH5zCSXtdaOTHLZuJwkJyY5crxtTXL2bto/AADscXtq2sZJSd493n93kpMn2t/TBlck\neVxVHbSHxgAAALvV7gjPLckfV9VVVbV1bNvUWrsjScafTxrbD0ly68Rjt49tAACw5u27G7bx3Nba\n7VX1pCSXVtVnl+lbi7S1B3UaQvjWJNm0aVPm5+d3wzAfmh07dsxkv0yXOm8M6rz+qfHGsBHrfMaz\nH5j1EKZurdd51eG5tXb7+POuqvpgkmOT3FlVB7XW7hinZdw1dt+e5LCJhx+a5PZFtnlOknOSZPPm\nzW1ubm61w3zI5ufnM4v9Ml3qvDGo8/qnxhvDRqzz6WdePOshTN15W/Zf03Ve1bSNqtq/qh69836S\n5ye5LslFSU4bu52W5EPj/YuSvHS86sbxSe7ZOb0DAADWutUeed6U5INVtXNb/7W19kdV9ckkF1bV\nK5J8MclLxv6XJHlhkm1J7kvyslXuHwAApmZV4bm1dnOSH1ik/atJnrdIe0vy6tXsEwAAZsU3DAIA\n
QCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4\nBgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCg\nk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwD\nAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJ\neAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOu1yeK6qw6rqY1V1Q1VdX1WvHdvf\nVFW3VdXV4+2FE495fVVtq6rPVdULdscTAACAadl3FY99IMkZrbVPVdWjk1xVVZeO697aWnvLZOeq\nemaSU5IcleTgJH9SVU9rrX1rFWMAAICp2eUjz621O1prnxrv35vkhiSHLPOQk5Jc0Fq7v7X2+STb\nkhy7q/sHAIBpq9ba6jdSdXiSy5M8K8k/S3J6km8kuTLD0emvVdXbk1zRWnvv+Jh3JflIa+39i2xv\na5KtSbJp06ZjLrjgglWP8aHasWNHDjjggKnvl+lS541Bndc/Nd4YNmKdr73tnlkPYeqOeOw+M6nz\nCSeccFVrbfNK/VYzbSNJUlUHJPlAkte11r5RVWcneXOSNv78jSQvT1KLPHzR5N5aOyfJOUmyefPm\nNjc3t9phPmTz8/OZxX6ZLnXeGNR5/VPjjWEj1vn0My+e9RCm7rwt+6/pOq/qahtV9T0ZgvPvt9b+\nIElaa3e21r7VWvt2knfmO1Mztic5bOLhhya5fTX7BwCAaVrN1TYqybuS3NBa+82J9oMmur04yXXj\n/YuSnFJV+1XVEUmOTPIXu7p/AACYttVM23hukp9Ncm1VXT22/UqSU6vq6AxTMm5J8sokaa1dX1UX\nJvlMhit1vNqVNgAA2Jvscnhurf1ZFp/HfMkyjzkryVm7uk8AAJgl3zAIAACdhGcAAOgkPAMAQCfh\nGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCA\nTsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AM\nAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADotO+sB7BW\nXXvbPTn9zItnPYypuuXfv2jWQwAAWNMceQYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfh\nGQAAOgnPAADQSXgGAIBOvmEPMrVwAAAFWklEQVQQgL2eb4XdGDZinVl7HHkGAIBOwjMAAHQSngEA\noJM5zwCwFzp8A879PePZsx4BzODIc1VtqarPVdW2qjpz2vsHAIBdNdUjz1W1T5LfSfITSbYn+WRV\nXdRa+8w0xwE7bcQzt52hv/5txBoDTMu0p20cm2Rba+3mJKmqC5KclER4BthN/DkfYM+Zdng+JMmt\nE8vbkxw35TGwBP/hbgzqDAC7rlpr09tZ1UuSvKC19nPj8s8mOba19osL+m1NsnVcfHqSz01tkN9x\nYJKvzGC/TJc6bwzqvP6p8cagzhvDrOr8lNbaE1fqNO0jz9uTHDaxfGiS2xd2aq2dk+ScaQ1qMVV1\nZWtt8yzHwJ6nzhuDOq9/arwxqPPGsNbrPO2rbXwyyZFVdURVPTzJKUkumvIYAABgl0z1yHNr7YGq\nek2SjybZJ8m5rbXrpzkGAADYVVP/kpTW2iVJLpn2fnfBTKeNMDXqvDGo8/qnxhuDOm8Ma7rOUz1h\nEAAA9mZT/4ZBAADYWwnPi/AV4utPVR1WVR+rqhuq6vqqeu3Y/o
SqurSqbhx/Pn7WY2X1qmqfqvp0\nVX14XD6iqv58rPP7xhOW2YtV1eOq6v1V9dnxc/1DPs/rS1X90/Hf6+uq6vyqeoTP8t6vqs6tqruq\n6rqJtkU/uzV425jHrqmq58xu5N8hPC8w8RXiJyZ5ZpJTq+qZsx0Vu8EDSc5orT0jyfFJXj3W9cwk\nl7XWjkxy2bjM3u+1SW6YWP4PSd461vlrSV4xk1GxO/12kj9qrf3tJD+Qod4+z+tEVR2S5JeSbG6t\nPSvDRQZOic/yenBeki0L2pb67J6Y5MjxtjXJ2VMa47KE5wf7v18h3lr7myQ7v0KcvVhr7Y7W2qfG\n+/dm+I/2kAy1fffY7d1JTp7NCNldqurQJC9K8rvjciX5sSTvH7uo816uqh6T5EeTvCtJWmt/01r7\nenye15t9kzyyqvZN8qgkd8Rnea/XWrs8yd0Lmpf67J6U5D1tcEWSx1XVQdMZ6dKE5wdb7CvED5nR\nWNgDqurwJD+Y5M+TbGqt3ZEMATvJk2Y3MnaT30ryL5N8e1z+3iRfb609MC77TO/9nprky0n+yzg9\n53erav/4PK8brbXbkrwlyRczhOZ7klwVn+X1aqnP7prMZMLzg9UibS5Jsk5U1QFJPpDkda21b8x6\nPOxeVfWTSe5qrV012bxIV5/pvdu+SZ6T5OzW2g8m+WZM0VhXxjmvJyU5IsnBSfbP8Cf8hXyW17c1\n+e+38PxgXV8hzt6nqr4nQ3D+/dbaH4zNd+78E9D4865ZjY/d4rlJ/kFV3ZJhytWPZTgS/bjxT7+J\nz/R6sD3J9tban4/L788Qpn2e148fT/L51tqXW2v/O8kfJPnh+CyvV0t9dtdkJhOeH8xXiK9D47zX\ndyW5obX2mxOrLkpy2nj/tCQfmvbY2H1aa69vrR3aWjs8w2f3T1tr/yTJx5L8o7GbOu/lWmtfSnJr\nVT19bHpeks/E53k9+WKS46vqUeO/3ztr7LO8Pi312b0oyUvHq24cn+SendM7ZsmXpCyiql6Y4WjV\nzq8QP2vGQ2KVqurvJfnvSa7Nd+bC/kqGec8XJnlyhn+sX9JaW3giA3uhqppL8s9baz9ZVU/NcCT6\nCUk+neRnWmv3z3J8rE5VHZ3hpNCHJ7k5ycsyHBDyeV4nqurXkvx0hqslfTrJz2WY7+qzvBerqvOT\nzCU5MMmdSd6Y5A+zyGd3/MXp7RmuznFfkpe11q6cxbgnCc8AANDJtA0AAOgkPAMAQCfhGQAAOgnP\nAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnf4PH60lIu0cWjgAAAAASUVORK5CYII=\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x7fb94661d908>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "plt.figure(figsize=(12,8))\n", | |
| "plt.title('Histogram of NA percentage, raw dataset')\n", | |
| "(df.isna().sum() / len(df) * 100).hist();" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 1st pattern: not all patients have completed all three visits." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>Column</th>\n", | |
| " <th>UDI</th>\n", | |
| " <th>Count</th>\n", | |
| " <th>Type</th>\n", | |
| " <th>Description</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>0</td>\n", | |
| " <td>eid</td>\n", | |
| " <td>502628</td>\n", | |
| " <td>Sequence</td>\n", | |
| " <td>Encoded anonymised participant ID</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>31-0.0</td>\n", | |
| " <td>502628</td>\n", | |
| " <td>Categorical (single)</td>\n", | |
| " <td>SexUses data-coding 9</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2</td>\n", | |
| " <td>34-0.0</td>\n", | |
| " <td>502628</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Year of birth</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>3</td>\n", | |
| " <td>52-0.0</td>\n", | |
| " <td>502628</td>\n", | |
| " <td>Categorical (single)</td>\n", | |
| " <td>Month of birthUses data-coding 8</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>4</td>\n", | |
| " <td>53-0.0</td>\n", | |
| " <td>502628</td>\n", | |
| " <td>Date</td>\n", | |
| " <td>Date of attending assessment centre</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>5</td>\n", | |
| " <td>53-1.0</td>\n", | |
| " <td>20348</td>\n", | |
| " <td>Date</td>\n", | |
| " <td>Date of attending assessment centre</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>6</td>\n", | |
| " <td>53-2.0</td>\n", | |
| " <td>13268</td>\n", | |
| " <td>Date</td>\n", | |
| " <td>Date of attending assessment centre</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>7</td>\n", | |
| " <td>54-0.0</td>\n", | |
| " <td>502628</td>\n", | |
| " <td>Categorical (single)</td>\n", | |
| " <td>UK Biobank assessment centreUses data-coding 10</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>8</td>\n", | |
| " <td>54-1.0</td>\n", | |
| " <td>20346</td>\n", | |
| " <td>Categorical (single)</td>\n", | |
| " <td>UK Biobank assessment centreUses data-coding 10</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>9</td>\n", | |
| " <td>54-2.0</td>\n", | |
| " <td>13232</td>\n", | |
| " <td>Categorical (single)</td>\n", | |
| " <td>UK Biobank assessment centreUses data-coding 10</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>10</th>\n", | |
| " <td>10</td>\n", | |
| " <td>189-0.0</td>\n", | |
| " <td>502001</td>\n", | |
| " <td>Continuous</td>\n", | |
| " <td>Townsend deprivation index at recruitment</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>11</th>\n", | |
| " <td>11</td>\n", | |
| " <td>398-0.1</td>\n", | |
| " <td>497998</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of correct matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>12</th>\n", | |
| " <td>12</td>\n", | |
| " <td>398-0.2</td>\n", | |
| " <td>497998</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of correct matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>13</th>\n", | |
| " <td>13</td>\n", | |
| " <td>398-0.3</td>\n", | |
| " <td>0</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of correct matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>14</th>\n", | |
| " <td>14</td>\n", | |
| " <td>398-1.1</td>\n", | |
| " <td>20339</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of correct matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>15</th>\n", | |
| " <td>15</td>\n", | |
| " <td>398-1.2</td>\n", | |
| " <td>20339</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of correct matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>16</th>\n", | |
| " <td>16</td>\n", | |
| " <td>398-1.3</td>\n", | |
| " <td>0</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of correct matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>17</th>\n", | |
| " <td>17</td>\n", | |
| " <td>398-2.1</td>\n", | |
| " <td>11670</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of correct matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>18</th>\n", | |
| " <td>18</td>\n", | |
| " <td>398-2.2</td>\n", | |
| " <td>11670</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of correct matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>19</th>\n", | |
| " <td>19</td>\n", | |
| " <td>398-2.3</td>\n", | |
| " <td>10512</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of correct matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>20</th>\n", | |
| " <td>20</td>\n", | |
| " <td>399-0.1</td>\n", | |
| " <td>497998</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of incorrect matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>21</th>\n", | |
| " <td>21</td>\n", | |
| " <td>399-0.2</td>\n", | |
| " <td>497998</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of incorrect matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>22</th>\n", | |
| " <td>22</td>\n", | |
| " <td>399-0.3</td>\n", | |
| " <td>0</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of incorrect matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>23</th>\n", | |
| " <td>23</td>\n", | |
| " <td>399-1.1</td>\n", | |
| " <td>20339</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of incorrect matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>24</th>\n", | |
| " <td>24</td>\n", | |
| " <td>399-1.2</td>\n", | |
| " <td>20339</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of incorrect matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>25</th>\n", | |
| " <td>25</td>\n", | |
| " <td>399-1.3</td>\n", | |
| " <td>0</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of incorrect matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>26</th>\n", | |
| " <td>26</td>\n", | |
| " <td>399-2.1</td>\n", | |
| " <td>11670</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of incorrect matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>27</th>\n", | |
| " <td>27</td>\n", | |
| " <td>399-2.2</td>\n", | |
| " <td>11670</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of incorrect matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>28</th>\n", | |
| " <td>28</td>\n", | |
| " <td>399-2.3</td>\n", | |
| " <td>10512</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Number of incorrect matches in round</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>29</th>\n", | |
| " <td>29</td>\n", | |
| " <td>400-0.1</td>\n", | |
| " <td>497998</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Time to complete roundUses data-coding 402</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1954</th>\n", | |
| " <td>1954</td>\n", | |
| " <td>41204-0.421</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1955</th>\n", | |
| " <td>1955</td>\n", | |
| " <td>41204-0.422</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1956</th>\n", | |
| " <td>1956</td>\n", | |
| " <td>41204-0.423</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1957</th>\n", | |
| " <td>1957</td>\n", | |
| " <td>41204-0.424</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1958</th>\n", | |
| " <td>1958</td>\n", | |
| " <td>41204-0.425</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1959</th>\n", | |
| " <td>1959</td>\n", | |
| " <td>41204-0.426</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1960</th>\n", | |
| " <td>1960</td>\n", | |
| " <td>41204-0.427</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1961</th>\n", | |
| " <td>1961</td>\n", | |
| " <td>41204-0.428</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1962</th>\n", | |
| " <td>1962</td>\n", | |
| " <td>41204-0.429</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1963</th>\n", | |
| " <td>1963</td>\n", | |
| " <td>41204-0.430</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1964</th>\n", | |
| " <td>1964</td>\n", | |
| " <td>41204-0.431</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1965</th>\n", | |
| " <td>1965</td>\n", | |
| " <td>41204-0.432</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1966</th>\n", | |
| " <td>1966</td>\n", | |
| " <td>41204-0.433</td>\n", | |
| " <td>1</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1967</th>\n", | |
| " <td>1967</td>\n", | |
| " <td>41204-0.434</td>\n", | |
| " <td>1</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Diagnoses - secondary ICD10Uses data-coding 19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1968</th>\n", | |
| " <td>1968</td>\n", | |
| " <td>41214-0.0</td>\n", | |
| " <td>348719</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Carer support indicatorsUses data-coding 227</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1969</th>\n", | |
| " <td>1969</td>\n", | |
| " <td>41214-0.1</td>\n", | |
| " <td>71128</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Carer support indicatorsUses data-coding 227</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1970</th>\n", | |
| " <td>1970</td>\n", | |
| " <td>41214-0.2</td>\n", | |
| " <td>2972</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Carer support indicatorsUses data-coding 227</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1971</th>\n", | |
| " <td>1971</td>\n", | |
| " <td>41215-0.0</td>\n", | |
| " <td>42696</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Detention categoriesUses data-coding 230</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1972</th>\n", | |
| " <td>1972</td>\n", | |
| " <td>41215-0.1</td>\n", | |
| " <td>499</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Detention categoriesUses data-coding 230</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1973</th>\n", | |
| " <td>1973</td>\n", | |
| " <td>41216-0.0</td>\n", | |
| " <td>38</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Legal statusesUses data-coding 231</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1974</th>\n", | |
| " <td>1974</td>\n", | |
| " <td>41216-0.1</td>\n", | |
| " <td>4</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Legal statusesUses data-coding 231</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1975</th>\n", | |
| " <td>1975</td>\n", | |
| " <td>41217-0.0</td>\n", | |
| " <td>59</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Mental categoriesUses data-coding 228</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1976</th>\n", | |
| " <td>1976</td>\n", | |
| " <td>41217-0.1</td>\n", | |
| " <td>5</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Mental categoriesUses data-coding 228</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1977</th>\n", | |
| " <td>1977</td>\n", | |
| " <td>41217-0.2</td>\n", | |
| " <td>1</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>Mental categoriesUses data-coding 228</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1978</th>\n", | |
| " <td>1978</td>\n", | |
| " <td>41218-0.0</td>\n", | |
| " <td>348717</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>History of psychiatric care on admissionUses d...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1979</th>\n", | |
| " <td>1979</td>\n", | |
| " <td>41218-0.1</td>\n", | |
| " <td>46286</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>History of psychiatric care on admissionUses d...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1980</th>\n", | |
| " <td>1980</td>\n", | |
| " <td>41218-0.2</td>\n", | |
| " <td>1799</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>History of psychiatric care on admissionUses d...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1981</th>\n", | |
| " <td>1981</td>\n", | |
| " <td>41218-0.3</td>\n", | |
| " <td>290</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>History of psychiatric care on admissionUses d...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1982</th>\n", | |
| " <td>1982</td>\n", | |
| " <td>41218-0.4</td>\n", | |
| " <td>14</td>\n", | |
| " <td>Categorical (multiple)</td>\n", | |
| " <td>History of psychiatric care on admissionUses d...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1983</th>\n", | |
| " <td>1983</td>\n", | |
| " <td>41252-0.0</td>\n", | |
| " <td>395945</td>\n", | |
| " <td>Integer</td>\n", | |
| " <td>Episodes containing \"Source of inpatient recor...</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>1984 rows × 5 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " Column UDI Count Type \\\n", | |
| "0 0 eid 502628 Sequence \n", | |
| "1 1 31-0.0 502628 Categorical (single) \n", | |
| "2 2 34-0.0 502628 Integer \n", | |
| "3 3 52-0.0 502628 Categorical (single) \n", | |
| "4 4 53-0.0 502628 Date \n", | |
| "5 5 53-1.0 20348 Date \n", | |
| "6 6 53-2.0 13268 Date \n", | |
| "7 7 54-0.0 502628 Categorical (single) \n", | |
| "8 8 54-1.0 20346 Categorical (single) \n", | |
| "9 9 54-2.0 13232 Categorical (single) \n", | |
| "10 10 189-0.0 502001 Continuous \n", | |
| "11 11 398-0.1 497998 Integer \n", | |
| "12 12 398-0.2 497998 Integer \n", | |
| "13 13 398-0.3 0 Integer \n", | |
| "14 14 398-1.1 20339 Integer \n", | |
| "15 15 398-1.2 20339 Integer \n", | |
| "16 16 398-1.3 0 Integer \n", | |
| "17 17 398-2.1 11670 Integer \n", | |
| "18 18 398-2.2 11670 Integer \n", | |
| "19 19 398-2.3 10512 Integer \n", | |
| "20 20 399-0.1 497998 Integer \n", | |
| "21 21 399-0.2 497998 Integer \n", | |
| "22 22 399-0.3 0 Integer \n", | |
| "23 23 399-1.1 20339 Integer \n", | |
| "24 24 399-1.2 20339 Integer \n", | |
| "25 25 399-1.3 0 Integer \n", | |
| "26 26 399-2.1 11670 Integer \n", | |
| "27 27 399-2.2 11670 Integer \n", | |
| "28 28 399-2.3 10512 Integer \n", | |
| "29 29 400-0.1 497998 Integer \n", | |
| "... ... ... ... ... \n", | |
| "1954 1954 41204-0.421 2 Categorical (multiple) \n", | |
| "1955 1955 41204-0.422 2 Categorical (multiple) \n", | |
| "1956 1956 41204-0.423 2 Categorical (multiple) \n", | |
| "1957 1957 41204-0.424 2 Categorical (multiple) \n", | |
| "1958 1958 41204-0.425 2 Categorical (multiple) \n", | |
| "1959 1959 41204-0.426 2 Categorical (multiple) \n", | |
| "1960 1960 41204-0.427 2 Categorical (multiple) \n", | |
| "1961 1961 41204-0.428 2 Categorical (multiple) \n", | |
| "1962 1962 41204-0.429 2 Categorical (multiple) \n", | |
| "1963 1963 41204-0.430 2 Categorical (multiple) \n", | |
| "1964 1964 41204-0.431 2 Categorical (multiple) \n", | |
| "1965 1965 41204-0.432 2 Categorical (multiple) \n", | |
| "1966 1966 41204-0.433 1 Categorical (multiple) \n", | |
| "1967 1967 41204-0.434 1 Categorical (multiple) \n", | |
| "1968 1968 41214-0.0 348719 Categorical (multiple) \n", | |
| "1969 1969 41214-0.1 71128 Categorical (multiple) \n", | |
| "1970 1970 41214-0.2 2972 Categorical (multiple) \n", | |
| "1971 1971 41215-0.0 42696 Categorical (multiple) \n", | |
| "1972 1972 41215-0.1 499 Categorical (multiple) \n", | |
| "1973 1973 41216-0.0 38 Categorical (multiple) \n", | |
| "1974 1974 41216-0.1 4 Categorical (multiple) \n", | |
| "1975 1975 41217-0.0 59 Categorical (multiple) \n", | |
| "1976 1976 41217-0.1 5 Categorical (multiple) \n", | |
| "1977 1977 41217-0.2 1 Categorical (multiple) \n", | |
| "1978 1978 41218-0.0 348717 Categorical (multiple) \n", | |
| "1979 1979 41218-0.1 46286 Categorical (multiple) \n", | |
| "1980 1980 41218-0.2 1799 Categorical (multiple) \n", | |
| "1981 1981 41218-0.3 290 Categorical (multiple) \n", | |
| "1982 1982 41218-0.4 14 Categorical (multiple) \n", | |
| "1983 1983 41252-0.0 395945 Integer \n", | |
| "\n", | |
| " Description \n", | |
| "0 Encoded anonymised participant ID \n", | |
| "1 SexUses data-coding 9 \n", | |
| "2 Year of birth \n", | |
| "3 Month of birthUses data-coding 8 \n", | |
| "4 Date of attending assessment centre \n", | |
| "5 Date of attending assessment centre \n", | |
| "6 Date of attending assessment centre \n", | |
| "7 UK Biobank assessment centreUses data-coding 10 \n", | |
| "8 UK Biobank assessment centreUses data-coding 10 \n", | |
| "9 UK Biobank assessment centreUses data-coding 10 \n", | |
| "10 Townsend deprivation index at recruitment \n", | |
| "11 Number of correct matches in round \n", | |
| "12 Number of correct matches in round \n", | |
| "13 Number of correct matches in round \n", | |
| "14 Number of correct matches in round \n", | |
| "15 Number of correct matches in round \n", | |
| "16 Number of correct matches in round \n", | |
| "17 Number of correct matches in round \n", | |
| "18 Number of correct matches in round \n", | |
| "19 Number of correct matches in round \n", | |
| "20 Number of incorrect matches in round \n", | |
| "21 Number of incorrect matches in round \n", | |
| "22 Number of incorrect matches in round \n", | |
| "23 Number of incorrect matches in round \n", | |
| "24 Number of incorrect matches in round \n", | |
| "25 Number of incorrect matches in round \n", | |
| "26 Number of incorrect matches in round \n", | |
| "27 Number of incorrect matches in round \n", | |
| "28 Number of incorrect matches in round \n", | |
| "29 Time to complete roundUses data-coding 402 \n", | |
| "... ... \n", | |
| "1954 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1955 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1956 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1957 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1958 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1959 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1960 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1961 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1962 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1963 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1964 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1965 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1966 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1967 Diagnoses - secondary ICD10Uses data-coding 19 \n", | |
| "1968 Carer support indicatorsUses data-coding 227 \n", | |
| "1969 Carer support indicatorsUses data-coding 227 \n", | |
| "1970 Carer support indicatorsUses data-coding 227 \n", | |
| "1971 Detention categoriesUses data-coding 230 \n", | |
| "1972 Detention categoriesUses data-coding 230 \n", | |
| "1973 Legal statusesUses data-coding 231 \n", | |
| "1974 Legal statusesUses data-coding 231 \n", | |
| "1975 Mental categoriesUses data-coding 228 \n", | |
| "1976 Mental categoriesUses data-coding 228 \n", | |
| "1977 Mental categoriesUses data-coding 228 \n", | |
| "1978 History of psychiatric care on admissionUses d... \n", | |
| "1979 History of psychiatric care on admissionUses d... \n", | |
| "1980 History of psychiatric care on admissionUses d... \n", | |
| "1981 History of psychiatric care on admissionUses d... \n", | |
| "1982 History of psychiatric care on admissionUses d... \n", | |
| "1983 Episodes containing \"Source of inpatient recor... \n", | |
| "\n", | |
| "[1984 rows x 5 columns]" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "table = pd.read_html(DATA + \"ukb9543.html\", match='UDI', header=0)[0]\n", | |
| "table['Type'] = table['Type'].ffill()\n", | |
| "table['Description'] = table['Description'].ffill()\n", | |
| "table" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "iVBORw0KGgoAAAANSUhEUgAAAs8AAAHiCAYAAAAXqCHCAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzt3XuUZVV9J/DvL3TwAVFAtEcBbYzE\n94rBHiUmk2nFUVATyFowwTGxMTi4JsZ3oq0xMU40ozNGg2NGh4gBEwc0aASFRBm0YpwEVkCjiJjQ\nvqB5qjykRaPEPX+c0+FSVHdtqpp7u6s+n7Xuqnv23fecfe+vDnx71z73VmstAADA4n5k1gMAAIDd\nhfAMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGulXVpVW1YdbjmKWq+sWqurKqtlbVT816PNxZ\nVd2rqj5SVTdX1Z9X1XOq6uMzGsui50xVPXj8fdpjSsMClkF4BpIkVfW1qnrqvLbjq+rT27Zba49u\nrc0tsp91VdWqas3dNNRZe0uSX2+t7d1a++z8B8fXfklV/chE2xuq6tR5/fYaA9O5d/+Q735V9btV\n9WezHsfomCRrk9yvtXZsa+19rbWnLWVHVXVqVb1hqQPpOWdaa1eMv0//Mh5zrqqev9RjAncv4RnY\nrewCofwhSS5dpM+Dkhy3SJ9jkvxzkqdV1QN3xsAWswu8d9PykCT/1Fq7bbGOq+g9AXYS4RnoNjk7\nXVVPqKqLqurbVXVdVb117Pap8edN48zqT1fVj1TVa6vq61V1fVW9t6ruO7Hf546PfauqfnvecX63\nqs6sqj+rqm8nOX489t9V1U1VdU1VvaOq9pzYX6uqX6uqy6vqlqr6var68fE5366qD0z2n/caFxxr\nVd2jqrYm2SPJ56rqyzt4q/57ktcvEsw2JnlXks8nec4i73urqhdX1Veq6ptV9T/mzWz/alVdVlU3\nVtXHquoh8577wqq6PMnlY9ujq+q8qrphrN1rJl77pqr68liLD1TVfuNj2/6isLGqrhjH8VvjY0ck\neU2SXxpr/rmx/XnjuG4Zx/6Cea/rlWP9rq6q54/7f9j42D2q6i3jsa6rqndV1b129D6Nz3t9kt+Z\nGMsJNe8vKPPfkxq8baz3zVX1+ap6TFWdONbmleO+PrLA8d5VVW+Z13ZWVb18vL/oOTPx3q6pqjcm\n+XdJ3jEe8x2LvWZgylprbm5ubknytSRPndd2fJJPL9Qnyd8l+ZXx/t5JDhvvr0vSkqyZeN6vJtmc\n5KFj3w8l+dPxsUcl2ZrkZ5PsmWFZxA8mjvO74/bRGf7Bf68kj09yWJI14/EuS/LSieO1JGcnuU+S\nR2eY4T1/PP59k3wxycbtvA/bHevEvh+2g/exJTkkycVJnj+2vSHJqRN9Hpzkh+Nrf0WSzy9Sm5bk\nk0n2G5/7TxP7Pnoc7yPH9+O1Sf523nPPG597ryQ/luSa8bj3HLefOPZ9aZILkhyY5B5J/neS0+fV\n9Y/H/fzk+L4+cqJOfzZv3M9M8uNJKsm/T3JrkkPHx45Icu1Yn3sn+dPJ9zbJH4413G8c40eS/LfO\n3+U7jCV3/j2e/548fazXPuNYH5nkgWPfU5O8YQfH+rkkVyapcXvfJN9N8qClnjNJ5rbV183Nbde7\nmXkGJn14nM29qapuSvK/dtD3B0keVlX7t9a2ttYu2EHf5yR5a2vtK621rUleneS4cWb2mCQfaa19\nurX2/Qyzhm3e8/+utfbh1toPW2vfba1d3Fq7oLV2W2vtaxlC3r+f95w3t9a+3Vq7NMkXknx8PP7N\nSf4yyfYu9tvRWHu1JL+d5Heq6h4LPP7cDIH5i0lOT/LoWvziwze31m5orV2RIVg+e2x/QYZQeVkb\nlin8fpLHTc4+j4/f0Fr7bpJnJbm2tfYHrb
XvtdZuaa1dOLGv32qtbWmt/XOGEHrMvNf++rEGn0vy\nuQwheuE3obVzWmtfboO/TvLxDLOqSfIfk/xJa+3S1tqtSV6/7XlVVUn+c5KXjeO+ZXxdiy2FuSsm\n35MfZAjoj8gQgi9rrV3TuZ+/yVDvba/rmAy/r1cv0PeunDPALkp4BiYd3VrbZ9stya/toO8JSX4i\nyZeq6u+r6lk76PugJF+f2P56hlnSteNjV257YAxS35r3/CsnN6rqJ6rqo1V1bQ1LOX4/yf7znnPd\nxP3vLrC99xLG2q21dm6SK5KcuMDDz03yvrHf1Un+OsMyjh2ZfA++Po4zGdb3njTxD54bMsyeHrCd\n5x6UZHtLTh6S5C8m9nVZkn/JHV/7tRP3b83238dU1ZFVdcG4POSmJM/I7XW6Q93n3b9/htnoiyfG\n8ldj+84y+Tv3iSTvSPJHSa6rqpOr6j49O2mttSRn5PZ/zPynjLVdwF05Z4BdlPAMLElr7fLW2rOT\nPCDJm5OcWVV75c6zxklydYZgts2Dk9yWIdBek2GZQJLhY8aS3G/+4eZtvzPJl5Ic0lq7T4b1trX0\nV9M91rvqtUl+K0MQTJJU1ZMyLOt49Rj+r03yxCTPXmR2+6B5Y9o2s3llkhdM/qOntXav1trfTvSf\nfP+uzLCUYiFXJjly3r7u2Vq7quO13qFG44z7BzMsw1k7/mPs3NxepzvUfd7r+2aGf+A8emIc922t\nbTeoL8Edxttae3tr7fEZlpH8RJLfXKjfdpyeYYb+IRlq+cEFD7j9c2aHYwN2LcIzsCRV9ctVdf/W\n2g+T3DQ2/0uSb2RYz/vQie6nJ3lZVR1cVXtnmCl+/7jM4MwkP19VT6rhIr7XZ/Eg/GNJvp1ka1U9\nIsl/2WkvbMdjvUva8BFll+SOs8obM6y3fVSSx423x2QI2EfuYHe/WVX7VtVBSV6S5P1j+7syBPFH\nJ0kNFzceu4P9fDTJv6mql44X5f1YVT1xYl9v3Lbko6ruX1VHdb7c65Ksq9svZNwzw7rpbyS5raqO\nTDL5cXEfSPK8qnpkVd07w3KdJMn4O/XHSd5WVQ8Yx3JAVT19W5/xArsNnWPboar6t1X1xKr60STf\nSfK9DL/L217XQ7f75GG8n83wOt+d5GOttZsW6reDc2a+RY8JzI7wDCzVEUkureETKE5Kcty4hvbW\nJG9M8v/GP7kfluQ9GS4I+1SSr2YIJy9KknFN8osy/On7miS3JLk+w8Vo2/MbGf48fkuGkPX+HfS9\nq7Y71iV6bYYL01JV98yw1vd/ttaunbh9dTzmjpZunJXhorZ/SHJOklOSpLX2FxlmMc8Yl7B8ITsI\n4eP64f+Q5OczLMG4PMmTx4dPynCR3ser6pYMFw8+caH9LODPx5/fqqrPjMd5cYaQfGOGep09MY6/\nTPL2DBdCbs5wMV1ye91fNbZfML6u/5vk4UlSVQdmuMj0ks6xLeY+GX6PbsywJOZbGWbMk+F9ftT4\nu/zhHezj9CRPTfJ/dtBnwXNmgX4nZZjJvrGq3n7XXgpwd9t2dTDALmGc7b0pw5KMr856PLuCqmoZ\n3o/Nsx7L3aWqHpkh+N9jsVn+qvrlDEs6Xj2VwQFMEJ6Bmauqn8/wUXKV5A8yzHYe2vwHKsnKDc9V\n9YsZZtH3SnJakh+21o6e7agAdsyyDWBXcFSGC+CuznAx3XGC86rwggxrhb+cYe3vzly7DnC3MPMM\nAACdzDwDAEAn4RkAADrdla+bnbr999+/rVu3bibH/s53vpO99lros+tZSdR55VPj1UGdVwd1Xvlm\nWeOLL774m621Rb/JdJcOz+vWrctFF100k2PPzc1lw4YNMzk206POK58arw7qvDqo88o3yxpX1dd7\n+lm2AQAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8\nAwBAJ+
EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQKc1sx4AAAALW7fp\nnFkPYapOPWKvWQ9hUWaeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQS\nngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA\n6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnP\nAADQSXgGAIBOwjMAAHRaNDxX1Xuq6vqq+sJE235VdV5VXT7+3Hdsr6p6e1VtrqrPV9WhE8/ZOPa/\nvKo23j0vBwAA7j49M8+nJjliXtumJOe31g5Jcv64nSRHJjlkvJ2Y5J3JELaTvC7JE5M8IcnrtgVu\nAADYXSwanltrn0pyw7zmo5KcNt4/LcnRE+3vbYMLkuxTVQ9M8vQk57XWbmit3ZjkvNw5kAMAwC5t\nzRKft7a1dk2StNauqaoHjO0HJLlyot+WsW177XdSVSdmmLXO2rVrMzc3t8QhLs/WrVtndmymR51X\nPjVeHdR5dViNdX7FY2+b9RCmaneo8VLD8/bUAm1tB+13bmzt5CQnJ8n69evbhg0bdtrg7oq5ubnM\n6thMjzqvfGq8Oqjz6rAa63z8pnNmPYSpOvWIvXb5Gi/10zauG5djZPx5/di+JclBE/0OTHL1DtoB\nAGC3sdTwfHaSbZ+YsTHJWRPtzx0/deOwJDePyzs+luRpVbXveKHg08Y2AADYbSy6bKOqTk+yIcn+\nVbUlw6dmvCnJB6rqhCRXJDl27H5ukmck2Zzk1iTPS5LW2g1V9XtJ/n7s919ba/MvQgQAgF3aouG5\ntfbs7Tx0+AJ9W5IXbmc/70nynrs0OgAA2IX4hkEAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMA\nAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2E\nZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAA\nOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIz\nAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACd\nhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkA\nADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAp2WF56p6WVVdWlVfqKrTq+qe\nVXVwVV1YVZdX1furas+x7z3G7c3j4+t2xgsAAIBpWXJ4rqoDkrw4yfrW2mOS7JHkuCRvTvK21toh\nSW5McsL4lBOS3Nhae1iSt439AABgt7HcZRtrktyrqtYkuXeSa5I8JcmZ4+OnJTl6vH/UuJ3x8cOr\nqpZ5fAAAmJolh+fW2lVJ3pLkigyh+eYkFye5qbV229htS5IDxvsHJLlyfO5tY//7LfX4AAAwbWuW\n+sSq2jfDbPLBSW5K8udJjlyga9v2lB08NrnfE5OcmCRr167N3NzcUoe4LFu3bp3ZsZkedV751Hh1\nUOfVYTXW+RWPvW3xTivI7lDjJYfnJE9N8tXW2jeSpKo+lORJSfapqjXj7PKBSa4e+29JclCSLeMy\nj/smuWH+TltrJyc5OUnWr1/fNmzYsIwhLt3c3FxmdWymR51XPjVeHdR5dViNdT5+0zmzHsJUnXrE\nXrt8jZez5vmKJIdV1b3HtcuHJ/likk8mOWbsszHJWeP9s8ftjI9/orV2
p5lnAADYVS1nzfOFGS78\n+0ySS8Z9nZzkVUleXlWbM6xpPmV8yilJ7je2vzzJpmWMGwAApm45yzbSWntdktfNa/5Kkics0Pd7\nSY5dzvEAAGCWfMMgAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAA\ndBKeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRn\nAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6\nCc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMA\nAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2E\nZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAA\nOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnZYVnqtqn6o6s6q+VFWXVdVPV9V+VXVeVV0+/tx3\n7FtV9faq2lxVn6+qQ3fOSwAAgOlY7szzSUn+qrX2iCQ/meSyJJuSnN9aOyTJ+eN2khyZ5JDxdmKS\ndy7z2AAAMFVLDs9VdZ8kP5fklCRprX2/tXZTkqOSnDZ2Oy3J0eP9o5K8tw0uSLJPVT1wySMHAIAp\nq9ba0p5Y9bgkJyf5YoZZ54uTvCTJVa21fSb63dha27eqPprkTa21T4/t5yd5VWvtonn7PTHDzHTW\nrl37+DPOOGNJ41uurVu3Zu+9957JsZkedV751Hh1UOfVYTXW+ZKrbp71EKbq4PvuMbMaP/nJT764\ntbZ+sX5rlnGMNUkOTfKi1tqFVXVSbl+isZBaoO1Oyb21dnKGUJ7169e3DRs2LGOISzc3N5dZHZvp\nUeeVT41XB3VeHVZjnY/fdM6shzBVpx6x1y5f4+Wsed6SZEtr7cJx+8wMYfq6bcsxxp/XT/Q/aOL5\nBya5ehnHBwCAqVpyeG6tXZvkyqp6+Nh0eIYlHGcn2Ti2bUxy1nj/7CTPHT9147AkN7fWrlnq8QEA\nYNqWs2wjSV6U5H1VtWeSryR5XoZA/oGqOiHJFUmOHfuem+QZSTYnuXXsCwAAu41lhefW2j8kWWhh\n9eEL9G1JXric4wEAwCz5hkEAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAA\nnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZ\nAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBO\nwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwA\nAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfh\nGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCA\nTsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAp2WH56rao6o+W1UfHbcPrqoLq+ryqnp/Ve05tt9j\n3N48Pr5uuccGAIBp2hkzzy9JctnE9puTvK21dkiSG5OcMLafkOTG1trDkrxt7AcAALuNZYXnqjow\nyTOTvHvcriRPSXLm2OW0JEeP948atzM+fvjYHwAAdgvLnXn+wySvTPLDcft+SW5qrd02bm9JcsB4\n/4AkVybJ+PjNY38AANgtrFnqE6vq
WUmub61dXFUbtjUv0LV1PDa53xOTnJgka9euzdzc3FKHuCxb\nt26d2bGZHnVe+dR4dVDn1WE11vkVj71t8U4ryO5Q4yWH5yQ/k+QXquoZSe6Z5D4ZZqL3qao14+zy\ngUmuHvtvSXJQki1VtSbJfZPcMH+nrbWTk5ycJOvXr28bNmxYxhCXbm5uLrM6NtOjziufGq8O6rw6\nrMY6H7/pnFkPYapOPWKvXb7GS1620Vp7dWvtwNbauiTHJflEa+05ST6Z5Jix28YkZ433zx63Mz7+\nidbanWaeAQBgV3V3fM7zq5K8vKo2Z1jTfMrYfkqS+43tL0+y6W44NgAA3G2Ws2zjX7XW5pLMjfe/\nkuQJC/T5XpJjd8bxAABgFnzDIAAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgG\nAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT\n8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMA\nQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4\nBgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCg\nk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwD\nAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT8AwAAJ2WHJ6r6qCq+mRVXVZVl1bVS8b2/arq\nvKq6fPy579heVfX2qtpcVZ+vqkN31osAAIBpWM7M821JXtFae2SSw5K8sKoelWRTkvNba4ckOX/c\nTpIjkxwy3k5M8s5lHBsAAKZuyeG5tXZNa+0z4/1bklyW5IAkRyU5bex2WpKjx/tHJXlvG1yQZJ+q\neuCSRw4AAFNWrbXl76RqXZJPJXlMkitaa/tMPHZja23fqvpokje11j49tp+f5FWttYvm7evEDDPT\nWbt27ePPOOOMZY9vKbZu3Zq99957JsdmetR55VPj1UGdV4fVWOdLrrp51kOYqoPvu8fMavzkJz/5\n4tba+sX6rVnugapq7yQfTPLS1tq3q2q7XRdou1Nyb62dnOTkJFm/fn3bsGHDcoe4JHNzc5nVsZke\ndV751Hh1UOfVYTXW+fhN58x6CFN16hF77fI1XtanbVTVj2YIzu9rrX1obL5u23KM8ef1Y/uWJAdN\nPP3AJFcv5/gAADBNy/m0jUpySpLLWmtvnXjo7CQbx/sbk5w10f7c8VM3Dktyc2vtmqUeHwAApm05\nyzZ+JsmvJLmkqv5hbHtNkjcl+UBVnZDkiiTHjo+dm+QZSTYnuTXJ85ZxbAAAmLolh+fxwr/tLXA+\nfIH+LckLl3o8AACYNd8wCAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4BgCATsIzAAB0Ep4BAKCT\n8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMA\nQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADoJzwAA0El4\nBgCATsIzAAB0Ep4BAKCT8AwAAJ2EZwAA6CQ8AwBAJ+EZAAA6Cc8AANBJeAYAgE7CMwAAdBKeAQCg\n05pZD2BXdclVN+f4TefMehhT9bU3PXPWQwAA2KWZeQYAgE7CMwAAdBKeAQCgk/AMAACdhGcAAOgk\nPAMAQCfhGQAAOgnPAADQSXgGAIBOwjMAAHQSngEAoJPwDAAAnYRnAADoJDwDAEAn4RkAADqtmfUA\n
AGC5Lrnq5hy/6ZxZD2OqvvamZ856CFO3GuvMrsfMMwAAdBKeAQCgk/AMAACdhGcAAOjkgkEA2A2t\nW4UXzr3isbMeAZh5BgCAbsIzAAB0Ep4BAKCT8AwAAJ2mfsFgVR2R5KQkeyR5d2vtTdMeA2yz2r6t\nyjeSsVK5kAyYlqnOPFfVHkn+KMmRSR6V5NlV9ahpjgEAAJZq2jPPT0iyubX2lSSpqjOSHJXki1Me\nB6xKPtoKAJZn2uH5gCRXTmxvSfLEKY+B7RCsAAB2rFpr0ztY1bFJnt5ae/64/StJntBae9FEnxOT\nnDhuPjzJP05tgHe0f5JvzujYTI86r3xqvDqo8+qgzivfLGv8kNba/RfrNO2Z5y1JDprYPjDJ1ZMd\nWmsnJzl5moNaSFVd1FpbP+txcPdS55VPjVcHdV4d1Hnl2x1qPO2Pqvv7JIdU1cFVtWeS45KcPeUx\nAADAkkx15rm1dltV/XqSj2X4qLr3tNYuneYYAABgqab+Oc+ttXOTnDvt4y7BzJeOMBXqvPKp8eqg\nzquDOq98u3yNp3rBIAAA7M58PTcAAHQSnuepqiOq6h+ranNVbZr1eNg5quqgqvpkVV1WVZdW1UvG\n9v2q6ryqunz8ue+sx8ryVNUeVfXZqvrouH1wVV041vj948XK7Maqap+qOrOqvjSe0z/tXF55qupl\n43+vv1BVp1fVPZ3Pu7+qek9VXV9VX5hoW/D8rcHbx0z2+ao6dHYjv53wPMHXh69otyV5RWvtkUkO\nS/LCsbabkpzfWjskyfnjNru3lyS5bGL7zUneNtb4xiQnzGRU7EwnJfmr1tojkvxkhno7l1eQqjog\nyYuTrG+tPSbDhwwcF+fzSnBqkiPmtW3v/D0yySHj7cQk75zSGHdIeL6jf/368Nba95Ns+/pwdnOt\ntWtaa58Z79+S4X+2B2So72ljt9OSHD2bEbIzVNWBSZ6Z5N3jdiV5SpIzxy5qvJurqvsk+bkkpyRJ\na+37rbWb4lxeidYkuVdVrUly7yTXxPm822utfSrJDfOat3f+HpXkvW1wQZJ9quqB0xnp9gnPd7TQ\n14cfMKOxcDepqnVJfirJhUnWttauSYaAneQBsxsZO8EfJnllkh+O2/dLclNr7bZx2zm9+3tokm8k\n+ZNxec67q2qvOJdXlNbaVUnekuSKDKH55iQXx/m8Um3v/N0lc5nwfEe1QJuPI1lBqmrvJB9M8tLW\n2rdnPR52nqp6VpLrW2sXTzYv0NU5vXtbk+TQJO9srf1Uku/EEo0VZ1zzelSSg5M8KMleGf6EP5/z\neWXbJf8bLjzf0aJfH87uq6p+NENwfl9r7UNj83Xb/gQ0/rx+VuNj2X4myS9U1dcyLLl6SoaZ6H3G\nP/smzumVYEuSLa21C8ftMzNwfFiBAAABZElEQVSEaefyyvLUJF9trX2jtfaDJB9K8qQ4n1eq7Z2/\nu2QuE57vyNeHr1Dj2tdTklzWWnvrxENnJ9k43t+Y5Kxpj42do7X26tbaga21dRnO3U+01p6T5JNJ\njhm7qfFurrV2bZIrq+rhY9PhSb4Y5/JKc0WSw6rq3uN/v7fV2fm8Mm3v/D07yXPHT904LMnN25Z3\nzJIvSZmnqp6RYbZq29eHv3HGQ2InqKqfTfI3SS7J7ethX5Nh3fMHkjw4w3+sj22tzb+Qgd1MVW1I\n8huttWdV1UMzzETvl+SzSX65tfbPsxwfy1NVj8twUeieSb6S5HkZJoOcyytIVb0+yS9l+LSkzyZ5\nfob1rs7n3VhVnZ5kQ5L9k1yX5HVJPpwFzt/xH07vyPDpHLcmeV5r7aJZjHuS8AwAAJ0s2wAAgE7C\nMwAAdBKeAQCgk/AMAACdhGcAAOgkPAMAQCfhGQAAOgnPAADQ6f8DwG9L6a6sP3QAAAAASUVORK5C\nYII=\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x7fb92dfb3748>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "plt.figure(figsize=(12,8))\n", | |
| "plt.title('Histogram of NA percentage, first visit')\n", | |
| "cols = df.columns[~df.columns.str.contains('-0')]#[col for col in df.columns if col.str.contains('-0')]\n", | |
| "df.drop(cols, axis=1, inplace=True)\n", | |
| "(df.isna().sum() / len(df) * 100).hist();" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 2nd pattern: multiple answers (one column per answer, most of them almost empty)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 41204-0.0\n", | |
| "1 41204-0.1\n", | |
| "2 41204-0.2\n", | |
| "3 41204-0.3\n", | |
| "4 41204-0.4\n", | |
| "5 41204-0.5\n", | |
| "6 41204-0.6\n", | |
| "7 41204-0.7\n", | |
| "8 41204-0.8\n", | |
| "9 41204-0.9\n", | |
| "10 41204-0.10\n", | |
| "11 41204-0.11\n", | |
| "12 41204-0.12\n", | |
| "13 41204-0.13\n", | |
| "14 41204-0.14\n", | |
| "15 41204-0.15\n", | |
| "16 41204-0.16\n", | |
| "17 41204-0.17\n", | |
| "18 41204-0.18\n", | |
| "19 41204-0.19\n", | |
| "20 41204-0.20\n", | |
| "21 41204-0.21\n", | |
| "22 41204-0.22\n", | |
| "23 41204-0.23\n", | |
| "24 41204-0.24\n", | |
| "25 41204-0.25\n", | |
| "26 41204-0.26\n", | |
| "27 41204-0.27\n", | |
| "28 41204-0.28\n", | |
| "29 41204-0.29\n", | |
| " ... \n", | |
| "405 41204-0.405\n", | |
| "406 41204-0.406\n", | |
| "407 41204-0.407\n", | |
| "408 41204-0.408\n", | |
| "409 41204-0.409\n", | |
| "410 41204-0.410\n", | |
| "411 41204-0.411\n", | |
| "412 41204-0.412\n", | |
| "413 41204-0.413\n", | |
| "414 41204-0.414\n", | |
| "415 41204-0.415\n", | |
| "416 41204-0.416\n", | |
| "417 41204-0.417\n", | |
| "418 41204-0.418\n", | |
| "419 41204-0.419\n", | |
| "420 41204-0.420\n", | |
| "421 41204-0.421\n", | |
| "422 41204-0.422\n", | |
| "423 41204-0.423\n", | |
| "424 41204-0.424\n", | |
| "425 41204-0.425\n", | |
| "426 41204-0.426\n", | |
| "427 41204-0.427\n", | |
| "428 41204-0.428\n", | |
| "429 41204-0.429\n", | |
| "430 41204-0.430\n", | |
| "431 41204-0.431\n", | |
| "432 41204-0.432\n", | |
| "433 41204-0.433\n", | |
| "434 41204-0.434\n", | |
| "Length: 435, dtype: object" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "pd.Series(df.columns[df.columns.str.contains('41204-')])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## 3rd pattern: pathological columns with > 67% NA\n", | |
| "\n", | |
| "For example: hierarchical questions" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df = pd.read_pickle(DATA2 + \"ukbb_fluid_intelligence_filtered.pkl\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "raw", | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "source": [ | |
| "# Warning! remove columns for fields 20002, 20008, 20009, 40001, 40002, 41202 and 41204\n", | |
| "cols = df.columns[~df.columns.str.contains(\"20002-\")\n", | |
| " & ~df.columns.str.contains(\"20008-\")\n", | |
| " & ~df.columns.str.contains(\"20009-\")\n", | |
| " & ~df.columns.str.contains(\"40001-\")\n", | |
| " & ~df.columns.str.contains(\"40002-\") \n", | |
| " & ~df.columns.str.contains(\"41202-\")\n", | |
| " & ~df.columns.str.contains(\"41204\")]\n", | |
| "df = df[cols]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Prediction: remove columns related to the response" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "target_column = '20016-0.0'\n", | |
| "lkd_to_tgt = ['20016-0.0', '20128-0.0', '4935-0.0', '4946-0.0',\n", | |
| " '4957-0.0', '4968-0.0', '4979-0.0', '4990-0.0', '5001-0.0',\n", | |
| " '5012-0.0', '5556-0.0', '5699-0.0', '5779-0.0', '5790-0.0',\n", | |
| " '5866-0.0']\n", | |
| "y = df[target_column].values.ravel()\n", | |
| "df = df.drop(columns=[target_column] + lkd_to_tgt, errors='ignore')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "missing = df.isna().sum() / len(df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "63.86554621848739" | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "(missing == 0).sum() / len(missing) * 100" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "iVBORw0KGgoAAAANSUhEUgAAAsMAAAHiCAYAAAANlMFMAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAIABJREFUeJzt3XuYZGddJ/DvjwyXkAkJ4TJCEhm5\niFyCXGa5CIszgBhIEHweERAwQdyoeAGfiATEBVTWuBqUXXfVCBg0wIARFBMEskCDrIAmXAwhIJcE\nQhISICSkA8ImvvvHOSOVdnq6011d1Zn383meerrOpc771u+cqv7WqbeqqrUWAADo0c3m3QEAAJgX\nYRgAgG4JwwAAdEsYBgCgW8IwAADdEoYBAOiWMAwbqKrOr6qd8+7HPFXVj1bVxVW1WFUPmHd/YFqq\n6qVVdfqNWL9V1d03sk/AjScMwxpV1UVV9Zgl846vqvfvmW6t3ae1trDCdraP/yS3bFBX5+33kvxC\na21ra+0jSxeO9/28qrrZxLzfqqrTlqx30Bio37bxXd54NzZI0Y9ZPSd08NwDqyIMw35uE/yju0uS\n81dY585JnrrCOj+W5FtJHltVd5pGx1ayCWq36Uy7JmoMzJswDBto8uxxVT24qs6pqq9X1eVV9Ypx\ntfeNf68az3w+rKpuVlUvrqrPV9UVVfXnVXXIxHZ/clz21ar69SXtvLSqzqiq06vq60mOH9v+QFVd\nVVWXVdUfVtUtJrbXquo5VfXpqrqmqn6zqu423ubrVfWmyfWX3Me99rWqbllVi0kOSPKxqvrsPkr1\n35O8bIVgdFySP07yz0mevkLdW1X9UlV9rqq+UlW/u+TM809V1QVV9bWqekdV3WXJbX++qj6d5NPj\nvPtU1dlVdeW47140cd9PqqrPjvviTVV12Lhsz1m346rqC2M/fm1cdnSSFyV5yrjPPzbOf9bYr2vG\nvv/Mkvv1q+P+u7Sqfrom3nYf6/17Y1uXV9UfV9WB+6rTxHZPG9c/e2z7vauoyfdN1ORTVfXj69ze\nD1TVP1XV1ePfH5hY/7Cq+rPxfn+tqv56YtmxVfXR8dj+h6q638SyF1TVJWMfPlVVjx7nL/dYTFU9\ndNzOVVX1sZoY5lRV3zPel2uq6uwkt1+hrs+f2F8/tWTZMVX1kbEPF1fVSycW7+054W5V9e7xOPtK\nVb2uqg5dxX1d9hjdWzv7uj+w32qtubi4rOGS5KIkj1ky7/gk79/bOkk+kOSZ4/WtSR46Xt+epCXZ\nMnG7n0rymSR3Hdd9c5K/GJfdO8likkckuUWGYQj/b6Kdl47TT8rwgvfAJA9K8tAkW8b2LkjyvIn2\nWpK3JrlNkvtkOAP7rrH9Q5J8Islxy9Rh2b5ObPvu+6hjS3KPJOcm+elx3m8lOW1ine9O8m/jfT8x\nyT+vsG9akvckOWy87b9MbPtJY3/vNdbjxUn+Ycltzx5ve2CSg5NcNrZ7q3H6IeO6z0vywSRHJLll\nkj9J8oYl+/VPx+18/1jXe03sp9OX9PuYJHdLUkl+MMk3kjxwXHZ0ki+N++fWSf5isrZJ/mDch4eN\nffzbJL+9ymP5tCTXJHnkeD9emRsex0trclCSi5M8a6zhA5N8Jcl91ri9w5J8Lckzx+09bZy+3bj+\nWUnemOS2SW6e5AfH+Q9MckWSh2R40XVchsfcLZPcc+zjnSf2x91WeCwenuSrSR6f4bHzQ+P0HSZu\n94px+48c7+Ppy9T06CSXJ7nvWK/XL9lfO5McNbZzv3HdJ+3jOeHuY39umeQOGYLsH4zL9nVfV3OM\nblnu2HBx6eEy9w64uNxUL+M/3cUkV01cvpHlw/D7krwsye2XbGdv//jeleQ5E9P3zBBwtyT5r3v+\nmY3Lbp3k27lhGH7fCn1/XpK3TEy3JA+fmD
43yQsmpk/Z8493L9tatq8T214pDN99DCBfGP9hLw3D\nL07y0fH6nZNcn+QBK2zz6Inp5yR513j975I8e2LZzcb9dpeJ2z5qYvnTknxkmXYuSPLoiek7Teyn\nPfv1iInl/5jkqRP7aa9BamL9v07y3PH6azIRbsea7aldJbk2YwAalz8syYWrPJZPS7J7YnrrWOMj\nl6nJU5L8/ZJt/EmSl6xxe89M8o9LtveBDC8u75ThhdBt99LvP0rym0vmfSrDC4m7ZwjKj0ly8yXr\nLPdYfEEmXsiN896RIWR/d5Lrkhw0sez1y+3DcX+dPDH9vdnHYyHDi5nfH6/vOXaWDakZXtR9ZOJY\nWO6+ruYYFYZdur4YJgHr86TW2qF7LhlC13KeneEf4ifHt4GP3ce6d07y+Ynpz2f457VtXHbxngWt\ntW9kOHs16eLJiar63qo6s6q+VMPQif+W//gW7+UT17+5l+mta+jrqrXW3pYhDJ+wl8U/meR143qX\nJnlvhoCyL5M1+PzYz2QYw/zK8W3wq5JcmSFMHr7MbY9MstwQj7skecvEti7IEPom7/uXJq5/I8vX\nMVX1uKr64Dj04KoMLxD27Kcb7Pcl1++Q4UXRuRN9efs4f7Umj6nFDHW5896WZ7jfD9nT1tje05N8\n1xq3t/QYyjh9eIb6X9la+9pe+nyXJCcu6ceRGc6QfibDi76XJrmiqnZX1Z72l3ss3iXJk5ds7xEZ\nAuSdk3yttXbtkj4uZ+n+usG6VfWQqnpPVX25qq5O8rPZx7CLqrrjeB8uGR/Dp+9Zf4X7uppjFLom\nDMOMtNY+3Vp7WpI7JvmdJGdU1UEZzswsdWmGf2J77DkrdXmGt+yP2LOghnGht1va3JLpP0ryyST3\naK3dJsN41Vr7vVl1X2+sFyf5tQzBLskwljTDMIoXjmH+SxneFn9a7XuM8ZFL+nTpeP3iJD8z+SKm\ntXZga+0fJtafrN/FGYYu7M3FSR63ZFu3aq1dsor7eoN9VFW3TPJXGYa9bBtfXL0t39lPN9jvS+7f\nVzK8YLnPRD8Oaa0tG7z34t+3V1VbMwxduHRi+dKavHfJ/d7aWvu5NW5v6TGUDPvskrGtwybHxy7p\nx8uX9OPWrbU3JElr7fWttUeM224ZHnf7eixenOHM8OT2DmqtnZyh/rcd15vs43Iuy388Bie9PsOw\nliNba4dkGA+/Z1/v7Tnht8f59xsfw8+YWH/Z+5p9H6N7awe6IwzDjFTVM6rqDq21f8swpCIZztB8\nOcPbwHedWP0NSX55/MDO1gxnct/YWrsuyRlJnjB+4OgWGd7uXSnYHpzk60kWq+r7kvzcCuvfGPvq\n643Shq+hOy83POt7XIbxpfdOcv/xct8Mgflx+9jc86vqtlV1ZJLnZhhzmgyh44VVdZ8kqeHDfk/e\nx3bOTPJdVfW8Gj6kdnBVPWRiWy+v8cNhVXWHqnriKu/u5Um213c+2HeLDENEvpzkuqp6XJLHTqz/\npiTPqqp7VdWtMwyXSZKMx9SfJvn9qrrj2JfDq+qH96xTw4fWdu6jP4+vqkeMx9RvJvlQa+3iZdY9\nM8n3VtUzq+rm4+U/VdW91ri9t43b+4mq2lJVT8mwv89srV2WYWjL/x73582r6pHj7f40yc+OZ1mr\nhq/fO2bcR/esqkeNLzL+NcOLhevHWiz3WDw9w2Prh6vqgKq6VVXtrKojWmufT3JOhg963qKqHpHk\nCfuo55syfHj13uP+esmS5QdnOOP9r1X14CQ/MbFsb88JB2ccllVVhyd5/p4F+7qv2fcxurd2oDvC\nMMzO0UnOr+EbFl6ZYezov47DHF6e5P+Ob2U+NMN4w7/IMLbxwgz/4H4xSVpr54/Xd2c4+3RNhvGC\n39pH27+S4Z/tNRkCxBv3se6NtWxf1+jFGc4ipqpuleTHk/zP1tqXJi4Xjm3ua6jE32QY+/zRDB/A\nenWStN
bekuGs2e7x7eaPZx+hurV2TYYPLj0hw5CHTyfZNS5+ZYaze++sqmsyfFDpIXvbzl785fj3\nq1X14bGdX8oQor6WYX+9daIff5fkf2T4YOBnMoypTb6z318wzv/geL/+T4bx26mqIzIEqfP20Z/X\nZwhsV2b4wOWy39gx9vWxGb4O79IMdfmdDGF+Ldv7apJjM3xI8atJfjXJsa21r4yrPDPDONdPZjjW\nnzfe7pwk/yXJH2ao2WcyjDPO2JeTM5w1/1KGs8AvGpct91i8OMkTx/W+nOGs6vPznf+VP5Fh/145\n3rc/38d9+rsM44DfPfbr3UtWeU6S3xiPm/+aYb/vue3enhNeluEDg1dnOJ7fPLGtfd3XZY/RZdqB\n7lRr3iWBm7LxbOxVGYZAXDjv/mwGVdUy1OMz8+7LRhnPwn48yS1XOgtfVc/IMITihcssPy3JF1tr\nL55S36a6PYCN5MvO4Saoqp6Q4VscKsMY0/MyfHMF+7Gq+tEMZwUPynAm9m9XMxylteaX7gCWYZgE\n3DQ9McPb05dm+HDZU5u3eXrwMxnevv9shjGh0xz7DdAlwyQAAOiWM8MAAHRLGAYAoFsz/QDd7W9/\n+7Z9+/ZZNplrr702Bx100MorMlXqPh/qPj9qPx/qPh/qPj9qv3rnnnvuV1prK/4S50zD8Pbt23PO\nOefMssksLCxk586dM20TdZ8XdZ8ftZ8PdZ8PdZ8ftV+9qtrXT6b/O8MkAADoljAMAEC3hGEAALol\nDAMA0C1hGACAbgnDAAB0SxgGAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwDANAtYRgAgG4JwwAA\ndEsYBgCgW8IwAADdEoYBAOiWMAwAQLeEYQAAurVl3h2Yhe0nnTXvLszcRScfM+8uAABses4MAwDQ\nLWEYAIBuCcMAAHRLGAYAoFvCMAAA3RKGAQDoljAMAEC3hGEAALolDAMA0C1hGACAbgnDAAB0SxgG\nAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwDANAtYRgAgG4JwwAAdEsYBgCgW8IwAADdEoYBAOiW\nMAwAQLeEYQAAuiUMAwDQLWEYAIBuCcMAAHRrxTBcVa+pqiuq6uMT8363qj5ZVf9cVW+pqkM3tpsA\nADB9qzkzfFqSo5fMOzvJfVtr90vyL0leOOV+AQDAhlsxDLfW3pfkyiXz3tlau26c/GCSIzagbwAA\nsKGqtbbySlXbk5zZWrvvXpb9bZI3ttZOX+a2JyQ5IUm2bdv2oN27d6+nvzfa4uJiLrz6+pm2uRkc\ndfghc21/cXExW7dunWsfeqTu86P286Hu86Hu86P2q7dr165zW2s7Vlpvy3oaqapfS3Jdktctt05r\n7dQkpybJjh072s6dO9fT5I22sLCQU95/7Uzb3AwuevrOuba/sLCQWe9r1H2e1H4+1H0+1H1+1H76\n1hyGq+q4JMcmeXRbzellAADYZNYUhqvq6CQvSPKDrbVvTLdLAAAwG6v5arU3JPlAkntW1Rer6tlJ\n/jDJwUnOrqqPVtUfb3A/AQBg6lY8M9xae9peZr96A/oCAAAz5RfoAADoljAMAEC3hGEAALolDAMA\n0C1hGACAbgnDAAB0SxgGAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwDANAtYRgAgG4JwwAAdEsY\nBgCgW8IwAADdEoYBAOiWMAwAQLeEYQAAuiUMAwDQLWEYAIBuCcMAAHRLGAYAoFvCMAAA3RKGAQDo\nljAMAEC3hGEAALolDAMA0C1hGACAbgnDAAB0SxgGAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwD\nANAtYRgAgG4JwwAAdEsYBgCgW8IwAADdEoYBAOiWMAwAQLeEYQAAuiUMAwDQLWEYAIBuCcMAAHRL\nGAYAoFvCMAAA3RKGAQDoljAMAEC3hGEAALolDAMA0C1hGACAbgnDAAB0
a8UwXFWvqaorqurjE/MO\nq6qzq+rT49/bbmw3AQBg+lZzZvi0JEcvmXdSkne11u6R5F3jNAAA3KSsGIZba+9LcuWS2U9M8trx\n+muTPGnK/QIAgA231jHD21prlyXJ+PeO0+sSAADMRrXWVl6panuSM1tr9x2nr2qtHTqx/Guttb2O\nG66qE5KckCTbtm170O7du6fQ7dVbXFzMhVdfP9M2N4OjDj9kru0vLi5m69atc+1Dj9R9ftR+PtR9\nPtR9ftR+9Xbt2nVua23HSuttWeP2L6+qO7XWLquqOyW5YrkVW2unJjk1SXbs2NF27ty5xibXZmFh\nIae8/9qZtrkZXPT0nXNtf2FhIbPe16j7PKn9fKj7fKj7/Kj99K11mMRbkxw3Xj8uyd9MpzsAADA7\nq/lqtTck+UCSe1bVF6vq2UlOTvJDVfXpJD80TgMAwE3KisMkWmtPW2bRo6fcFwAAmCm/QAcAQLeE\nYQAAuiUMAwDQLWEYAIBuCcMAAHRLGAYAoFvCMAAA3RKGAQDoljAMAEC3hGEAALolDAMA0C1hGACA\nbgnDAAB0SxgGAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwDANAtYRgAgG4JwwAAdEsYBgCgW8Iw\nAADdEoYBAOiWMAwAQLeEYQAAuiUMAwDQLWEYAIBuCcMAAHRLGAYAoFvCMAAA3RKGAQDoljAMAEC3\nhGEAALolDAMA0C1hGACAbgnDAAB0SxgGAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwDANAtYRgA\ngG4JwwAAdEsYBgCgW8IwAADdEoYBAOiWMAwAQLeEYQAAuiUMAwDQLWEYAIBuCcMAAHRLGAYAoFvC\nMAAA3VpXGK6qX66q86vq41X1hqq61bQ6BgAAG23NYbiqDk/yS0l2tNbum+SAJE+dVscAAGCjrXeY\nxJYkB1bVliS3TnLp+rsEAACzseYw3Fq7JMnvJflCksuSXN1ae+e0OgYAAButWmtru2HVbZP8VZKn\nJLkqyV8mOaO1dvqS9U5IckKSbNu27UG7d+9eV4dvrMXFxVx49fUzbXMzOOrwQ+ba/uLiYrZu3TrX\nPvRI3edH7edD3edD3edH7Vdv165d57bWdqy03pZ1tPGYJBe21r6cJFX15iQ/kOQGYbi1dmqSU5Nk\nx44dbefOneto8sZbWFjIKe+/dqZtbgYXPX3nXNtfWFjIrPc16j5Paj8f6j4f6j4/aj996xkz/IUk\nD62qW1dVJXl0kgum0y0AANh46xkz/KEkZyT5cJLzxm2dOqV+AQDAhlvPMIm01l6S5CVT6gsAAMyU\nX6ADAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwDANAtYRgAgG4JwwAAdEsYBgCgW8IwAADdEoYB\nAOiWMAwAQLeEYQAAuiUMAwDQLWEYAIBuCcMAAHRLGAYAoFvCMAAA3RKGAQDoljAMAEC3hGEAALol\nDAMA0C1hGACAbgnDAAB0SxgGAKBbwjAAAN0ShgEA6NaWeXcAAKAH2086a93bOPGo63L8FLYzKxed\nfMy8u7AiZ4YBAOiWMAwAQLeEYQAAuiUMAwDQLWEYAIBuCcMAAHRLGAYAoFvCMAAA3RKGAQDoljAM\nAEC3hGEAALolDAMA0C1hGACAbgnDAAB0SxgGAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwDANAt\nYRgAgG4JwwAAdEsYBgCgW8IwAADdEoYBAOiWMAwAQLeEYQAAurWuMFxVh1bVGVX1yaq6oKoeNq2O\nAQDARtuyztu/MsnbW2s/VlW3SHLrKfQJAABmYs1huKpuk+SRSY5Pktbat5N8ezrdAgCAjbeeYRJ3\nTfLlJH9WVR+pqldV1UFT6hcAAGy4aq2t7YZVO5J8MMnDW2sfqqpXJvl6a+3Xl6x3QpITkmTbtm0P\n2r179zq7fOMsLi7mwquvn2mbm8FR
hx8y1/YXFxezdevWufahB+ddcvUNprcdmFz+zTl1ZkbmfWwv\nxzE/XUuP7eXsL8f8Zj2ul+N4X5vVHtf7clM75ud5bO/atevc1tqOldZbTxj+riQfbK1tH6f/c5KT\nWmvHLHebHTt2tHPOOWdN7a3VwsJCjn/7tTNtczO46ORld8NMLCwsZOfOnXPtQw+2n3TWDaZPPOq6\nnHLeej8KsLnN+9hejmN+upYe28vZX475zXpcL8fxvjarPa735aZ2zM/z2K6qVYXhNQ+TaK19KcnF\nVXXPcdajk3xirdsDAIBZW+9Li19M8rrxmyQ+l+RZ6+8SAADMxrrCcGvto0lWPP0MAACbkV+gAwCg\nW8IwAADdEoYBAOiWMAwAQLeEYQAAuiUMAwDQLWEYAIBuCcMAAHRLGAYAoFvCMAAA3RKGAQDoljAM\nAEC3hGEAALolDAMA0C1hGACAbgnDAAB0SxgGAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwDANAt\nYRgAgG4JwwAAdEsYBgCgW8IwAADdEoYBAOiWMAwAQLeEYQAAuiUMAwDQLWEYAIBuCcMAAHRLGAYA\noFvCMAAA3RKGAQDoljAMAEC3hGEAALolDAMA0C1hGACAbgnDAAB0SxgGAKBbwjAAAN0ShgEA6JYw\nDABAt4RhAAC6JQwDANAtYRgAgG4JwwAAdEsYBgCgW8IwAADdEoYBAOiWMAwAQLeEYQAAuiUMAwDQ\nLWEYAIBurTsMV9UBVfWRqjpzGh0CAIBZmcaZ4ecmuWAK2wEAgJlaVxiuqiOSHJPkVdPpDgAAzE61\n1tZ+46ozkvx2koOT/Epr7di9rHNCkhOSZNu2bQ/avXv3mttbi8XFxVx49fUzbZNk24HJ5d+cbZtH\nHX7IbBvcBM675OobTM+j7rO2Wffz4uJitm7dOu9u7DeWHtvL2V+O+c16XC/H8b42qz2u9+WmdszP\n89jetWvXua21HSutt2WtDVTVsUmuaK2dW1U7l1uvtXZqklOTZMeOHW3nzmVX3RALCws55f3XzrRN\nkhOPui6nnLfmw2tNLnr6zpm2txkcf9JZN5ieR91nbbPu54WFhcz6+W1/tvTYXs7+csxv1uN6OY73\ntVntcb0vN7Vj/qZwbK9nmMTDk/xIVV2UZHeSR1XV6VPpFQAAzMCaw3Br7YWttSNaa9uTPDXJu1tr\nz5hazwAAYIP5nmEAALo1lUEnrbWFJAvT2BYAAMyKM8MAAHRLGAYAoFvCMAAA3RKGAQDoljAMAEC3\nhGEAALolDAMA0C1hGACAbgnDAAB0SxgGAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwDANAtYRgA\ngG4JwwAAdEsYBgCgW8IwAADdEoYBAOiWMAwAQLeEYQAAuiUMAwDQLWEYAIBuCcMAAHRLGAYAoFtb\n5t0BAOjd9pPOmncXbpQTj7oux6+zzxedfMyUegPr48wwAADdEoYBAOiWMAwAQLeEYQAAuiUMAwDQ\nLWEYAIBuCcMAAHRLGAYAoFvCMAAA3RKGAQDoljAMAEC3hGEAALolDAMA0C1hGACAbgnDAAB0SxgG\nAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwDANAtYRgAgG4JwwAAdEsYBgCgW8IwAADdEoYBAOiW\nMAwAQLfWHIar6siqek9VXVBV51fVc6fZMQAA2Ghb1nHb65Kc2Fr7cFUdnOTcqjq7tfaJKfUNAAA2\n1JrPDLfWLmutfXi8fk2SC5IcPq2OAQDARpvKmOGq2p7kAUk+NI3tAQDALFRrbX0bqNqa5L1JXt5a\ne/Nelp+Q5IQk2bZt24N27969rvZurMXFxVx49fUzbZNk24HJ5d+cbZtHHX7IbBvcBM675OobTM+j\n7rO2Wffz4uJitm7duiHbXrqf+Y4ejvnNaBp136yP5Y00jcfyTe2Yn+d+3rVr17mttR0rrbeuMFxV\n
N09yZpJ3tNZesdL6O3bsaOecc86a21uLhYWFHP/2a2faJsmJR12XU85bz5D0G++ik4+ZaXubwfaT\nzrrB9DzqPmubdT8vLCxk586dG7LtpfuZ7+jhmN+MplH3zfpY3kjTeCzf1I75ee7nqlpVGF7Pt0lU\nklcnuWA1QRgAADab9YwZfniSZyZ5VFV9dLw8fkr9AgCADbfm8+yttfcnqSn2BQAAZsov0AEA0C1h\nGACAbgnDAAB0SxgGAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwDANAtYRgAgG4JwwAAdEsYBgCg\nW8IwAADdEoYBAOiWMAwAQLeEYQAAuiUMAwDQLWEYAIBuCcMAAHRLGAYAoFvCMAAA3RKGAQDoljAM\nAEC3hGEAALolDAMA0C1hGACAbgnDAAB0a8u8OwDTsv2ks+bdBWZgs+7nE4+6Lsdv0r4BsDxnhgEA\n6JYwDABAt4RhAAC6JQwDANAtYRgAgG4JwwAAdEsYBgCgW8IwAADdEoYBAOiWMAwAQLeEYQAAuiUM\nAwDQLWEYAIBuCcMAAHRLGAYAoFvCMAAA3RKGAQDoljAMAEC3hGEAALolDAMA0C1hGACAbgnDAAB0\nSxgGAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6ta4wXFVHV9WnquozVXXStDoFAACzsOYwXFUHJPlf\nSR6X5N5JnlZV955WxwAAYKOt58zwg5N8prX2udbat5PsTvLE6XQLAAA23nrC8OFJLp6Y/uI4DwAA\nbhKqtba2G1Y9OckPt9Z+epx+ZpIHt9Z+ccl6JyQ5YZy8Z5JPrb27a3L7JF+ZcZuo+7yo+/yo/Xyo\n+3yo+/yo/erdpbV2h5VW2rKOBr6Y5MiJ6SOSXLp0pdbaqUlOXUc761JV57TWdsyr/V6p+3yo+/yo\n/Xyo+3yo+/yo/fStZ5jEPyW5R1V9T1XdIslTk7x1Ot0CAICNt+Yzw62166rqF5K8I8kBSV7TWjt/\naj0DAIANtp5hEmmtvS3J26bUl40ytyEanVP3+VD3+VH7+VD3+VD3+VH7KVvzB+gAAOCmzs8xAwDQ\nrf02DPup6NmpqtdU1RVV9fGJeYdV1dlV9enx723n2cf9UVUdWVXvqaoLqur8qnruOF/tN1BV3aqq\n/rGqPjbW/WXj/O+pqg+NdX/j+MFipqyqDqiqj1TVmeO0us9AVV1UVedV1Uer6pxxnueaDVZVh1bV\nGVX1yfG5/mHqPn37ZRj2U9Ezd1qSo5fMOynJu1pr90jyrnGa6bouyYmttXsleWiSnx+Pc7XfWN9K\n8qjW2vcnuX+So6vqoUl+J8nvj3X/WpJnz7GP+7PnJrlgYlrdZ2dXa+3+E1/r5blm470yydtba9+X\n5PszHPvqPmX7ZRiOn4qeqdbEbIaSAAAC00lEQVTa+5JcuWT2E5O8drz+2iRPmmmnOtBau6y19uHx\n+jUZniQPj9pvqDZYHCdvPl5akkclOWOcr+4boKqOSHJMkleN0xV1nyfPNRuoqm6T5JFJXp0krbVv\nt9auirpP3f4ahv1U9Pxta61dlgyhLckd59yf/VpVbU/ygCQfitpvuPGt+o8muSLJ2Uk+m+Sq1tp1\n4yqeczbGHyT51ST/Nk7fLuo+Ky3JO6vq3PGXZRPPNRvtrkm+nOTPxqFBr6qqg6LuU7e/huHayzxf\nm8F+qaq2JvmrJM9rrX193v3pQWvt+tba/TP88uaDk9xrb6vNtlf7t6o6NskVrbVzJ2fvZVV13xgP\nb609MMPww5+vqkfOu0Md2JLkgUn+qLX2gCTXxpCIDbG/huFV/VQ0G+ryqrpTkox/r5hzf/ZLVXXz\nDEH4da21N4+z1X5GxrcsFzKM2T60qvZ8d7vnnOl7eJIfqaqLMgx9e1SGM8XqPgOttUvHv1ckeUuG\nF4GeazbWF5N8sbX2oXH6jAzhWN2nbH8Nw34qev7emuS48fpxSf
5mjn3ZL43jJV+d5ILW2ismFqn9\nBqqqO1TVoeP1A5M8JsN47fck+bFxNXWfstbaC1trR7TWtmd4Tn93a+3pUfcNV1UHVdXBe64neWyS\nj8dzzYZqrX0pycVVdc9x1qOTfCLqPnX77Y9uVNXjM5w12PNT0S+fc5f2W1X1hiQ7k9w+yeVJXpLk\nr5O8Kcl3J/lCkie31pZ+yI51qKpHJPn7JOflO2MoX5Rh3LDab5Cqul+GD60ckOGEwptaa79RVXfN\ncMbysCQfSfKM1tq35tfT/VdV7UzyK621Y9V94401fss4uSXJ61trL6+q28VzzYaqqvtn+MDoLZJ8\nLsmzMj7vRN2nZr8NwwAAsJL9dZgEAACsSBgGAKBbwjAAAN0ShgEA6JYwDABAt4RhAAC6JQwDANAt\nYRgAgG79f47JGFAYwqEnAAAAAElFTkSuQmCC\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x7fb945a84438>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# Explicit figure/axes so the title, labels and histogram all target the same axes.\n", | |
| "fig, ax = plt.subplots(figsize=(12, 8))\n", | |
| "ax.set_title('Histogram of NA percentage, preprocessed dataset')\n", | |
| "ax.set_xlabel('NA percentage (%)')\n", | |
| "ax.set_ylabel('Count')\n", | |
| "\n", | |
| "# Keep only the non-zero entries of `missing` and rescale by 100 (percent, per the title).\n", | |
| "(missing[missing > 0] * 100).hist(ax=ax);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## All structural NAs are gone; now try imputation strategies" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "import sys\n", | |
| "\n", | |
| "# Location of the scikit-learn development checkout to import from.\n", | |
| "# Set the SKLEARN_DEV_PATH environment variable to use a checkout elsewhere;\n", | |
| "# the default keeps the original machine-specific location.\n", | |
| "sklearn_dev_path = os.environ.get('SKLEARN_DEV_PATH', '/home/parietal/jvandenb/scikit-learn/')\n", | |
| "\n", | |
| "# Only prepend when the checkout is not already first on the path, so re-running\n", | |
| "# this cell does not pile up duplicate entries.\n", | |
| "if sys.path[:1] != [sklearn_dev_path]:\n", | |
| "    sys.path.insert(0, sklearn_dev_path)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import sklearn" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": { | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['/home/parietal/jvandenb/scikit-learn/sklearn']" | |
| ] | |
| }, | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "sklearn.__path__" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.preprocessing import CategoricalEncoder\n", | |
| "from sklearn.preprocessing import StandardScaler\n", | |
| "from sklearn.impute import SimpleImputer, MICEImputer\n", | |
| "from sklearn.linear_model import RidgeCV\n", | |
| "from sklearn.model_selection import KFold\n", | |
| "from sklearn.model_selection import cross_val_score\n", | |
| "#from sklearn.compose import ColumnTransformer\n", | |
| "from sklearn.pipeline import Pipeline, make_pipeline" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Benchmark: mode/mean imputation" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Boolean column masks by dtype: pandas 'category' columns vs. everything else.\n", | |
| "one_hot_cols = df.dtypes == 'category'\n", | |
| "num_cols = ~one_hot_cols\n", | |
| "encoder_type = {'num': num_cols}\n", | |
| "\n", | |
| "# Baseline: mean-impute every column, standardize, then fit RidgeCV.\n", | |
| "# The per-dtype ColumnTransformer variant (most-frequent imputation + one-hot encoding\n", | |
| "# for categorical columns, mean imputation for the rest) is disabled, so `encoder_type`\n", | |
| "# is not consumed by the pipeline below.\n", | |
| "pipeline = Pipeline(steps=[\n", | |
| "    ('preprocess', SimpleImputer(strategy='mean')),\n", | |
| "    ('scaler', StandardScaler(with_mean=True)),\n", | |
| "    ('clf', RidgeCV()),\n", | |
| "])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 43, | |
| "metadata": { | |
| "scrolled": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "r2 score: mean: 0.245; std: 0.015\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# 3-fold shuffled cross-validation with a fixed seed so the folds are reproducible.\n", | |
| "cv = KFold(n_splits=3, shuffle=True, random_state=12)\n", | |
| "scoring = 'r2'\n", | |
| "scores = cross_val_score(pipeline, df, y, scoring=scoring, cv=cv, n_jobs=-1)\n", | |
| "\n", | |
| "# Summarize the per-fold scores.\n", | |
| "score_mean, score_std = np.mean(scores), np.std(scores)\n", | |
| "print('%s score: mean: %.3f; std: %.3f\\n' % (scoring, score_mean, score_std))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Variable importance" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "pipeline.fit(df, y)\n", | |
| "clf = pipeline.named_steps['clf']\n", | |
| "imputer = pipeline.named_steps['preprocess']\n", | |
| "\n", | |
| "# The 'preprocess' step is a plain imputer, not a ColumnTransformer, so there is no\n", | |
| "# `named_transformers_` / one-hot encoder to unpack: the model is fit on the input\n", | |
| "# columns themselves. SimpleImputer discards columns that were entirely missing at\n", | |
| "# fit time (their fitted statistic is NaN), so keep only the surviving column names.\n", | |
| "statistics = getattr(imputer, 'statistics_', None)\n", | |
| "if statistics is None:\n", | |
| "    transformed_columns = df.columns.tolist()\n", | |
| "else:\n", | |
| "    transformed_columns = df.columns[~np.isnan(statistics)].tolist()\n", | |
| "\n", | |
| "# Fail loudly if names and coefficients ever get out of sync.\n", | |
| "coef_values = np.ravel(clf.coef_)\n", | |
| "assert len(coef_values) == len(transformed_columns), 'coefficient / feature-name mismatch'\n", | |
| "\n", | |
| "coef = pd.Series(coef_values, index=transformed_columns)\n", | |
| "# Order by absolute value, largest first (positional indexing; labels are column names).\n", | |
| "coef_sorted = coef.iloc[np.argsort(coef.abs().values)[::-1]]\n", | |
| "\n", | |
| "ax = coef_sorted[:30].plot.bar(figsize=(20, 20))\n", | |
| "ax.set_title('30 largest coefficients by absolute value')\n", | |
| "ax.set_ylabel('Coefficient');" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Better imputation: MICE (multiple imputation by chained equations)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 51, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Per-dtype encoders for the ColumnTransformer variant (currently disabled, see below).\n", | |
| "encoder_dict = {\n", | |
| "    'one-hot': make_pipeline(\n", | |
| "        MICEImputer(n_imputations=10),\n", | |
| "        CategoricalEncoder(handle_unknown='ignore', encoding='onehot-dense'),\n", | |
| "    ),\n", | |
| "    'num': MICEImputer(n_imputations=10),\n", | |
| "}\n", | |
| "\n", | |
| "# Boolean column masks by dtype: pandas 'category' columns vs. everything else.\n", | |
| "one_hot_cols = df.dtypes == 'category'\n", | |
| "num_cols = ~one_hot_cols\n", | |
| "encoder_type = {'one-hot': one_hot_cols, 'num': num_cols}\n", | |
| "\n", | |
| "# MICE-impute every column, standardize, then fit RidgeCV. `encoder_dict` and\n", | |
| "# `encoder_type` are not consumed here because the ColumnTransformer step is disabled.\n", | |
| "# NOTE(review): no seed is passed to MICEImputer; if it samples randomly, results will\n", | |
| "# differ between runs -- confirm and seed it if this version supports that.\n", | |
| "pipeline = Pipeline(steps=[\n", | |
| "    ('preprocess', MICEImputer(n_imputations=10)),\n", | |
| "    ('scaler', StandardScaler(with_mean=True)),\n", | |
| "    ('clf', RidgeCV()),\n", | |
| "])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "pipeline.fit(df, y)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 47, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "ename": "KeyboardInterrupt", | |
| "evalue": "", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", | |
| "\u001b[0;32m<ipython-input-47-61ac726e4e58>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0mcv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKFold\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_splits\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m12\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mscoring\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'r2'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcross_val_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpipeline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscoring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m print('%s score: mean: %.3f; std: %.3f\\n'\n\u001b[1;32m 24\u001b[0m % (scoring, np.mean(scores), np.std(scores)))\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36mcross_val_score\u001b[0;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)\u001b[0m\n\u001b[1;32m 362\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mverbose\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0mfit_params\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 364\u001b[0;31m pre_dispatch=pre_dispatch)\n\u001b[0m\u001b[1;32m 365\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcv_results\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'test_score'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 366\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36mcross_validate\u001b[0;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator)\u001b[0m\n\u001b[1;32m 217\u001b[0m \u001b[0mfit_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreturn_train_score\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_train_score\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 218\u001b[0m return_times=True, return_estimator=return_estimator)\n\u001b[0;32m--> 219\u001b[0;31m for train, test in cv.split(X, y, groups))\n\u001b[0m\u001b[1;32m 220\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[0mzipped_scores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mscores\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 777\u001b[0m \u001b[0;31m# was dispatched. In particular this covers the edge\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 778\u001b[0m \u001b[0;31m# case of Parallel used with an exhausted iterator.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 779\u001b[0;31m \u001b[0;32mwhile\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdispatch_one_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 780\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 781\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/parallel.py\u001b[0m in \u001b[0;36mdispatch_one_batch\u001b[0;34m(self, iterator)\u001b[0m\n\u001b[1;32m 623\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 624\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 625\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dispatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 626\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 627\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/parallel.py\u001b[0m in \u001b[0;36m_dispatch\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 586\u001b[0m \u001b[0mdispatch_timestamp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0mcb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBatchCompletionCallBack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdispatch_timestamp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 588\u001b[0;31m \u001b[0mjob\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 589\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jobs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 590\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36mapply_async\u001b[0;34m(self, func, callback)\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[0;34m\"\"\"Schedule a func to be run\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 111\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mImmediateResult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 112\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 330\u001b[0m \u001b[0;31m# Don't delay the application, to avoid keeping the input\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[0;31m# arguments in memory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 332\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 333\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 334\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 131\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 132\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/parallel.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 131\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 132\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36m_fit_and_score\u001b[0;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)\u001b[0m\n\u001b[1;32m 485\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 486\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 487\u001b[0;31m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 488\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 489\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/pipeline.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 246\u001b[0m \u001b[0mThis\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \"\"\"\n\u001b[0;32m--> 248\u001b[0;31m \u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfit_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 249\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_final_estimator\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_final_estimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/pipeline.py\u001b[0m in \u001b[0;36m_fit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 211\u001b[0m Xt, fitted_transformer = fit_transform_one_cached(\n\u001b[1;32m 212\u001b[0m \u001b[0mcloned_transformer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mXt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 213\u001b[0;31m **fit_params_steps[name])\n\u001b[0m\u001b[1;32m 214\u001b[0m \u001b[0;31m# Replace the transformer of the step with the fitted\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[0;31m# transformer. This is necessary when loading the transformer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/memory.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 362\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 363\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mcall_and_shelve\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/pipeline.py\u001b[0m in \u001b[0;36m_fit_transform_one\u001b[0;34m(transformer, weight, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 588\u001b[0m **fit_params):\n\u001b[1;32m 589\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtransformer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'fit_transform'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 590\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtransformer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 591\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 592\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtransformer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtransform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/impute.py\u001b[0m in \u001b[0;36mfit_transform\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m 781\u001b[0m X_filled, predictor = self._impute_one_feature(\n\u001b[1;32m 782\u001b[0m \u001b[0mX_filled\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmask_missing_values\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeat_idx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mneighbor_feat_idx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 783\u001b[0;31m predictor=None, fit_mode=True)\n\u001b[0m\u001b[1;32m 784\u001b[0m predictor_triplet = MICETriplet(feat_idx,\n\u001b[1;32m 785\u001b[0m \u001b[0mneighbor_feat_idx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/impute.py\u001b[0m in \u001b[0;36m_impute_one_feature\u001b[0;34m(self, X_filled, mask_missing_values, feat_idx, neighbor_feat_idx, predictor, fit_mode)\u001b[0m\n\u001b[1;32m 533\u001b[0m y_train = safe_indexing(X_filled[:, feat_idx],\n\u001b[1;32m 534\u001b[0m ~missing_row_mask)\n\u001b[0;32m--> 535\u001b[0;31m \u001b[0mpredictor\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 536\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 537\u001b[0m \u001b[0;31m# get posterior samples\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/linear_model/bayes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 191\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 192\u001b[0m \u001b[0mXT_y\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 193\u001b[0;31m \u001b[0mU\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mS\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mVh\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlinalg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msvd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfull_matrices\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 194\u001b[0m \u001b[0meigen_vals_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mS\u001b[0m \u001b[0;34m**\u001b[0m \u001b[0;36m2\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 195\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/miniconda3/lib/python3.6/site-packages/scipy/linalg/decomp_svd.py\u001b[0m in \u001b[0;36msvd\u001b[0;34m(a, full_matrices, compute_uv, overwrite_a, check_finite, lapack_driver)\u001b[0m\n\u001b[1;32m 127\u001b[0m \u001b[0;31m# perform decomposition\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 128\u001b[0m u, s, v, info = gesXd(a1, compute_uv=compute_uv, lwork=lwork,\n\u001b[0;32m--> 129\u001b[0;31m full_matrices=full_matrices, overwrite_a=overwrite_a)\n\u001b[0m\u001b[1;32m 130\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 131\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minfo\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;31mKeyboardInterrupt\u001b[0m: " | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "cv = KFold(n_splits=3, random_state=12, shuffle=True)\n", | |
| "scoring = 'r2'\n", | |
| "scores = cross_val_score(pipeline, df, y, cv=cv, scoring=scoring, n_jobs=-1)\n", | |
| "print('%s score: mean: %.3f; std: %.3f\\n'\n", | |
| " % (scoring, np.mean(scores), np.std(scores)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "pipeline.fit(df, y)\n", | |
| "clf = pipeline.named_steps['clf']\n", | |
| "ct = pipeline.named_steps['preprocess']\n", | |
| "cat = ct.named_transformers_['one-hot'].named_steps['categoricalencoder']\n", | |
| "cat_columns = []\n", | |
| "\n", | |
| "for col, categories in zip(df.columns[one_hot_cols], \n", | |
| " cat.categories_):\n", | |
| " for c in categories:\n", | |
| " cat_columns.append(col + '_' + str(int(c)))\n", | |
| "\n", | |
| "transformed_columns = df.columns[num_cols].tolist() + cat_columns\n", | |
| "coef = pd.Series(clf.coef_, index=transformed_columns)\n", | |
| "coef_sorted = coef[coef.abs().argsort()[::-1]]\n", | |
| "coef_sorted[:30].plot.bar(figsize=(20, 20));" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## MICE with multiple imputations" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.pipeline import make_pipeline\n", | |
| "from sklearn.impute import MICEImputer" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/parietal/jvandenb/scikit-learn/sklearn/model_selection/_validation.py:501: FutureWarning: From version 0.22, errors during fit will result in a cross validation score of NaN by default. Use error_score='raise' if you want an exception raised or error_score=np.nan to adopt the behavior from version 0.22.\n", | |
| " FutureWarning)\n" | |
| ] | |
| }, | |
| { | |
| "ename": "AttributeError", | |
| "evalue": "'NoneType' object has no attribute 'fit'", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", | |
| "\u001b[0;32m<ipython-input-26-f878ea4fb059>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0mcv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mKFold\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mn_splits\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m12\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0mscoring\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'r2'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 44\u001b[0;31m \u001b[0mscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcross_val_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpipeline\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcv\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscoring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 45\u001b[0m print('%s score: mean: %.3f; std: %.3f\\n'\n\u001b[1;32m 46\u001b[0m % (scoring, np.mean(scores), np.std(scores)))\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36mcross_val_score\u001b[0;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)\u001b[0m\n\u001b[1;32m 362\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mn_jobs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mverbose\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 363\u001b[0m \u001b[0mfit_params\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 364\u001b[0;31m pre_dispatch=pre_dispatch)\n\u001b[0m\u001b[1;32m 365\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcv_results\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'test_score'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 366\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36mcross_validate\u001b[0;34m(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator)\u001b[0m\n\u001b[1;32m 217\u001b[0m \u001b[0mfit_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreturn_train_score\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_train_score\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 218\u001b[0m return_times=True, return_estimator=return_estimator)\n\u001b[0;32m--> 219\u001b[0;31m for train, test in cv.split(X, y, groups))\n\u001b[0m\u001b[1;32m 220\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 221\u001b[0m \u001b[0mzipped_scores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mscores\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 777\u001b[0m \u001b[0;31m# was dispatched. In particular this covers the edge\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 778\u001b[0m \u001b[0;31m# case of Parallel used with an exhausted iterator.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 779\u001b[0;31m \u001b[0;32mwhile\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdispatch_one_batch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 780\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_iterating\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 781\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/parallel.py\u001b[0m in \u001b[0;36mdispatch_one_batch\u001b[0;34m(self, iterator)\u001b[0m\n\u001b[1;32m 623\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 624\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 625\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dispatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtasks\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 626\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 627\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/parallel.py\u001b[0m in \u001b[0;36m_dispatch\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 586\u001b[0m \u001b[0mdispatch_timestamp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 587\u001b[0m \u001b[0mcb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBatchCompletionCallBack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdispatch_timestamp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 588\u001b[0;31m \u001b[0mjob\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_backend\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbatch\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 589\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jobs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 590\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36mapply_async\u001b[0;34m(self, func, callback)\u001b[0m\n\u001b[1;32m 109\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[0;34m\"\"\"Schedule a func to be run\"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 111\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mImmediateResult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 112\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/_parallel_backends.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, batch)\u001b[0m\n\u001b[1;32m 330\u001b[0m \u001b[0;31m# Don't delay the application, to avoid keeping the input\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[0;31m# arguments in memory\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 332\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 333\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 334\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 131\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 132\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/externals/joblib/parallel.py\u001b[0m in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 130\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 131\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 132\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 133\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__len__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m/home/parietal/jvandenb/scikit-learn/sklearn/model_selection/_validation.py\u001b[0m in \u001b[0;36m_fit_and_score\u001b[0;34m(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)\u001b[0m\n\u001b[1;32m 485\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 486\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 487\u001b[0;31m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 488\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 489\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;32m<ipython-input-26-f878ea4fb059>\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mn_predictions\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mX_new\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbase_imputer\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbase_classifier\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_new\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'fit'" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from sklearn.base import BaseEstimator\n", | |
| "\n", | |
| "encoder_dict = {\n", | |
| " 'one-hot': CategoricalEncoder(handle_unknown='ignore', encoding='onehot-dense')\n", | |
| " }\n", | |
| "one_hot_cols = df.dtypes == 'category'\n", | |
| "num_cols = ~one_hot_cols\n", | |
| "encoder_type = {\n", | |
| " 'one-hot': one_hot_cols\n", | |
| " }\n", | |
| "\n", | |
| "class MICEMultipleImputerPredictions(BaseEstimator):\n", | |
| " \n", | |
| " def __init__(self, clf, n_predictions = 1):\n", | |
| " self.n_predictions = n_predictions\n", | |
| " self.prediction = None\n", | |
| " self.base_imputer = [MICEImputer(n_imputations=1)] * n_predictions\n", | |
| " self.base_classifier = [clf] * n_predictions\n", | |
| " \n", | |
| " def fit(self, X, y):\n", | |
| " for i in range(self.n_predictions):\n", | |
| " X_new = self.base_imputer[i].fit_transform(X)\n", | |
| " self.base_classifier[i].fit(X_new, y)\n", | |
| " \n", | |
| " def predict(self, X):\n", | |
| " self.prediction = np.zeros(X.shape[0])\n", | |
| " \n", | |
| " for i in range(self.n_predictions):\n", | |
| " X_new = self.base_imputer.transform(X)\n", | |
| " self.prediction = np.add(self.prediction, self.base_classifier[i].predict(X_new))\n", | |
| " \n", | |
| " self.prediction /= n_predictions\n", | |
| " return self.prediction\n", | |
| "\n", | |
| "clf = make_pipeline(\n", | |
| " ColumnTransformer([(e, encoder_dict[e], encoder_type[e]) for e in encoder_type]),\n", | |
| " StandardScaler(with_mean=True),\n", | |
| " RidgeCV()\n", | |
| ")\n", | |
| "pipeline = MICEMultipleImputerPredictions(clf, n_predictions=10)\n", | |
| "\n", | |
| "cv = KFold(n_splits=3, random_state=12, shuffle=True)\n", | |
| "scoring = 'r2'\n", | |
| "scores = cross_val_score(pipeline, df, y, cv=cv, scoring=scoring)\n", | |
| "print('%s score: mean: %.3f; std: %.3f\\n'\n", | |
| " % (scoring, np.mean(scores), np.std(scores)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### Multiple imputation?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "\"\"\"\n", | |
| "The :mod:`sklearn.compose._column_transformer` module implements utilities\n", | |
| "to work with heterogeneous data and to apply different transformers to\n", | |
| "different columns.\n", | |
| "\"\"\"\n", | |
| "# Author: Andreas Mueller\n", | |
| "# Joris Van den Bossche\n", | |
| "# License: BSD\n", | |
| "\n", | |
| "\n", | |
| "import numpy as np\n", | |
| "from scipy import sparse\n", | |
| "\n", | |
| "from sklearn.base import clone, TransformerMixin\n", | |
| "from sklearn.externals.joblib import Parallel, delayed\n", | |
| "from sklearn.externals import six\n", | |
| "from sklearn.pipeline import (\n", | |
| " _fit_one_transformer, _fit_transform_one, _transform_one, _name_estimators)\n", | |
| "from sklearn.preprocessing import FunctionTransformer\n", | |
| "from sklearn.utils import Bunch\n", | |
| "from sklearn.utils.metaestimators import _BaseComposition\n", | |
| "from sklearn.utils.validation import check_is_fitted\n", | |
| "\n", | |
| "\n", | |
| "__all__ = ['ColumnTransformer', 'make_column_transformer']\n", | |
| "\n", | |
| "\n", | |
| "_ERR_MSG_1DCOLUMN = (\"1D data passed to a transformer that expects 2D data. \"\n", | |
| " \"Try to specify the column selection as a list of one \"\n", | |
| " \"item instead of a scalar.\")\n", | |
| "\n", | |
| "\n", | |
| "class ColumnTransformer(_BaseComposition, TransformerMixin):\n", | |
| " \"\"\"Applies transformers to columns of an array or pandas DataFrame.\n", | |
| "\n", | |
| " EXPERIMENTAL: some behaviors may change between releases without\n", | |
| " deprecation.\n", | |
| "\n", | |
| " This estimator allows different columns or column subsets of the input\n", | |
| " to be transformed separately and the results combined into a single\n", | |
| " feature space.\n", | |
| " This is useful for heterogeneous or columnar data, to combine several\n", | |
| " feature extraction mechanisms or transformations into a single transformer.\n", | |
| "\n", | |
| " Read more in the :ref:`User Guide <column_transformer>`.\n", | |
| "\n", | |
| " .. versionadded:: 0.20\n", | |
| "\n", | |
| " Parameters\n", | |
| " ----------\n", | |
| " transformers : list of tuples\n", | |
| " List of (name, transformer, column) tuples specifying the transformer\n", | |
| " objects to be applied to subsets of the data.\n", | |
| "\n", | |
| " name : string\n", | |
| " Like in FeatureUnion and Pipeline, this allows the transformer and\n", | |
| " its parameters to be set using ``set_params`` and searched in grid\n", | |
| " search.\n", | |
| " transformer : estimator or {'passthrough', 'drop'}\n", | |
| " Estimator must support `fit` and `transform`. Special-cased\n", | |
| " strings 'drop' and 'passthrough' are accepted as well, to\n", | |
| " indicate to drop the columns or to pass them through untransformed,\n", | |
| " respectively.\n", | |
| " column : string or int, array-like of string or int, slice or boolean \\\n", | |
| "mask array\n", | |
| " Indexes the data on its second axis. Integers are interpreted as\n", | |
| " positional columns, while strings can reference DataFrame columns\n", | |
| " by name. A scalar string or int should be used where\n", | |
| " ``transformer`` expects X to be a 1d array-like (vector),\n", | |
| " otherwise a 2d array will be passed to the transformer.\n", | |
| "\n", | |
| " unspecified : {'passthrough', 'drop'}, default 'drop'\n", | |
| " By default, only the specified columns in `transformers` are\n", | |
| " transformed and combined in the output (default of ``'drop'``).\n", | |
| " By specifying ``unspecified='passthrough'``, all remaining columns that\n", | |
| " were not specified in `transformers` will be automatically passed\n", | |
| " through. This subset of columns is concatenated with the output of\n", | |
| " the transformers.\n", | |
| "\n", | |
| " n_jobs : int, optional\n", | |
| " Number of jobs to run in parallel (default 1).\n", | |
| "\n", | |
| " transformer_weights : dict, optional\n", | |
| " Multiplicative weights for features per transformer. The output of the\n", | |
| " transformer is multiplied by these weights. Keys are transformer names,\n", | |
| " values the weights.\n", | |
| "\n", | |
| " Attributes\n", | |
| " ----------\n", | |
| " transformers_ : list\n", | |
| " The collection of fitted transformers as tuples of\n", | |
| " (name, fitted_transformer, column).\n", | |
| "\n", | |
| " named_transformers_ : Bunch object, a dictionary with attribute access\n", | |
| " Read-only attribute to access any transformer by given name.\n", | |
| " Keys are transformer names and values are the fitted transformer\n", | |
| " objects.\n", | |
| "\n", | |
| " Notes\n", | |
| " -----\n", | |
| " The order of the columns in the transformed feature matrix follows the\n", | |
| " order of how the columns are specified in the `transformers` list.\n", | |
| " Columns of the original feature matrix that are not specified are\n", | |
| " dropped from the resulting transformed feature matrix, unless specified\n", | |
| " in the `passthrough` keyword. Those columns specified with `passthrough`\n", | |
| " are added at the right to the output of the transformers.\n", | |
| "\n", | |
| " Examples\n", | |
| " --------\n", | |
| " >>> from sklearn.compose import ColumnTransformer\n", | |
| " >>> from sklearn.preprocessing import Normalizer\n", | |
| " >>> ct = ColumnTransformer(\n", | |
| " ... [(\"norm1\", Normalizer(norm='l1'), [0, 1]),\n", | |
| " ... (\"norm2\", Normalizer(norm='l1'), slice(2, 4))])\n", | |
| " >>> X = np.array([[0., 1., 2., 2.],\n", | |
| " ... [1., 1., 0., 1.]])\n", | |
| " >>> # Normalizer scales each row of X to unit norm. Therefore, a separate\n", | |
| " >>> # scaling is applied for the two first and two last elements of each\n", | |
| " >>> # row independently.\n", | |
| " >>> ct.fit_transform(X) # doctest: +NORMALIZE_WHITESPACE\n", | |
| " array([[0. , 1. , 0.5, 0.5],\n", | |
| " [0.5, 0.5, 0. , 1. ]])\n", | |
| "\n", | |
| " \"\"\"\n", | |
| "\n", | |
| " def __init__(self, transformers, unspecified='drop', n_jobs=1,\n", | |
| " transformer_weights=None):\n", | |
| " self.transformers = transformers\n", | |
| " self.unspecified = unspecified\n", | |
| " self.n_jobs = n_jobs\n", | |
| " self.transformer_weights = transformer_weights\n", | |
| "\n", | |
| " @property\n", | |
| " def _transformers(self):\n", | |
| " \"\"\"\n", | |
| " Internal list of transformer only containing the name and\n", | |
| " transformers, dropping the columns. This is for the implementation\n", | |
| " of get_params via BaseComposition._get_params which expects lists\n", | |
| " of tuples of len 2.\n", | |
| " \"\"\"\n", | |
| " return [(name, trans) for name, trans, _ in self.transformers]\n", | |
| "\n", | |
| " @_transformers.setter\n", | |
| " def _transformers(self, value):\n", | |
| " self.transformers = [\n", | |
| " (name, trans, col) for ((name, trans), (_, _, col))\n", | |
| " in zip(value, self.transformers)]\n", | |
| "\n", | |
| " def get_params(self, deep=True):\n", | |
| " \"\"\"Get parameters for this estimator.\n", | |
| "\n", | |
| " Parameters\n", | |
| " ----------\n", | |
| " deep : boolean, optional\n", | |
| " If True, will return the parameters for this estimator and\n", | |
| " contained subobjects that are estimators.\n", | |
| "\n", | |
| " Returns\n", | |
| " -------\n", | |
| " params : mapping of string to any\n", | |
| " Parameter names mapped to their values.\n", | |
| " \"\"\"\n", | |
| " return self._get_params('_transformers', deep=deep)\n", | |
| "\n", | |
| " def set_params(self, **kwargs):\n", | |
| " \"\"\"Set the parameters of this estimator.\n", | |
| "\n", | |
| " Valid parameter keys can be listed with ``get_params()``.\n", | |
| "\n", | |
| " Returns\n", | |
| " -------\n", | |
| " self\n", | |
| " \"\"\"\n", | |
| " self._set_params('_transformers', **kwargs)\n", | |
| " return self\n", | |
| "\n", | |
| " def _iter(self, X=None, fitted=False, replace_strings=False):\n", | |
| " \"\"\"Generate (name, trans, column, weight) tuples\n", | |
| " \"\"\"\n", | |
| " if fitted:\n", | |
| " transformers = self.transformers_\n", | |
| " else:\n", | |
| " transformers = self.transformers\n", | |
| " get_weight = (self.transformer_weights or {}).get\n", | |
| "\n", | |
| " for name, trans, column in transformers:\n", | |
| " if X is None:\n", | |
| " sub = X\n", | |
| " else:\n", | |
| " sub = _get_column(X, column)\n", | |
| "\n", | |
| " if replace_strings:\n", | |
| " # replace 'passthrough' with identity transformer and\n", | |
| " # skip in case of 'drop'\n", | |
| " if trans == 'passthrough':\n", | |
| " trans = FunctionTransformer(\n", | |
| " validate=False, accept_sparse=True,\n", | |
| " check_inverse=False)\n", | |
| " elif trans == 'drop':\n", | |
| " continue\n", | |
| "\n", | |
| " yield (name, trans, sub, get_weight(name))\n", | |
| "\n", | |
| " def _validate_transformers(self):\n", | |
| " names, transformers, _, _ = zip(*self._iter())\n", | |
| "\n", | |
| " # validate names\n", | |
| " self._validate_names(names)\n", | |
| "\n", | |
| " # validate estimators\n", | |
| " for t in transformers:\n", | |
| " if t in ('drop', 'passthrough'):\n", | |
| " continue\n", | |
| " if (not (hasattr(t, \"fit\") or hasattr(t, \"fit_transform\")) or not\n", | |
| " hasattr(t, \"transform\")):\n", | |
| " raise TypeError(\"All estimators should implement fit and \"\n", | |
| " \"transform, or can be 'drop' or 'passthrough' \"\n", | |
| " \"specifiers. '%s' (type %s) doesn't.\" %\n", | |
| " (t, type(t)))\n", | |
| "\n", | |
| " def _validate_unspecified(self, X):\n", | |
| " \"\"\"Generate list of passthrough columns for 'unspecified' case.\n", | |
| " \"\"\"\n", | |
| " if self.unspecified not in ('drop', 'passthrough'):\n", | |
| " raise ValueError(\n", | |
| " \"The unspecified keywords needs to be one of 'drop' or \"\n", | |
| " \"'passthrough'. {0:r} was passed instead\")\n", | |
| "\n", | |
| " n_columns = X.shape[1]\n", | |
| "\n", | |
| " if self.unspecified == 'passthrough':\n", | |
| " cols = []\n", | |
| " for _, _, columns in self.transformers:\n", | |
| " cols.extend(_get_column_indices(X, columns))\n", | |
| " self._passthrough = sorted(list(set(range(n_columns)) - set(cols)))\n", | |
| " else:\n", | |
| " self._passthrough = None\n", | |
| "\n", | |
| " @property\n", | |
| " def named_transformers_(self):\n", | |
| " \"\"\"Access the fitted transformer by name.\n", | |
| "\n", | |
| " Read-only attribute to access any transformer by given name.\n", | |
| " Keys are transformer names and values are the fitted transformer\n", | |
| " objects.\n", | |
| "\n", | |
| " \"\"\"\n", | |
| " # Use Bunch object to improve autocomplete\n", | |
| " return Bunch(**dict([(name, trans) for name, trans, _\n", | |
| " in self.transformers_]))\n", | |
| "\n", | |
| " def get_feature_names(self):\n", | |
| " \"\"\"Get feature names from all transformers.\n", | |
| "\n", | |
| " Returns\n", | |
| " -------\n", | |
| " feature_names : list of strings\n", | |
| " Names of the features produced by transform.\n", | |
| " \"\"\"\n", | |
| " check_is_fitted(self, 'transformers_')\n", | |
| " feature_names = []\n", | |
| " for name, trans, _, _ in self._iter(fitted=True):\n", | |
| " if trans == 'drop':\n", | |
| " continue\n", | |
| " if not hasattr(trans, 'get_feature_names'):\n", | |
| " raise AttributeError(\"Transformer %s (type %s) does not \"\n", | |
| " \"provide get_feature_names.\"\n", | |
| " % (str(name), type(trans).__name__))\n", | |
| " feature_names.extend([name + \"__\" + f for f in\n", | |
| " trans.get_feature_names()])\n", | |
| " return feature_names\n", | |
| "\n", | |
| " def _update_fitted_transformers(self, transformers):\n", | |
| " transformers = iter(transformers)\n", | |
| " transformers_ = []\n", | |
| "\n", | |
| " for name, old, column in self.transformers:\n", | |
| " if old == 'drop':\n", | |
| " trans = old\n", | |
| " elif old == 'passthrough':\n", | |
| " # FunctionTransformer is present in list of transformers,\n", | |
| " # so get next transformer, but save original string\n", | |
| " next(transformers)\n", | |
| " trans = old\n", | |
| " else:\n", | |
| " trans = next(transformers)\n", | |
| "\n", | |
| " transformers_.append((name, trans, column))\n", | |
| "\n", | |
| " self.transformers_ = transformers_\n", | |
| "\n", | |
| " def _fit_transform(self, X, y, func, fitted=False):\n", | |
| " \"\"\"\n", | |
| " Private function to fit and/or transform on demand.\n", | |
| "\n", | |
| " Return value (transformers and/or transformed X data) depends\n", | |
| " on the passed function.\n", | |
| " ``fitted=True`` ensures the fitted transformers are used.\n", | |
| " \"\"\"\n", | |
| " try:\n", | |
| " return Parallel(n_jobs=self.n_jobs)(\n", | |
| " delayed(func)(clone(trans) if not fitted else trans,\n", | |
| " X_sel, y, weight)\n", | |
| " for name, trans, X_sel, weight in self._iter(\n", | |
| " X=X, fitted=fitted, replace_strings=True))\n", | |
| " except ValueError as e:\n", | |
| " if \"Expected 2D array, got 1D array instead\" in str(e):\n", | |
| " raise ValueError(_ERR_MSG_1DCOLUMN)\n", | |
| " else:\n", | |
| " raise\n", | |
| "\n", | |
| " def fit(self, X, y=None):\n", | |
| " \"\"\"Fit all transformers using X.\n", | |
| "\n", | |
| " Parameters\n", | |
| " ----------\n", | |
| " X : array-like or DataFrame of shape [n_samples, n_features]\n", | |
| " Input data, of which specified subsets are used to fit the\n", | |
| " transformers.\n", | |
| "\n", | |
| " y : array-like, shape (n_samples, ...), optional\n", | |
| " Targets for supervised learning.\n", | |
| "\n", | |
| " Returns\n", | |
| " -------\n", | |
| " self : ColumnTransformer\n", | |
| " This estimator\n", | |
| "\n", | |
| " \"\"\"\n", | |
| " self._validate_transformers()\n", | |
| " self._validate_unspecified(X)\n", | |
| "\n", | |
| " transformers = self._fit_transform(X, y, _fit_one_transformer)\n", | |
| "\n", | |
| " self._update_fitted_transformers(transformers)\n", | |
| " return self\n", | |
| "\n", | |
| " def fit_transform(self, X, y=None):\n", | |
| " \"\"\"Fit all transformers, transform the data and concatenate results.\n", | |
| "\n", | |
| " Parameters\n", | |
| " ----------\n", | |
| " X : array-like or DataFrame of shape [n_samples, n_features]\n", | |
| " Input data, of which specified subsets are used to fit the\n", | |
| " transformers.\n", | |
| "\n", | |
| " y : array-like, shape (n_samples, ...), optional\n", | |
| " Targets for supervised learning.\n", | |
| "\n", | |
| " Returns\n", | |
| " -------\n", | |
| " X_t : array-like or sparse matrix, shape (n_samples, sum_n_components)\n", | |
| " hstack of results of transformers. sum_n_components is the\n", | |
| " sum of n_components (output dimension) over transformers. If\n", | |
| " any result is a sparse matrix, everything will be converted to\n", | |
| " sparse matrices.\n", | |
| "\n", | |
| " \"\"\"\n", | |
| " self._validate_transformers()\n", | |
| " self._validate_unspecified(X)\n", | |
| "\n", | |
| " result = self._fit_transform(X, y, _fit_transform_one)\n", | |
| "\n", | |
| " if not result:\n", | |
| " # All transformers are None\n", | |
| " if self._passthrough is None:\n", | |
| " return np.zeros((X.shape[0], 0))\n", | |
| " else:\n", | |
| " return _get_column(X, self._passthrough)\n", | |
| "\n", | |
| " Xs, transformers = zip(*result)\n", | |
| "\n", | |
| " self._update_fitted_transformers(transformers)\n", | |
| "\n", | |
| " if self._passthrough is not None:\n", | |
| " Xs = list(Xs) + [_get_column(X, self._passthrough)]\n", | |
| "\n", | |
| " if any(sparse.issparse(f) for f in Xs):\n", | |
| " Xs = sparse.hstack(Xs).tocsr()\n", | |
| " else:\n", | |
| " Xs = np.hstack(Xs)\n", | |
| " return Xs\n", | |
| "\n", | |
| " def transform(self, X):\n", | |
| " \"\"\"Transform X separately by each transformer, concatenate results.\n", | |
| "\n", | |
| " Parameters\n", | |
| " ----------\n", | |
| " X : array-like or DataFrame of shape [n_samples, n_features]\n", | |
| " The data to be transformed by subset.\n", | |
| "\n", | |
| " Returns\n", | |
| " -------\n", | |
| " X_t : array-like or sparse matrix, shape (n_samples, sum_n_components)\n", | |
| " hstack of results of transformers. sum_n_components is the\n", | |
| " sum of n_components (output dimension) over transformers. If\n", | |
| " any result is a sparse matrix, everything will be converted to\n", | |
| " sparse matrices.\n", | |
| "\n", | |
| " \"\"\"\n", | |
| " check_is_fitted(self, 'transformers_')\n", | |
| "\n", | |
| " Xs = self._fit_transform(X, None, _transform_one, fitted=True)\n", | |
| "\n", | |
| " if not Xs:\n", | |
| " # All transformers are None\n", | |
| " if self._passthrough is None:\n", | |
| " return np.zeros((X.shape[0], 0))\n", | |
| " else:\n", | |
| " return _get_column(X, self._passthrough)\n", | |
| "\n", | |
| " if self._passthrough is not None:\n", | |
| " Xs = list(Xs) + [_get_column(X, self._passthrough)]\n", | |
| "\n", | |
| " if any(sparse.issparse(f) for f in Xs):\n", | |
| " Xs = sparse.hstack(Xs).tocsr()\n", | |
| " else:\n", | |
| " Xs = np.hstack(Xs)\n", | |
| " return Xs\n", | |
| "\n", | |
| "\n", | |
| "def _check_key_type(key, superclass):\n", | |
| " \"\"\"\n", | |
| " Check that scalar, list or slice is of certain type.\n", | |
| "\n", | |
| " \"\"\"\n", | |
| " if isinstance(key, superclass):\n", | |
| " return True\n", | |
| " if isinstance(key, slice):\n", | |
| " return (isinstance(key.start, (superclass, type(None))) and\n", | |
| " isinstance(key.stop, (superclass, type(None))))\n", | |
| " if isinstance(key, list):\n", | |
| " return all(isinstance(x, superclass) for x in key)\n", | |
| " if hasattr(key, 'dtype'):\n", | |
| " return key.dtype.kind == 'i'\n", | |
| " return False\n", | |
| "\n", | |
| "\n", | |
| "def _get_column(X, key):\n", | |
| " \"\"\"\n", | |
| " Get feature column(s) from input data X.\n", | |
| "\n", | |
| " Supported input types (X): numpy arrays, sparse arrays and DataFrames\n", | |
| "\n", | |
| " Supported key types (key):\n", | |
| " - scalar: output is 1D\n", | |
| " - lists, slices, boolean masks: output is 2D\n", | |
| "\n", | |
| " Supported key data types:\n", | |
| "\n", | |
| " - integer or boolean mask (positional):\n", | |
| " - supported for arrays, sparse matrices and dataframes\n", | |
| " - string (key-based):\n", | |
| " - only supported for dataframes\n", | |
| " - So no keys other than strings are allowed (while in principle you\n", | |
| " can use any hashable object as key).\n", | |
| "\n", | |
| " \"\"\"\n", | |
| " # check whether we have string column names or integers\n", | |
| " if _check_key_type(key, int):\n", | |
| " column_names = False\n", | |
| " elif _check_key_type(key, six.string_types):\n", | |
| " column_names = True\n", | |
| " elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool):\n", | |
| " # boolean mask\n", | |
| " column_names = False\n", | |
| " if hasattr(X, 'loc'):\n", | |
| " # pandas boolean masks don't work with iloc, so take loc path\n", | |
| " column_names = True\n", | |
| " else:\n", | |
| " raise ValueError(\"No valid specification of the columns. Only a \"\n", | |
| " \"scalar, list or slice of all integers or all \"\n", | |
| " \"strings, or boolean mask is allowed\")\n", | |
| "\n", | |
| " if column_names:\n", | |
| " if hasattr(X, 'loc'):\n", | |
| " # pandas dataframes\n", | |
| " return X.loc[:, key]\n", | |
| " else:\n", | |
| " raise ValueError(\"Specifying the columns using strings is only \"\n", | |
| " \"supported for pandas DataFrames\")\n", | |
| " else:\n", | |
| " if hasattr(X, 'iloc'):\n", | |
| " # pandas dataframes\n", | |
| " return X.iloc[:, key]\n", | |
| " else:\n", | |
| " # numpy arrays, sparse arrays\n", | |
| " return X[:, key]\n", | |
| "\n", | |
| "\n", | |
| "def _get_column_indices(X, key):\n", | |
| " \"\"\"\n", | |
| " Get feature column indices for input data X and key.\n", | |
| "\n", | |
| " For accepted values of `key`, see the docstring of _get_column\n", | |
| "\n", | |
| " \"\"\"\n", | |
| " n_columns = X.shape[1]\n", | |
| "\n", | |
| " if _check_key_type(key, int):\n", | |
| " if isinstance(key, int):\n", | |
| " return [key]\n", | |
| " elif isinstance(key, slice):\n", | |
| " return list(range(n_columns)[key])\n", | |
| " else:\n", | |
| " return list(key)\n", | |
| "\n", | |
| " elif _check_key_type(key, six.string_types):\n", | |
| " all_columns = list(X.columns)\n", | |
| " if isinstance(key, six.string_types):\n", | |
| " columns = [key]\n", | |
| " elif isinstance(key, slice):\n", | |
| " start, stop = key.start, key.stop\n", | |
| " if start is not None:\n", | |
| " start = all_columns.index(start)\n", | |
| " if stop is not None:\n", | |
| " # pandas indexing with strings is endpoint included\n", | |
| " stop = all_columns.index(stop) + 1\n", | |
| " else:\n", | |
| " stop = n_columns + 1\n", | |
| " return list(range(n_columns)[slice(start, stop)])\n", | |
| " else:\n", | |
| " columns = list(key)\n", | |
| "\n", | |
| " return [all_columns.index(col) for col in columns]\n", | |
| "\n", | |
| " elif hasattr(key, 'dtype') and np.issubdtype(key.dtype, np.bool):\n", | |
| " # boolean mask\n", | |
| " return list(np.arange(n_columns)[key])\n", | |
| " else:\n", | |
| " raise ValueError(\"No valid specification of the columns. Only a \"\n", | |
| " \"scalar, list or slice of all integers or all \"\n", | |
| " \"strings, or boolean mask is allowed\")\n", | |
| "\n", | |
| "\n", | |
| "def _get_transformer_list(estimators):\n", | |
| " \"\"\"\n", | |
| " Construct (name, trans, column) tuples from list\n", | |
| "\n", | |
| " \"\"\"\n", | |
| " transformers = [trans[1] for trans in estimators]\n", | |
| " columns = [trans[0] for trans in estimators]\n", | |
| " names = [trans[0] for trans in _name_estimators(transformers)]\n", | |
| "\n", | |
| " transformer_list = list(zip(names, transformers, columns))\n", | |
| " return transformer_list\n", | |
| "\n", | |
| "\n", | |
| "def make_column_transformer(*transformers, **kwargs):\n", | |
| " \"\"\"Construct a ColumnTransformer from the given transformers.\n", | |
| "\n", | |
| " This is a shorthand for the ColumnTransformer constructor; it does not\n", | |
| " require, and does not permit, naming the transformers. Instead, they will\n", | |
| " be given names automatically based on their types. It also does not allow\n", | |
| " weighting.\n", | |
| "\n", | |
| " Parameters\n", | |
| " ----------\n", | |
| " *transformers : tuples of column selections and transformers\n", | |
| "\n", | |
| " n_jobs : int, optional\n", | |
| " Number of jobs to run in parallel (default 1).\n", | |
| "\n", | |
| " Returns\n", | |
| " -------\n", | |
| " ct : ColumnTransformer\n", | |
| "\n", | |
| " Examples\n", | |
| " --------\n", | |
| " >>> from sklearn.preprocessing import StandardScaler, CategoricalEncoder\n", | |
| " >>> from sklearn.compose import make_column_transformer\n", | |
| " >>> make_column_transformer(\n", | |
| " ... (['numerical_column'], StandardScaler()),\n", | |
| " ... (['categorical_column'], CategoricalEncoder()))\n", | |
| " ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS\n", | |
| " ColumnTransformer(n_jobs=1, transformer_weights=None,\n", | |
| " transformers=[('standardscaler',\n", | |
| " StandardScaler(...),\n", | |
| " ['numerical_column']),\n", | |
| " ('categoricalencoder',\n", | |
| " CategoricalEncoder(...),\n", | |
| " ['categorical_column'])],\n", | |
| " unspecified='drop')\n", | |
| "\n", | |
| " \"\"\"\n", | |
| " n_jobs = kwargs.pop('n_jobs', 1)\n", | |
| " if kwargs:\n", | |
| " raise TypeError('Unknown keyword arguments: \"{}\"'\n", | |
| " .format(list(kwargs.keys())[0]))\n", | |
| " transformer_list = _get_transformer_list(transformers)\n", | |
| " return ColumnTransformer(transformer_list, n_jobs=n_jobs)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.0" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment