fonnesbeck · September 24, 2018 21:41
diff --git a/Control Sampling.ipynb b/Control Sampling.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import data from REDCap"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from redcap import Project\n",
    "\n",
    "api_url = 'https://redcap.vanderbilt.edu/api/'\n",
    "\n",
    "newborns_key = open(\"/Users/fonnescj/Dropbox/Halasa Biostats/newborns_api_key.txt\").read()\n",
    "newborns_proj = Project(api_url, newborns_key)\n",
    "newborns = newborns_proj.export_records(format='df', df_kwargs={'index_col': newborns_proj.field_names[0]})\n",
    "\n",
    "hospitalized_key = open(\"/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt\").read()\n",
    "hospitalized_proj = Project(api_url, hospitalized_key)\n",
    "hospitalized = hospitalized_proj.export_records(format='df', df_kwargs={'index_col': hospitalized_proj.field_names[0]})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "hospitalized_key = open(\"/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt\").read()\n",
    "hospitalized_proj = Project(api_url, hospitalized_key)\n",
    "hospitalized_proj.metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(14093, 125)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newborns.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3175, 338)"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospitalized.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Hospitalized with missing ID numbers:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index([u'D3543'], dtype=object)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospitalized[(hospitalized.newborn_id==1) & (hospitalized.newborn_id_number.isnull())].index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create `season` variable for newborns:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "case_id\n",
       "A0001      2010-02-02\n",
       "A0002      2010-01-29\n",
       "A0003      2009-03-28\n",
       "Name: child_birth_date, dtype: object"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospitalized.child_birth_date[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "case_id_newborn\n",
       "B1098              2010-11-22\n",
       "B1545              2010-06-27\n",
       "C2779              2010-12-20\n",
       "Name: birth_date_newborn, dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newborns.birth_date_newborn[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#hospitalized['birth_date'] = pd.to_datetime(hospitalized.child_birth_date, errors='raise', format='%Y-%m-%d')\n",
    "hospitalized['birth_date'] = [pd.to_datetime(d) for d in hospitalized.child_birth_date]\n",
    "#newborns['birth_date'] = pd.to_datetime(newborns.birth_date_newborn, errors='raise')\n",
    "newborns['birth_date'] = [pd.to_datetime(d) for d in newborns.birth_date_newborn]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count                    3175\n",
       "unique                   1185\n",
       "first     2008-05-20 00:00:00\n",
       "last      2013-12-20 00:00:00\n",
       "top       2011-12-14 00:00:00\n",
       "freq                       13\n",
       "dtype: object"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospitalized.birth_date.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count                   14090\n",
       "unique                    967\n",
       "first     2010-01-29 00:00:00\n",
       "last      2013-01-31 00:00:00\n",
       "top       2011-08-24 00:00:00\n",
       "freq                       38\n",
       "dtype: object"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newborns.birth_date.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Identify and remove bogus dates:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Series([], dtype: datetime64[ns])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospitalized.birth_date[[type(d)!=pd.tslib.Timestamp for d in hospitalized.birth_date]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "case_id_newborn\n",
       "B1098             2010-11-22 00:00:00\n",
       "B1545             2010-06-27 00:00:00\n",
       "C2779             2010-12-20 00:00:00\n",
       "E`10602           2012-06-11 00:00:00\n",
       "E0001             2010-01-30 00:00:00\n",
       "E0002             2010-01-29 00:00:00\n",
       "E0003             2010-01-30 00:00:00\n",
       "E0004             2010-01-30 00:00:00\n",
       "E0005             2010-01-31 00:00:00\n",
       "E0006             2010-01-31 00:00:00\n",
       "E0007             2010-02-01 00:00:00\n",
       "E0008             2010-02-01 00:00:00\n",
       "E0009             2010-02-02 00:00:00\n",
       "E0010             2010-02-02 00:00:00\n",
       "E0011             2010-02-06 00:00:00\n",
       "...\n",
       "H7985             2011-10-24 00:00:00\n",
       "H7986             2011-10-24 00:00:00\n",
       "H7987             2011-10-24 00:00:00\n",
       "H7988             2011-10-24 00:00:00\n",
       "H7989             2011-10-25 00:00:00\n",
       "H7990             2011-10-25 00:00:00\n",
       "H7991             2011-10-25 00:00:00\n",
       "H7992             2011-10-25 00:00:00\n",
       "H7993             2011-10-25 00:00:00\n",
       "H7994             2011-10-25 00:00:00\n",
       "H7995             2011-10-25 00:00:00\n",
       "H7996             2011-10-27 00:00:00\n",
       "H7997             2011-10-26 00:00:00\n",
       "H7998             2011-10-26 00:00:00\n",
       "H7999             2011-10-26 00:00:00\n",
       "Name: birth_date, Length: 14093, dtype: datetime64[ns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newborns.birth_date[[type(d)!=datetime.datetime for d in newborns.birth_date]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(Timestamp('2008-05-20 00:00:00', tz=None),\n",
       " Timestamp('2013-12-20 00:00:00', tz=None))"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "min(hospitalized.birth_date[hospitalized.birth_date.notnull()]), \\\n",
    "max(hospitalized.birth_date[hospitalized.birth_date.notnull()])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(Timestamp('2010-01-29 00:00:00', tz=None),\n",
       " Timestamp('2013-01-31 00:00:00', tz=None))"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newborns.birth_date[newborns.birth_date.notnull()].min(), \\\n",
    "newborns.birth_date[newborns.birth_date.notnull()].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(Timestamp('2010-01-29 00:00:00', tz=None),\n",
       " Timestamp('2013-01-31 00:00:00', tz=None))"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newborns.birth_date[newborns.birth_date.notnull()].min(), \\\n",
    "newborns.birth_date[newborns.birth_date.notnull()].max()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Remove records with null birth dates:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "newborns_complete = newborns[newborns.birth_date.notnull()]\n",
    "hospitalized_complete = hospitalized[hospitalized.birth_date.notnull() & hospitalized.newborn_id_number.notnull()]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create season-by-year variable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "newborns_complete['birth_year_season'] = [(b.year, (b.month in [12,1,2])*1 or (b.month in [3,4,5])*2 or \\\n",
    "    (b.month in [6,7,8])*3 or 4) for b in newborns_complete.birth_date]\n",
    "hospitalized_complete['birth_year_season'] = [(b.year, (b.month in [12,1,2])*1 or (b.month in [3,4,5])*2 or \\\n",
    "    (b.month in [6,7,8])*3 or 4) for b in hospitalized_complete.birth_date]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "List of all study seasons:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(2010, 1),\n",
       " (2010, 2),\n",
       " (2010, 3),\n",
       " (2010, 4),\n",
       " (2011, 1),\n",
       " (2011, 2),\n",
       " (2011, 3),\n",
       " (2011, 4),\n",
       " (2012, 1),\n",
       " (2012, 2),\n",
       " (2012, 3),\n",
       " (2012, 4),\n",
       " (2013, 1)]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "study_seasons = [(y,s) for y in (2010, 2011, 2012) for s in (1,2,3,4)] + [(2013,1)]\n",
    "study_seasons"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Counts of season-by-year entries for both tables:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2012, 4)    70\n",
       "(2012, 1)    53\n",
       "(2011, 4)    53\n",
       "(2011, 1)    49\n",
       "(2011, 3)    45\n",
       "(2012, 2)    38\n",
       "(2012, 3)    34\n",
       "(2011, 2)    27\n",
       "(2010, 2)    18\n",
       "(2010, 4)    16\n",
       "(2010, 3)    15\n",
       "(2010, 1)    14\n",
       "(2013, 1)     6\n",
       "dtype: int64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hospitalized_counts = hospitalized_complete.birth_year_season.value_counts()\n",
    "hospitalized_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2012, 4)    1483\n",
       "(2011, 3)    1454\n",
       "(2011, 4)    1451\n",
       "(2012, 1)    1377\n",
       "(2012, 2)    1291\n",
       "(2012, 3)    1126\n",
       "(2011, 1)    1076\n",
       "(2011, 2)    1031\n",
       "(2010, 2)    1009\n",
       "(2010, 4)     879\n",
       "(2010, 3)     866\n",
       "(2010, 1)     691\n",
       "(2013, 1)     356\n",
       "dtype: int64"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newborn_counts = newborns_complete.birth_year_season.value_counts()\n",
    "newborn_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "samples = []\n",
    "k = 10\n",
    "\n",
    "for ys in study_seasons:\n",
    "    # Get number of cases\n",
    "    n = hospitalized_counts[ys]\n",
    "    # Randomize list of all corresponding newborns\n",
    "    newborns_perm = np.random.permutation(newborns_complete.index[newborns_complete.birth_year_season==ys].tolist())\n",
    "    # Select 3 newborns for each case\n",
    "    sample = newborns_perm[:(n*k)]\n",
    "    # Verify that we got enough samples\n",
    "    if len(sample)!=(n*k):\n",
    "        print 'Only {0} newborns in {1} when {2} were required'.format(len(sample), ys, n*k)\n",
    "    samples += np.reshape(sample, (n,k)).tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "bogus_samples = pd.DataFrame(samples)\n",
    "try_again = bogus_samples.ix[[45 , 73 , 119 , 122 , 137 , 158 , 174 , 198],[0,1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(samples)==(len(hospitalized_complete[[ys in study_seasons for ys in hospitalized_complete.birth_year_season]]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Test that all samples are unique (no double sampling):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(np.unique(np.ravel(samples))) == len(np.ravel(samples))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "samples_df = pd.DataFrame(samples)\n",
    "#samples_df.to_csv('newborn_samples.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Resample for cases that could not get a control:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td> H3818</td>\n",
       "      <td> E0070</td>\n",
       "      <td>  G2743</td>\n",
       "      <td> F5003</td>\n",
       "      <td> F1039</td>\n",
       "      <td> G2053</td>\n",
       "      <td> G2779</td>\n",
       "      <td> G2785</td>\n",
       "      <td> H3005</td>\n",
       "      <td> E0065</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td> G2044</td>\n",
       "      <td> H3082</td>\n",
       "      <td>  G2786</td>\n",
       "      <td> E0629</td>\n",
       "      <td> G2041</td>\n",
       "      <td> H3073</td>\n",
       "      <td> F1038</td>\n",
       "      <td> H3835</td>\n",
       "      <td> G2071</td>\n",
       "      <td> F1050</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td> G2046</td>\n",
       "      <td> G2759</td>\n",
       "      <td> F 1053</td>\n",
       "      <td> H3074</td>\n",
       "      <td> H3811</td>\n",
       "      <td> E0078</td>\n",
       "      <td> E0049</td>\n",
       "      <td> H3805</td>\n",
       "      <td> G2774</td>\n",
       "      <td> F5057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td> F5052</td>\n",
       "      <td> G2063</td>\n",
       "      <td>  F5022</td>\n",
       "      <td> G2013</td>\n",
       "      <td> E0636</td>\n",
       "      <td> E0674</td>\n",
       "      <td> H3053</td>\n",
       "      <td> F5000</td>\n",
       "      <td> E0086</td>\n",
       "      <td> F1057</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td> H3841</td>\n",
       "      <td> G2765</td>\n",
       "      <td>  E0029</td>\n",
       "      <td> G2011</td>\n",
       "      <td> E0056</td>\n",
       "      <td> E0055</td>\n",
       "      <td> F5011</td>\n",
       "      <td> F1076</td>\n",
       "      <td> G2748</td>\n",
       "      <td> F5025</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       0      1       2      3      4      5      6      7      8      9\n",
       "0  H3818  E0070   G2743  F5003  F1039  G2053  G2779  G2785  H3005  E0065\n",
       "1  G2044  H3082   G2786  E0629  G2041  H3073  F1038  H3835  G2071  F1050\n",
       "2  G2046  G2759  F 1053  H3074  H3811  E0078  E0049  H3805  G2774  F5057\n",
       "3  F5052  G2063   F5022  G2013  E0636  E0674  H3053  F5000  E0086  F1057\n",
       "4  H3841  G2765   E0029  G2011  E0056  E0055  F5011  F1076  G2748  F5025"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "samples = pd.read_csv('newborn_samples.csv', index_col=0)\n",
    "samples.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['H3818', 'E0070', 'G2743', ..., 'F15078', 'F15126', 'E14412'], dtype=object)"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "exclude = np.array(samples).ravel()\n",
    "exclude"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "try_again = [tuple(i) for i in np.array(try_again, int)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "exclude = [i for i in exclude if i in newborns_complete.index]\n",
    "newborns_remainder = newborns_complete.drop(exclude)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "new_samples = []\n",
    "k = 10\n",
    "\n",
    "for ys in try_again:\n",
    "    # Get number of cases\n",
    "    n = 1\n",
    "    # Randomize list of all corresponding newborns\n",
    "    newborns_perm = np.random.permutation(newborns_remainder.index[newborns_remainder.birth_year_season==ys].tolist())\n",
    "    # Select k newborns for each case\n",
    "    sample = newborns_perm[:(n*k)]\n",
    "    # Verify that we got enough samples\n",
    "    if len(sample)!=(n*k):\n",
    "        print 'Only {0} newborns in {1} when {2} were required'.format(len(sample), ys, n*k)\n",
    "    new_samples += np.reshape(sample, (n,k)).tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(np.unique(np.ravel(new_samples))) == len(np.ravel(new_samples))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "new_samples_df = pd.DataFrame(new_samples)\n",
    "#new_samples_df.to_csv('replacement_newborn_samples.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Import data from REDCap"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"from redcap import Project\n",
	"\n",
	"api_url = 'https://redcap.vanderbilt.edu/api/'\n",
	"\n",
	"newborns_key = open(\"/Users/fonnescj/Dropbox/Halasa Biostats/newborns_api_key.txt\").read()\n",
	"newborns_proj = Project(api_url, newborns_key)\n",
	"newborns = newborns_proj.export_records(format='df', df_kwargs={'index_col': newborns_proj.field_names[0]})\n",
	"\n",
	"hospitalized_key = open(\"/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt\").read()\n",
	"hospitalized_proj = Project(api_url, hospitalized_key)\n",
	"hospitalized = hospitalized_proj.export_records(format='df', df_kwargs={'index_col': hospitalized_proj.field_names[0]})"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"hospitalized_key = open(\"/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt\").read()\n",
	"hospitalized_proj = Project(api_url, hospitalized_key)\n",
	"hospitalized_proj.metadata"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(14093, 125)"
	]
	},
	"execution_count": 2,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"newborns.shape"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(3175, 338)"
	]
	},
	"execution_count": 3,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"hospitalized.shape"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Hospitalized with missing ID numbers:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"Index([u'D3543'], dtype=object)"
	]
	},
	"execution_count": 4,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"hospitalized[(hospitalized.newborn_id==1) & (hospitalized.newborn_id_number.isnull())].index"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Create `season` variable for newborns:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"case_id\n",
	"A0001 2010-02-02\n",
	"A0002 2010-01-29\n",
	"A0003 2009-03-28\n",
	"Name: child_birth_date, dtype: object"
	]
	},
	"execution_count": 5,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"hospitalized.child_birth_date[:3]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"case_id_newborn\n",
	"B1098 2010-11-22\n",
	"B1545 2010-06-27\n",
	"C2779 2010-12-20\n",
	"Name: birth_date_newborn, dtype: object"
	]
	},
	"execution_count": 6,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"newborns.birth_date_newborn[:3]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"#hospitalized['birth_date'] = pd.to_datetime(hospitalized.child_birth_date, errors='raise', format='%Y-%m-%d')\n",
	"hospitalized['birth_date'] = [pd.to_datetime(d) for d in hospitalized.child_birth_date]\n",
	"#newborns['birth_date'] = pd.to_datetime(newborns.birth_date_newborn, errors='raise')\n",
	"newborns['birth_date'] = [pd.to_datetime(d) for d in newborns.birth_date_newborn]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"count 3175\n",
	"unique 1185\n",
	"first 2008-05-20 00:00:00\n",
	"last 2013-12-20 00:00:00\n",
	"top 2011-12-14 00:00:00\n",
	"freq 13\n",
	"dtype: object"
	]
	},
	"execution_count": 8,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"hospitalized.birth_date.describe()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"count 14090\n",
	"unique 967\n",
	"first 2010-01-29 00:00:00\n",
	"last 2013-01-31 00:00:00\n",
	"top 2011-08-24 00:00:00\n",
	"freq 38\n",
	"dtype: object"
	]
	},
	"execution_count": 9,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"newborns.birth_date.describe()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Identify and remove bogus dates:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"Series([], dtype: datetime64[ns])"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"hospitalized.birth_date[[type(d)!=pd.tslib.Timestamp for d in hospitalized.birth_date]]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"case_id_newborn\n",
	"B1098 2010-11-22 00:00:00\n",
	"B1545 2010-06-27 00:00:00\n",
	"C2779 2010-12-20 00:00:00\n",
	"E`10602 2012-06-11 00:00:00\n",
	"E0001 2010-01-30 00:00:00\n",
	"E0002 2010-01-29 00:00:00\n",
	"E0003 2010-01-30 00:00:00\n",
	"E0004 2010-01-30 00:00:00\n",
	"E0005 2010-01-31 00:00:00\n",
	"E0006 2010-01-31 00:00:00\n",
	"E0007 2010-02-01 00:00:00\n",
	"E0008 2010-02-01 00:00:00\n",
	"E0009 2010-02-02 00:00:00\n",
	"E0010 2010-02-02 00:00:00\n",
	"E0011 2010-02-06 00:00:00\n",
	"...\n",
	"H7985 2011-10-24 00:00:00\n",
	"H7986 2011-10-24 00:00:00\n",
	"H7987 2011-10-24 00:00:00\n",
	"H7988 2011-10-24 00:00:00\n",
	"H7989 2011-10-25 00:00:00\n",
	"H7990 2011-10-25 00:00:00\n",
	"H7991 2011-10-25 00:00:00\n",
	"H7992 2011-10-25 00:00:00\n",
	"H7993 2011-10-25 00:00:00\n",
	"H7994 2011-10-25 00:00:00\n",
	"H7995 2011-10-25 00:00:00\n",
	"H7996 2011-10-27 00:00:00\n",
	"H7997 2011-10-26 00:00:00\n",
	"H7998 2011-10-26 00:00:00\n",
	"H7999 2011-10-26 00:00:00\n",
	"Name: birth_date, Length: 14093, dtype: datetime64[ns]"
	]
	},
	"execution_count": 11,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"newborns.birth_date[[type(d)!=datetime.datetime for d in newborns.birth_date]]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(Timestamp('2008-05-20 00:00:00', tz=None),\n",
	" Timestamp('2013-12-20 00:00:00', tz=None))"
	]
	},
	"execution_count": 12,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"min(hospitalized.birth_date[hospitalized.birth_date.notnull()]), \\\n",
	"max(hospitalized.birth_date[hospitalized.birth_date.notnull()])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(Timestamp('2010-01-29 00:00:00', tz=None),\n",
	" Timestamp('2013-01-31 00:00:00', tz=None))"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"newborns.birth_date[newborns.birth_date.notnull()].min(), \\\n",
	"newborns.birth_date[newborns.birth_date.notnull()].max()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(Timestamp('2010-01-29 00:00:00', tz=None),\n",
	" Timestamp('2013-01-31 00:00:00', tz=None))"
	]
	},
	"execution_count": 16,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"newborns.birth_date[newborns.birth_date.notnull()].min(), \\\n",
	"newborns.birth_date[newborns.birth_date.notnull()].max()"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Remove records with null birth dates:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"newborns_complete = newborns[newborns.birth_date.notnull()]\n",
	"hospitalized_complete = hospitalized[hospitalized.birth_date.notnull() & hospitalized.newborn_id_number.notnull()]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Create season-by-year variable"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"newborns_complete['birth_year_season'] = [(b.year, (b.month in [12,1,2])1 or (b.month in [3,4,5])2 or \\\n",
	" (b.month in [6,7,8])*3 or 4) for b in newborns_complete.birth_date]\n",
	"hospitalized_complete['birth_year_season'] = [(b.year, (b.month in [12,1,2])1 or (b.month in [3,4,5])2 or \\\n",
	" (b.month in [6,7,8])*3 or 4) for b in hospitalized_complete.birth_date]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"List of all study seasons:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"[(2010, 1),\n",
	" (2010, 2),\n",
	" (2010, 3),\n",
	" (2010, 4),\n",
	" (2011, 1),\n",
	" (2011, 2),\n",
	" (2011, 3),\n",
	" (2011, 4),\n",
	" (2012, 1),\n",
	" (2012, 2),\n",
	" (2012, 3),\n",
	" (2012, 4),\n",
	" (2013, 1)]"
	]
	},
	"execution_count": 19,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"study_seasons = [(y,s) for y in (2010, 2011, 2012) for s in (1,2,3,4)] + [(2013,1)]\n",
	"study_seasons"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Counts of season-by-year entries for both tables:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 20,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(2012, 4) 70\n",
	"(2012, 1) 53\n",
	"(2011, 4) 53\n",
	"(2011, 1) 49\n",
	"(2011, 3) 45\n",
	"(2012, 2) 38\n",
	"(2012, 3) 34\n",
	"(2011, 2) 27\n",
	"(2010, 2) 18\n",
	"(2010, 4) 16\n",
	"(2010, 3) 15\n",
	"(2010, 1) 14\n",
	"(2013, 1) 6\n",
	"dtype: int64"
	]
	},
	"execution_count": 20,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"hospitalized_counts = hospitalized_complete.birth_year_season.value_counts()\n",
	"hospitalized_counts"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"(2012, 4) 1483\n",
	"(2011, 3) 1454\n",
	"(2011, 4) 1451\n",
	"(2012, 1) 1377\n",
	"(2012, 2) 1291\n",
	"(2012, 3) 1126\n",
	"(2011, 1) 1076\n",
	"(2011, 2) 1031\n",
	"(2010, 2) 1009\n",
	"(2010, 4) 879\n",
	"(2010, 3) 866\n",
	"(2010, 1) 691\n",
	"(2013, 1) 356\n",
	"dtype: int64"
	]
	},
	"execution_count": 21,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"newborn_counts = newborns_complete.birth_year_season.value_counts()\n",
	"newborn_counts"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 114,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"samples = []\n",
	"k = 10\n",
	"\n",
	"for ys in study_seasons:\n",
	" # Get number of cases\n",
	" n = hospitalized_counts[ys]\n",
	" # Randomize list of all corresponding newborns\n",
	" newborns_perm = np.random.permutation(newborns_complete.index[newborns_complete.birth_year_season==ys].tolist())\n",
	" # Select 3 newborns for each case\n",
	" sample = newborns_perm[:(n*k)]\n",
	" # Verify that we got enough samples\n",
	" if len(sample)!=(n*k):\n",
	" print 'Only {0} newborns in {1} when {2} were required'.format(len(sample), ys, n*k)\n",
	" samples += np.reshape(sample, (n,k)).tolist()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 42,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"bogus_samples = pd.DataFrame(samples)\n",
	"try_again = bogus_samples.ix[[45 , 73 , 119 , 122 , 137 , 158 , 174 , 198],[0,1]]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 23,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(samples)==(len(hospitalized_complete[[ys in study_seasons for ys in hospitalized_complete.birth_year_season]]))"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Test that all samples are unique (no double sampling):"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 24,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(np.unique(np.ravel(samples))) == len(np.ravel(samples))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"samples_df = pd.DataFrame(samples)\n",
	"#samples_df.to_csv('newborn_samples.csv')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Resample for cases that could not get a control:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 69,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>0</th>\n",
	" <th>1</th>\n",
	" <th>2</th>\n",
	" <th>3</th>\n",
	" <th>4</th>\n",
	" <th>5</th>\n",
	" <th>6</th>\n",
	" <th>7</th>\n",
	" <th>8</th>\n",
	" <th>9</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td> H3818</td>\n",
	" <td> E0070</td>\n",
	" <td> G2743</td>\n",
	" <td> F5003</td>\n",
	" <td> F1039</td>\n",
	" <td> G2053</td>\n",
	" <td> G2779</td>\n",
	" <td> G2785</td>\n",
	" <td> H3005</td>\n",
	" <td> E0065</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td> G2044</td>\n",
	" <td> H3082</td>\n",
	" <td> G2786</td>\n",
	" <td> E0629</td>\n",
	" <td> G2041</td>\n",
	" <td> H3073</td>\n",
	" <td> F1038</td>\n",
	" <td> H3835</td>\n",
	" <td> G2071</td>\n",
	" <td> F1050</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td> G2046</td>\n",
	" <td> G2759</td>\n",
	" <td> F 1053</td>\n",
	" <td> H3074</td>\n",
	" <td> H3811</td>\n",
	" <td> E0078</td>\n",
	" <td> E0049</td>\n",
	" <td> H3805</td>\n",
	" <td> G2774</td>\n",
	" <td> F5057</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td> F5052</td>\n",
	" <td> G2063</td>\n",
	" <td> F5022</td>\n",
	" <td> G2013</td>\n",
	" <td> E0636</td>\n",
	" <td> E0674</td>\n",
	" <td> H3053</td>\n",
	" <td> F5000</td>\n",
	" <td> E0086</td>\n",
	" <td> F1057</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td> H3841</td>\n",
	" <td> G2765</td>\n",
	" <td> E0029</td>\n",
	" <td> G2011</td>\n",
	" <td> E0056</td>\n",
	" <td> E0055</td>\n",
	" <td> F5011</td>\n",
	" <td> F1076</td>\n",
	" <td> G2748</td>\n",
	" <td> F5025</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" 0 1 2 3 4 5 6 7 8 9\n",
	"0 H3818 E0070 G2743 F5003 F1039 G2053 G2779 G2785 H3005 E0065\n",
	"1 G2044 H3082 G2786 E0629 G2041 H3073 F1038 H3835 G2071 F1050\n",
	"2 G2046 G2759 F 1053 H3074 H3811 E0078 E0049 H3805 G2774 F5057\n",
	"3 F5052 G2063 F5022 G2013 E0636 E0674 H3053 F5000 E0086 F1057\n",
	"4 H3841 G2765 E0029 G2011 E0056 E0055 F5011 F1076 G2748 F5025"
	]
	},
	"execution_count": 69,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"samples = pd.read_csv('newborn_samples.csv', index_col=0)\n",
	"samples.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 77,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"array(['H3818', 'E0070', 'G2743', ..., 'F15078', 'F15126', 'E14412'], dtype=object)"
	]
	},
	"execution_count": 77,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"exclude = np.array(samples).ravel()\n",
	"exclude"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 62,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"try_again = [tuple(i) for i in np.array(try_again, int)]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 90,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"exclude = [i for i in exclude if i in newborns_complete.index]\n",
	"newborns_remainder = newborns_complete.drop(exclude)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 117,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"new_samples = []\n",
	"k = 10\n",
	"\n",
	"for ys in try_again:\n",
	" # Get number of cases\n",
	" n = 1\n",
	" # Randomize list of all corresponding newborns\n",
	" newborns_perm = np.random.permutation(newborns_remainder.index[newborns_remainder.birth_year_season==ys].tolist())\n",
	" # Select k newborns for each case\n",
	" sample = newborns_perm[:(n*k)]\n",
	" # Verify that we got enough samples\n",
	" if len(sample)!=(n*k):\n",
	" print 'Only {0} newborns in {1} when {2} were required'.format(len(sample), ys, n*k)\n",
	" new_samples += np.reshape(sample, (n,k)).tolist()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 118,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 118,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(np.unique(np.ravel(new_samples))) == len(np.ravel(new_samples))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 119,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"new_samples_df = pd.DataFrame(new_samples)\n",
	"#new_samples_df.to_csv('replacement_newborn_samples.csv')"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.6"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}