Skip to content

Instantly share code, notes, and snippets.

@fonnesbeck
Created September 24, 2018 21:41
Show Gist options
  • Save fonnesbeck/d7359d69fb16dbe2bc79fae739b01404 to your computer and use it in GitHub Desktop.
Save fonnesbeck/d7359d69fb16dbe2bc79fae739b01404 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import data from REDCap"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from redcap import Project\n",
"\n",
"api_url = 'https://redcap.vanderbilt.edu/api/'\n",
"\n",
"newborns_key = open(\"/Users/fonnescj/Dropbox/Halasa Biostats/newborns_api_key.txt\").read()\n",
"newborns_proj = Project(api_url, newborns_key)\n",
"newborns = newborns_proj.export_records(format='df', df_kwargs={'index_col': newborns_proj.field_names[0]})\n",
"\n",
"hospitalized_key = open(\"/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt\").read()\n",
"hospitalized_proj = Project(api_url, hospitalized_key)\n",
"hospitalized = hospitalized_proj.export_records(format='df', df_kwargs={'index_col': hospitalized_proj.field_names[0]})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"hospitalized_key = open(\"/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt\").read()\n",
"hospitalized_proj = Project(api_url, hospitalized_key)\n",
"hospitalized_proj.metadata"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(14093, 125)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"newborns.shape"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(3175, 338)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospitalized.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Hospitalized with missing ID numbers:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Index([u'D3543'], dtype=object)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospitalized[(hospitalized.newborn_id==1) & (hospitalized.newborn_id_number.isnull())].index"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create `season` variable for newborns:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"case_id\n",
"A0001 2010-02-02\n",
"A0002 2010-01-29\n",
"A0003 2009-03-28\n",
"Name: child_birth_date, dtype: object"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospitalized.child_birth_date[:3]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"case_id_newborn\n",
"B1098 2010-11-22\n",
"B1545 2010-06-27\n",
"C2779 2010-12-20\n",
"Name: birth_date_newborn, dtype: object"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"newborns.birth_date_newborn[:3]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#hospitalized['birth_date'] = pd.to_datetime(hospitalized.child_birth_date, errors='raise', format='%Y-%m-%d')\n",
"hospitalized['birth_date'] = [pd.to_datetime(d) for d in hospitalized.child_birth_date]\n",
"#newborns['birth_date'] = pd.to_datetime(newborns.birth_date_newborn, errors='raise')\n",
"newborns['birth_date'] = [pd.to_datetime(d) for d in newborns.birth_date_newborn]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"count 3175\n",
"unique 1185\n",
"first 2008-05-20 00:00:00\n",
"last 2013-12-20 00:00:00\n",
"top 2011-12-14 00:00:00\n",
"freq 13\n",
"dtype: object"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospitalized.birth_date.describe()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"count 14090\n",
"unique 967\n",
"first 2010-01-29 00:00:00\n",
"last 2013-01-31 00:00:00\n",
"top 2011-08-24 00:00:00\n",
"freq 38\n",
"dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"newborns.birth_date.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Identify and remove bogus dates:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Series([], dtype: datetime64[ns])"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospitalized.birth_date[[type(d)!=pd.tslib.Timestamp for d in hospitalized.birth_date]]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"case_id_newborn\n",
"B1098 2010-11-22 00:00:00\n",
"B1545 2010-06-27 00:00:00\n",
"C2779 2010-12-20 00:00:00\n",
"E`10602 2012-06-11 00:00:00\n",
"E0001 2010-01-30 00:00:00\n",
"E0002 2010-01-29 00:00:00\n",
"E0003 2010-01-30 00:00:00\n",
"E0004 2010-01-30 00:00:00\n",
"E0005 2010-01-31 00:00:00\n",
"E0006 2010-01-31 00:00:00\n",
"E0007 2010-02-01 00:00:00\n",
"E0008 2010-02-01 00:00:00\n",
"E0009 2010-02-02 00:00:00\n",
"E0010 2010-02-02 00:00:00\n",
"E0011 2010-02-06 00:00:00\n",
"...\n",
"H7985 2011-10-24 00:00:00\n",
"H7986 2011-10-24 00:00:00\n",
"H7987 2011-10-24 00:00:00\n",
"H7988 2011-10-24 00:00:00\n",
"H7989 2011-10-25 00:00:00\n",
"H7990 2011-10-25 00:00:00\n",
"H7991 2011-10-25 00:00:00\n",
"H7992 2011-10-25 00:00:00\n",
"H7993 2011-10-25 00:00:00\n",
"H7994 2011-10-25 00:00:00\n",
"H7995 2011-10-25 00:00:00\n",
"H7996 2011-10-27 00:00:00\n",
"H7997 2011-10-26 00:00:00\n",
"H7998 2011-10-26 00:00:00\n",
"H7999 2011-10-26 00:00:00\n",
"Name: birth_date, Length: 14093, dtype: datetime64[ns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"newborns.birth_date[[type(d)!=datetime.datetime for d in newborns.birth_date]]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(Timestamp('2008-05-20 00:00:00', tz=None),\n",
" Timestamp('2013-12-20 00:00:00', tz=None))"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"min(hospitalized.birth_date[hospitalized.birth_date.notnull()]), \\\n",
"max(hospitalized.birth_date[hospitalized.birth_date.notnull()])"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(Timestamp('2010-01-29 00:00:00', tz=None),\n",
" Timestamp('2013-01-31 00:00:00', tz=None))"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"newborns.birth_date[newborns.birth_date.notnull()].min(), \\\n",
"newborns.birth_date[newborns.birth_date.notnull()].max()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(Timestamp('2010-01-29 00:00:00', tz=None),\n",
" Timestamp('2013-01-31 00:00:00', tz=None))"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"newborns.birth_date[newborns.birth_date.notnull()].min(), \\\n",
"newborns.birth_date[newborns.birth_date.notnull()].max()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remove records with null birth dates:"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"newborns_complete = newborns[newborns.birth_date.notnull()]\n",
"hospitalized_complete = hospitalized[hospitalized.birth_date.notnull() & hospitalized.newborn_id_number.notnull()]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create season-by-year variable"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"newborns_complete['birth_year_season'] = [(b.year, (b.month in [12,1,2])*1 or (b.month in [3,4,5])*2 or \\\n",
" (b.month in [6,7,8])*3 or 4) for b in newborns_complete.birth_date]\n",
"hospitalized_complete['birth_year_season'] = [(b.year, (b.month in [12,1,2])*1 or (b.month in [3,4,5])*2 or \\\n",
" (b.month in [6,7,8])*3 or 4) for b in hospitalized_complete.birth_date]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"List of all study seasons:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(2010, 1),\n",
" (2010, 2),\n",
" (2010, 3),\n",
" (2010, 4),\n",
" (2011, 1),\n",
" (2011, 2),\n",
" (2011, 3),\n",
" (2011, 4),\n",
" (2012, 1),\n",
" (2012, 2),\n",
" (2012, 3),\n",
" (2012, 4),\n",
" (2013, 1)]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"study_seasons = [(y,s) for y in (2010, 2011, 2012) for s in (1,2,3,4)] + [(2013,1)]\n",
"study_seasons"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Counts of season-by-year entries for both tables:"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(2012, 4) 70\n",
"(2012, 1) 53\n",
"(2011, 4) 53\n",
"(2011, 1) 49\n",
"(2011, 3) 45\n",
"(2012, 2) 38\n",
"(2012, 3) 34\n",
"(2011, 2) 27\n",
"(2010, 2) 18\n",
"(2010, 4) 16\n",
"(2010, 3) 15\n",
"(2010, 1) 14\n",
"(2013, 1) 6\n",
"dtype: int64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hospitalized_counts = hospitalized_complete.birth_year_season.value_counts()\n",
"hospitalized_counts"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(2012, 4) 1483\n",
"(2011, 3) 1454\n",
"(2011, 4) 1451\n",
"(2012, 1) 1377\n",
"(2012, 2) 1291\n",
"(2012, 3) 1126\n",
"(2011, 1) 1076\n",
"(2011, 2) 1031\n",
"(2010, 2) 1009\n",
"(2010, 4) 879\n",
"(2010, 3) 866\n",
"(2010, 1) 691\n",
"(2013, 1) 356\n",
"dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"newborn_counts = newborns_complete.birth_year_season.value_counts()\n",
"newborn_counts"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"samples = []\n",
"k = 10\n",
"\n",
"for ys in study_seasons:\n",
" # Get number of cases\n",
" n = hospitalized_counts[ys]\n",
" # Randomize list of all corresponding newborns\n",
" newborns_perm = np.random.permutation(newborns_complete.index[newborns_complete.birth_year_season==ys].tolist())\n",
" # Select 3 newborns for each case\n",
" sample = newborns_perm[:(n*k)]\n",
" # Verify that we got enough samples\n",
" if len(sample)!=(n*k):\n",
" print 'Only {0} newborns in {1} when {2} were required'.format(len(sample), ys, n*k)\n",
" samples += np.reshape(sample, (n,k)).tolist()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"bogus_samples = pd.DataFrame(samples)\n",
"try_again = bogus_samples.ix[[45 , 73 , 119 , 122 , 137 , 158 , 174 , 198],[0,1]]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(samples)==(len(hospitalized_complete[[ys in study_seasons for ys in hospitalized_complete.birth_year_season]]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Test that all samples are unique (no double sampling):"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(np.unique(np.ravel(samples))) == len(np.ravel(samples))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"samples_df = pd.DataFrame(samples)\n",
"#samples_df.to_csv('newborn_samples.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Resample for cases that could not get a control:"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> H3818</td>\n",
" <td> E0070</td>\n",
" <td> G2743</td>\n",
" <td> F5003</td>\n",
" <td> F1039</td>\n",
" <td> G2053</td>\n",
" <td> G2779</td>\n",
" <td> G2785</td>\n",
" <td> H3005</td>\n",
" <td> E0065</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> G2044</td>\n",
" <td> H3082</td>\n",
" <td> G2786</td>\n",
" <td> E0629</td>\n",
" <td> G2041</td>\n",
" <td> H3073</td>\n",
" <td> F1038</td>\n",
" <td> H3835</td>\n",
" <td> G2071</td>\n",
" <td> F1050</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> G2046</td>\n",
" <td> G2759</td>\n",
" <td> F 1053</td>\n",
" <td> H3074</td>\n",
" <td> H3811</td>\n",
" <td> E0078</td>\n",
" <td> E0049</td>\n",
" <td> H3805</td>\n",
" <td> G2774</td>\n",
" <td> F5057</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> F5052</td>\n",
" <td> G2063</td>\n",
" <td> F5022</td>\n",
" <td> G2013</td>\n",
" <td> E0636</td>\n",
" <td> E0674</td>\n",
" <td> H3053</td>\n",
" <td> F5000</td>\n",
" <td> E0086</td>\n",
" <td> F1057</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> H3841</td>\n",
" <td> G2765</td>\n",
" <td> E0029</td>\n",
" <td> G2011</td>\n",
" <td> E0056</td>\n",
" <td> E0055</td>\n",
" <td> F5011</td>\n",
" <td> F1076</td>\n",
" <td> G2748</td>\n",
" <td> F5025</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 6 7 8 9\n",
"0 H3818 E0070 G2743 F5003 F1039 G2053 G2779 G2785 H3005 E0065\n",
"1 G2044 H3082 G2786 E0629 G2041 H3073 F1038 H3835 G2071 F1050\n",
"2 G2046 G2759 F 1053 H3074 H3811 E0078 E0049 H3805 G2774 F5057\n",
"3 F5052 G2063 F5022 G2013 E0636 E0674 H3053 F5000 E0086 F1057\n",
"4 H3841 G2765 E0029 G2011 E0056 E0055 F5011 F1076 G2748 F5025"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samples = pd.read_csv('newborn_samples.csv', index_col=0)\n",
"samples.head()"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array(['H3818', 'E0070', 'G2743', ..., 'F15078', 'F15126', 'E14412'], dtype=object)"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"exclude = np.array(samples).ravel()\n",
"exclude"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"try_again = [tuple(i) for i in np.array(try_again, int)]"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"exclude = [i for i in exclude if i in newborns_complete.index]\n",
"newborns_remainder = newborns_complete.drop(exclude)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"new_samples = []\n",
"k = 10\n",
"\n",
"for ys in try_again:\n",
" # Get number of cases\n",
" n = 1\n",
" # Randomize list of all corresponding newborns\n",
" newborns_perm = np.random.permutation(newborns_remainder.index[newborns_remainder.birth_year_season==ys].tolist())\n",
" # Select k newborns for each case\n",
" sample = newborns_perm[:(n*k)]\n",
" # Verify that we got enough samples\n",
" if len(sample)!=(n*k):\n",
" print 'Only {0} newborns in {1} when {2} were required'.format(len(sample), ys, n*k)\n",
" new_samples += np.reshape(sample, (n,k)).tolist()"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(np.unique(np.ravel(new_samples))) == len(np.ravel(new_samples))"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"new_samples_df = pd.DataFrame(new_samples)\n",
"#new_samples_df.to_csv('replacement_newborn_samples.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment