Created
September 24, 2018 21:41
-
-
Save fonnesbeck/d7359d69fb16dbe2bc79fae739b01404 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Import data from REDCap" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"from redcap import Project\n", | |
"\n", | |
"api_url = 'https://redcap.vanderbilt.edu/api/'\n", | |
"\n", | |
"newborns_key = open(\"/Users/fonnescj/Dropbox/Halasa Biostats/newborns_api_key.txt\").read()\n", | |
"newborns_proj = Project(api_url, newborns_key)\n", | |
"newborns = newborns_proj.export_records(format='df', df_kwargs={'index_col': newborns_proj.field_names[0]})\n", | |
"\n", | |
"hospitalized_key = open(\"/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt\").read()\n", | |
"hospitalized_proj = Project(api_url, hospitalized_key)\n", | |
"hospitalized = hospitalized_proj.export_records(format='df', df_kwargs={'index_col': hospitalized_proj.field_names[0]})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"hospitalized_key = open(\"/Users/fonnescj/Dropbox/Halasa Biostats/hospitalized_api_key.txt\").read()\n", | |
"hospitalized_proj = Project(api_url, hospitalized_key)\n", | |
"hospitalized_proj.metadata" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(14093, 125)" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"newborns.shape" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(3175, 338)" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"hospitalized.shape" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Hospitalized with missing ID numbers:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Index([u'D3543'], dtype=object)" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"hospitalized[(hospitalized.newborn_id==1) & (hospitalized.newborn_id_number.isnull())].index" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Create `season` variable for newborns:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"case_id\n", | |
"A0001 2010-02-02\n", | |
"A0002 2010-01-29\n", | |
"A0003 2009-03-28\n", | |
"Name: child_birth_date, dtype: object" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"hospitalized.child_birth_date[:3]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"case_id_newborn\n", | |
"B1098 2010-11-22\n", | |
"B1545 2010-06-27\n", | |
"C2779 2010-12-20\n", | |
"Name: birth_date_newborn, dtype: object" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"newborns.birth_date_newborn[:3]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#hospitalized['birth_date'] = pd.to_datetime(hospitalized.child_birth_date, errors='raise', format='%Y-%m-%d')\n", | |
"hospitalized['birth_date'] = [pd.to_datetime(d) for d in hospitalized.child_birth_date]\n", | |
"#newborns['birth_date'] = pd.to_datetime(newborns.birth_date_newborn, errors='raise')\n", | |
"newborns['birth_date'] = [pd.to_datetime(d) for d in newborns.birth_date_newborn]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"count 3175\n", | |
"unique 1185\n", | |
"first 2008-05-20 00:00:00\n", | |
"last 2013-12-20 00:00:00\n", | |
"top 2011-12-14 00:00:00\n", | |
"freq 13\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"hospitalized.birth_date.describe()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"count 14090\n", | |
"unique 967\n", | |
"first 2010-01-29 00:00:00\n", | |
"last 2013-01-31 00:00:00\n", | |
"top 2011-08-24 00:00:00\n", | |
"freq 38\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"newborns.birth_date.describe()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Identify and remove bogus dates:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Series([], dtype: datetime64[ns])" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"hospitalized.birth_date[[type(d)!=pd.tslib.Timestamp for d in hospitalized.birth_date]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"case_id_newborn\n", | |
"B1098 2010-11-22 00:00:00\n", | |
"B1545 2010-06-27 00:00:00\n", | |
"C2779 2010-12-20 00:00:00\n", | |
"E`10602 2012-06-11 00:00:00\n", | |
"E0001 2010-01-30 00:00:00\n", | |
"E0002 2010-01-29 00:00:00\n", | |
"E0003 2010-01-30 00:00:00\n", | |
"E0004 2010-01-30 00:00:00\n", | |
"E0005 2010-01-31 00:00:00\n", | |
"E0006 2010-01-31 00:00:00\n", | |
"E0007 2010-02-01 00:00:00\n", | |
"E0008 2010-02-01 00:00:00\n", | |
"E0009 2010-02-02 00:00:00\n", | |
"E0010 2010-02-02 00:00:00\n", | |
"E0011 2010-02-06 00:00:00\n", | |
"...\n", | |
"H7985 2011-10-24 00:00:00\n", | |
"H7986 2011-10-24 00:00:00\n", | |
"H7987 2011-10-24 00:00:00\n", | |
"H7988 2011-10-24 00:00:00\n", | |
"H7989 2011-10-25 00:00:00\n", | |
"H7990 2011-10-25 00:00:00\n", | |
"H7991 2011-10-25 00:00:00\n", | |
"H7992 2011-10-25 00:00:00\n", | |
"H7993 2011-10-25 00:00:00\n", | |
"H7994 2011-10-25 00:00:00\n", | |
"H7995 2011-10-25 00:00:00\n", | |
"H7996 2011-10-27 00:00:00\n", | |
"H7997 2011-10-26 00:00:00\n", | |
"H7998 2011-10-26 00:00:00\n", | |
"H7999 2011-10-26 00:00:00\n", | |
"Name: birth_date, Length: 14093, dtype: datetime64[ns]" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"newborns.birth_date[[type(d)!=datetime.datetime for d in newborns.birth_date]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(Timestamp('2008-05-20 00:00:00', tz=None),\n", | |
" Timestamp('2013-12-20 00:00:00', tz=None))" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"min(hospitalized.birth_date[hospitalized.birth_date.notnull()]), \\\n", | |
"max(hospitalized.birth_date[hospitalized.birth_date.notnull()])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(Timestamp('2010-01-29 00:00:00', tz=None),\n", | |
" Timestamp('2013-01-31 00:00:00', tz=None))" | |
] | |
}, | |
"execution_count": 13, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"newborns.birth_date[newborns.birth_date.notnull()].min(), \\\n", | |
"newborns.birth_date[newborns.birth_date.notnull()].max()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(Timestamp('2010-01-29 00:00:00', tz=None),\n", | |
" Timestamp('2013-01-31 00:00:00', tz=None))" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"newborns.birth_date[newborns.birth_date.notnull()].min(), \\\n", | |
"newborns.birth_date[newborns.birth_date.notnull()].max()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Remove records with null birth dates:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"newborns_complete = newborns[newborns.birth_date.notnull()]\n", | |
"hospitalized_complete = hospitalized[hospitalized.birth_date.notnull() & hospitalized.newborn_id_number.notnull()]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Create season-by-year variable" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"newborns_complete['birth_year_season'] = [(b.year, (b.month in [12,1,2])*1 or (b.month in [3,4,5])*2 or \\\n", | |
" (b.month in [6,7,8])*3 or 4) for b in newborns_complete.birth_date]\n", | |
"hospitalized_complete['birth_year_season'] = [(b.year, (b.month in [12,1,2])*1 or (b.month in [3,4,5])*2 or \\\n", | |
" (b.month in [6,7,8])*3 or 4) for b in hospitalized_complete.birth_date]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"List of all study seasons:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[(2010, 1),\n", | |
" (2010, 2),\n", | |
" (2010, 3),\n", | |
" (2010, 4),\n", | |
" (2011, 1),\n", | |
" (2011, 2),\n", | |
" (2011, 3),\n", | |
" (2011, 4),\n", | |
" (2012, 1),\n", | |
" (2012, 2),\n", | |
" (2012, 3),\n", | |
" (2012, 4),\n", | |
" (2013, 1)]" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"study_seasons = [(y,s) for y in (2010, 2011, 2012) for s in (1,2,3,4)] + [(2013,1)]\n", | |
"study_seasons" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Counts of season-by-year entries for both tables:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(2012, 4) 70\n", | |
"(2012, 1) 53\n", | |
"(2011, 4) 53\n", | |
"(2011, 1) 49\n", | |
"(2011, 3) 45\n", | |
"(2012, 2) 38\n", | |
"(2012, 3) 34\n", | |
"(2011, 2) 27\n", | |
"(2010, 2) 18\n", | |
"(2010, 4) 16\n", | |
"(2010, 3) 15\n", | |
"(2010, 1) 14\n", | |
"(2013, 1) 6\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 20, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"hospitalized_counts = hospitalized_complete.birth_year_season.value_counts()\n", | |
"hospitalized_counts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(2012, 4) 1483\n", | |
"(2011, 3) 1454\n", | |
"(2011, 4) 1451\n", | |
"(2012, 1) 1377\n", | |
"(2012, 2) 1291\n", | |
"(2012, 3) 1126\n", | |
"(2011, 1) 1076\n", | |
"(2011, 2) 1031\n", | |
"(2010, 2) 1009\n", | |
"(2010, 4) 879\n", | |
"(2010, 3) 866\n", | |
"(2010, 1) 691\n", | |
"(2013, 1) 356\n", | |
"dtype: int64" | |
] | |
}, | |
"execution_count": 21, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"newborn_counts = newborns_complete.birth_year_season.value_counts()\n", | |
"newborn_counts" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 114, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"samples = []\n", | |
"k = 10\n", | |
"\n", | |
"for ys in study_seasons:\n", | |
" # Get number of cases\n", | |
" n = hospitalized_counts[ys]\n", | |
" # Randomize list of all corresponding newborns\n", | |
" newborns_perm = np.random.permutation(newborns_complete.index[newborns_complete.birth_year_season==ys].tolist())\n", | |
" # Select 3 newborns for each case\n", | |
" sample = newborns_perm[:(n*k)]\n", | |
" # Verify that we got enough samples\n", | |
" if len(sample)!=(n*k):\n", | |
" print 'Only {0} newborns in {1} when {2} were required'.format(len(sample), ys, n*k)\n", | |
" samples += np.reshape(sample, (n,k)).tolist()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"bogus_samples = pd.DataFrame(samples)\n", | |
"try_again = bogus_samples.ix[[45 , 73 , 119 , 122 , 137 , 158 , 174 , 198],[0,1]]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 23, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(samples)==(len(hospitalized_complete[[ys in study_seasons for ys in hospitalized_complete.birth_year_season]]))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Test that all samples are unique (no double sampling):" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(np.unique(np.ravel(samples))) == len(np.ravel(samples))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"samples_df = pd.DataFrame(samples)\n", | |
"#samples_df.to_csv('newborn_samples.csv')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Resample for cases that could not get a control:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>0</th>\n", | |
" <th>1</th>\n", | |
" <th>2</th>\n", | |
" <th>3</th>\n", | |
" <th>4</th>\n", | |
" <th>5</th>\n", | |
" <th>6</th>\n", | |
" <th>7</th>\n", | |
" <th>8</th>\n", | |
" <th>9</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> H3818</td>\n", | |
" <td> E0070</td>\n", | |
" <td> G2743</td>\n", | |
" <td> F5003</td>\n", | |
" <td> F1039</td>\n", | |
" <td> G2053</td>\n", | |
" <td> G2779</td>\n", | |
" <td> G2785</td>\n", | |
" <td> H3005</td>\n", | |
" <td> E0065</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> G2044</td>\n", | |
" <td> H3082</td>\n", | |
" <td> G2786</td>\n", | |
" <td> E0629</td>\n", | |
" <td> G2041</td>\n", | |
" <td> H3073</td>\n", | |
" <td> F1038</td>\n", | |
" <td> H3835</td>\n", | |
" <td> G2071</td>\n", | |
" <td> F1050</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> G2046</td>\n", | |
" <td> G2759</td>\n", | |
" <td> F 1053</td>\n", | |
" <td> H3074</td>\n", | |
" <td> H3811</td>\n", | |
" <td> E0078</td>\n", | |
" <td> E0049</td>\n", | |
" <td> H3805</td>\n", | |
" <td> G2774</td>\n", | |
" <td> F5057</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> F5052</td>\n", | |
" <td> G2063</td>\n", | |
" <td> F5022</td>\n", | |
" <td> G2013</td>\n", | |
" <td> E0636</td>\n", | |
" <td> E0674</td>\n", | |
" <td> H3053</td>\n", | |
" <td> F5000</td>\n", | |
" <td> E0086</td>\n", | |
" <td> F1057</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> H3841</td>\n", | |
" <td> G2765</td>\n", | |
" <td> E0029</td>\n", | |
" <td> G2011</td>\n", | |
" <td> E0056</td>\n", | |
" <td> E0055</td>\n", | |
" <td> F5011</td>\n", | |
" <td> F1076</td>\n", | |
" <td> G2748</td>\n", | |
" <td> F5025</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" 0 1 2 3 4 5 6 7 8 9\n", | |
"0 H3818 E0070 G2743 F5003 F1039 G2053 G2779 G2785 H3005 E0065\n", | |
"1 G2044 H3082 G2786 E0629 G2041 H3073 F1038 H3835 G2071 F1050\n", | |
"2 G2046 G2759 F 1053 H3074 H3811 E0078 E0049 H3805 G2774 F5057\n", | |
"3 F5052 G2063 F5022 G2013 E0636 E0674 H3053 F5000 E0086 F1057\n", | |
"4 H3841 G2765 E0029 G2011 E0056 E0055 F5011 F1076 G2748 F5025" | |
] | |
}, | |
"execution_count": 69, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"samples = pd.read_csv('newborn_samples.csv', index_col=0)\n", | |
"samples.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 77, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(['H3818', 'E0070', 'G2743', ..., 'F15078', 'F15126', 'E14412'], dtype=object)" | |
] | |
}, | |
"execution_count": 77, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"exclude = np.array(samples).ravel()\n", | |
"exclude" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"try_again = [tuple(i) for i in np.array(try_again, int)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 90, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"exclude = [i for i in exclude if i in newborns_complete.index]\n", | |
"newborns_remainder = newborns_complete.drop(exclude)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 117, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"new_samples = []\n", | |
"k = 10\n", | |
"\n", | |
"for ys in try_again:\n", | |
" # Get number of cases\n", | |
" n = 1\n", | |
" # Randomize list of all corresponding newborns\n", | |
" newborns_perm = np.random.permutation(newborns_remainder.index[newborns_remainder.birth_year_season==ys].tolist())\n", | |
" # Select k newborns for each case\n", | |
" sample = newborns_perm[:(n*k)]\n", | |
" # Verify that we got enough samples\n", | |
" if len(sample)!=(n*k):\n", | |
" print 'Only {0} newborns in {1} when {2} were required'.format(len(sample), ys, n*k)\n", | |
" new_samples += np.reshape(sample, (n,k)).tolist()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 118, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 118, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"len(np.unique(np.ravel(new_samples))) == len(np.ravel(new_samples))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 119, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"new_samples_df = pd.DataFrame(new_samples)\n", | |
"#new_samples_df.to_csv('replacement_newborn_samples.csv')" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment