Last active
December 23, 2016 22:36
-
-
Save Prooffreader/d052ae788b7cb15354fca9709d45af57 to your computer and use it in GitHub Desktop.
baby_names_for_RR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>sex</th>\n", | |
" <th>year</th>\n", | |
" <th>freq</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>610282</th>\n", | |
" <td>Sullivan</td>\n", | |
" <td>M</td>\n", | |
" <td>2004</td>\n", | |
" <td>177</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>167599</th>\n", | |
" <td>Ketra</td>\n", | |
" <td>F</td>\n", | |
" <td>1988</td>\n", | |
" <td>9</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>412976</th>\n", | |
" <td>Kacey</td>\n", | |
" <td>F</td>\n", | |
" <td>1998</td>\n", | |
" <td>357</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>685649</th>\n", | |
" <td>Tishaun</td>\n", | |
" <td>M</td>\n", | |
" <td>2006</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>325223</th>\n", | |
" <td>Pawel</td>\n", | |
" <td>M</td>\n", | |
" <td>1994</td>\n", | |
" <td>29</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name sex year freq\n", | |
"610282 Sullivan M 2004 177\n", | |
"167599 Ketra F 1988 9\n", | |
"412976 Kacey F 1998 357\n", | |
"685649 Tishaun M 2006 7\n", | |
"325223 Pawel M 1994 29" | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df1 = pd.read_pickle(r'E:\\Dropbox\\Baby_names_US\\yob1940.pickle')\n", | |
"df1 = df1[['name', 'sex', 'year', 'births']] # get rid of extra columns\n", | |
"df1 = df1[dfo.year >= 1980] # just the '80s+\n", | |
"df1 = df1.rename(columns={'births': 'freq'}) # to match Roberto's df\n", | |
"df1 = df1.reset_index(drop=True)\n", | |
"df1.sample(5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>En</th>\n", | |
" <th>Fra</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>William</td>\n", | |
" <td>Guillaume</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Mary</td>\n", | |
" <td>Marie</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Stephen</td>\n", | |
" <td>Stephane</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Steven</td>\n", | |
" <td>Etienne</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" En Fra\n", | |
"0 William Guillaume\n", | |
"1 Mary Marie\n", | |
"2 Stephen Stephane\n", | |
"3 Steven Etienne\n", | |
"4 Emily Emilie" | |
] | |
}, | |
"execution_count": 35, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"name_pairs = [['William', 'Guillaume'],\n", | |
" ['Mary', 'Marie'],\n", | |
" ['Stephen', 'Stephane'],\n", | |
" ['Steven', 'Etienne'],\n", | |
" ['Emily', 'Emilie']]\n", | |
"\n", | |
"df2 = pd.DataFrame(name_pairs, columns=['En', 'Fra'])\n", | |
"df2 " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['Andreah',\n", | |
" 'Arabel',\n", | |
" 'Draper',\n", | |
" 'Emilie',\n", | |
" 'Emily',\n", | |
" 'Etienne',\n", | |
" 'Guillaume',\n", | |
" 'Keyondre',\n", | |
" 'Kolter',\n", | |
" 'Marie',\n", | |
" 'Mary',\n", | |
" 'Nadea',\n", | |
" 'Sarit',\n", | |
" 'Shanell',\n", | |
" 'Stephane',\n", | |
" 'Stephen',\n", | |
" 'Steven',\n", | |
" 'Tinsley',\n", | |
" 'William',\n", | |
" 'Yamir']" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# make a flat set of all the names in df2\n", | |
"names_in_df2 = set(list(df2.Eng) + list(df2.Fra))\n", | |
"\n", | |
"# add a few names just so the df2 don't match 1:1\n", | |
"extra_names = set(list(df1.sample(10).name))\n", | |
"\n", | |
"names_to_keep = names_in_df2\n", | |
"names_to_keep.update(extra_names)\n", | |
"\n", | |
"sorted(list(names_to_keep))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 70, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"928385\n", | |
"764\n" | |
] | |
} | |
], | |
"source": [ | |
"# limit df1 to these names\n", | |
"print(len(df1))\n", | |
"df1 = df1[df1.name.isin(names_to_keep)]\n", | |
"print(len(df1))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>sex</th>\n", | |
" <th>births</th>\n", | |
" <th>year</th>\n", | |
" <th>pct</th>\n", | |
" <th>ranked</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>695</th>\n", | |
" <td>Yamir</td>\n", | |
" <td>M</td>\n", | |
" <td>43</td>\n", | |
" <td>2009</td>\n", | |
" <td>0.002173</td>\n", | |
" <td>2815.5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>696</th>\n", | |
" <td>Yamir</td>\n", | |
" <td>M</td>\n", | |
" <td>39</td>\n", | |
" <td>2010</td>\n", | |
" <td>0.002039</td>\n", | |
" <td>2982.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>697</th>\n", | |
" <td>Yamir</td>\n", | |
" <td>M</td>\n", | |
" <td>44</td>\n", | |
" <td>2011</td>\n", | |
" <td>0.002326</td>\n", | |
" <td>2711.0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>698</th>\n", | |
" <td>Yamir</td>\n", | |
" <td>M</td>\n", | |
" <td>36</td>\n", | |
" <td>2012</td>\n", | |
" <td>0.001908</td>\n", | |
" <td>3133.5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>699</th>\n", | |
" <td>Yamir</td>\n", | |
" <td>M</td>\n", | |
" <td>35</td>\n", | |
" <td>2013</td>\n", | |
" <td>0.001870</td>\n", | |
" <td>3130.0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name sex births year pct ranked\n", | |
"695 Yamir M 43 2009 0.002173 2815.5\n", | |
"696 Yamir M 39 2010 0.002039 2982.0\n", | |
"697 Yamir M 44 2011 0.002326 2711.0\n", | |
"698 Yamir M 36 2012 0.001908 3133.5\n", | |
"699 Yamir M 35 2013 0.001870 3130.0" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# that's a lot shorter. We'll remove about some rows as well so the years aren't always\n", | |
"# necessarily contiguous, to prevent that being an edge case source of error\n", | |
"df1 = df1.sample(700).sort_values(['name', 'sex', 'year']).reset_index(drop=True)\n", | |
"df1.tail()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Wall time: 35.6 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"# here's the way I did it in my earlier pastebin\n", | |
"# I still can't figure out what went wrong, it seems to work here.\n", | |
"\n", | |
"from itertools import product\n", | |
"\n", | |
"new_df_rows = []\n", | |
"\n", | |
"for idx, row in df2.iterrows():\n", | |
" for sex, year in product(list(df1.sex.unique()),\n", | |
" list(range(df1.year.min(), df1.year.max()+1))):\n", | |
" df_eng = df1[(df1.name == row.En)&(df1.sex == sex)&(df1.year==year)]\n", | |
" if len(df_eng) == 0:\n", | |
" freq_eng = 0\n", | |
" elif len(df_eng) == 1:\n", | |
" freq_eng = df_eng.freq.iloc[0]\n", | |
" else:\n", | |
" raise Exception('name-sex-year is not unique in df1')\n", | |
" df_fra = df1[(df1.name == row.Fra)&(df1.sex == sex)&(df1.year==year)]\n", | |
" if len(df_fra) == 0:\n", | |
" freq_fra = 0\n", | |
" elif len(df_fra) == 1:\n", | |
" freq_fra = df_fra.freq.iloc[0]\n", | |
" else:\n", | |
" raise Exception('name-sex-year is not unique in df1')\n", | |
" if freq_eng > 0 and freq_fra > 0:\n", | |
" new_df_rows.append({'eng': row.En, 'fra': row.Fra,\n", | |
" 'sex': sex, 'year': year,\n", | |
" 'eng_freq': freq_eng, 'fra_freq': freq_fra})\n", | |
"\n", | |
"df3 = pd.DataFrame(new_df_rows)\n", | |
"df3 = df3[['eng', 'fra', 'sex', 'year', 'eng_freq', 'fra_freq']] # because dict columns aren't in order" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>eng</th>\n", | |
" <th>fra</th>\n", | |
" <th>sex</th>\n", | |
" <th>year</th>\n", | |
" <th>eng_freq</th>\n", | |
" <th>fra_freq</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>188</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1985</td>\n", | |
" <td>13133</td>\n", | |
" <td>285</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>189</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1986</td>\n", | |
" <td>13232</td>\n", | |
" <td>237</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>190</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1987</td>\n", | |
" <td>14711</td>\n", | |
" <td>277</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>191</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1988</td>\n", | |
" <td>15860</td>\n", | |
" <td>309</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>192</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1989</td>\n", | |
" <td>17323</td>\n", | |
" <td>305</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>193</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1990</td>\n", | |
" <td>19356</td>\n", | |
" <td>365</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>194</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1991</td>\n", | |
" <td>20311</td>\n", | |
" <td>363</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>195</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1992</td>\n", | |
" <td>21832</td>\n", | |
" <td>387</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>196</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1993</td>\n", | |
" <td>23590</td>\n", | |
" <td>455</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>197</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1994</td>\n", | |
" <td>24149</td>\n", | |
" <td>486</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>198</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1995</td>\n", | |
" <td>24379</td>\n", | |
" <td>482</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>199</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1996</td>\n", | |
" <td>25148</td>\n", | |
" <td>519</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>200</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1997</td>\n", | |
" <td>25729</td>\n", | |
" <td>525</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>201</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1998</td>\n", | |
" <td>26177</td>\n", | |
" <td>486</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>202</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>1999</td>\n", | |
" <td>26535</td>\n", | |
" <td>555</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>203</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2000</td>\n", | |
" <td>25952</td>\n", | |
" <td>562</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>204</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2001</td>\n", | |
" <td>25051</td>\n", | |
" <td>625</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>205</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2002</td>\n", | |
" <td>24456</td>\n", | |
" <td>624</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>206</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2003</td>\n", | |
" <td>25683</td>\n", | |
" <td>659</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>207</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2004</td>\n", | |
" <td>25021</td>\n", | |
" <td>658</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>208</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2005</td>\n", | |
" <td>23925</td>\n", | |
" <td>623</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>209</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2006</td>\n", | |
" <td>21385</td>\n", | |
" <td>630</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>210</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2007</td>\n", | |
" <td>19340</td>\n", | |
" <td>509</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>211</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2008</td>\n", | |
" <td>17419</td>\n", | |
" <td>442</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>212</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2009</td>\n", | |
" <td>15333</td>\n", | |
" <td>401</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>213</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2010</td>\n", | |
" <td>14256</td>\n", | |
" <td>380</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>214</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2011</td>\n", | |
" <td>14228</td>\n", | |
" <td>386</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>215</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2012</td>\n", | |
" <td>13606</td>\n", | |
" <td>377</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>216</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>F</td>\n", | |
" <td>2013</td>\n", | |
" <td>13044</td>\n", | |
" <td>515</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>217</th>\n", | |
" <td>Emily</td>\n", | |
" <td>Emilie</td>\n", | |
" <td>M</td>\n", | |
" <td>1989</td>\n", | |
" <td>82</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" eng fra sex year eng_freq fra_freq\n", | |
"188 Emily Emilie F 1985 13133 285\n", | |
"189 Emily Emilie F 1986 13232 237\n", | |
"190 Emily Emilie F 1987 14711 277\n", | |
"191 Emily Emilie F 1988 15860 309\n", | |
"192 Emily Emilie F 1989 17323 305\n", | |
"193 Emily Emilie F 1990 19356 365\n", | |
"194 Emily Emilie F 1991 20311 363\n", | |
"195 Emily Emilie F 1992 21832 387\n", | |
"196 Emily Emilie F 1993 23590 455\n", | |
"197 Emily Emilie F 1994 24149 486\n", | |
"198 Emily Emilie F 1995 24379 482\n", | |
"199 Emily Emilie F 1996 25148 519\n", | |
"200 Emily Emilie F 1997 25729 525\n", | |
"201 Emily Emilie F 1998 26177 486\n", | |
"202 Emily Emilie F 1999 26535 555\n", | |
"203 Emily Emilie F 2000 25952 562\n", | |
"204 Emily Emilie F 2001 25051 625\n", | |
"205 Emily Emilie F 2002 24456 624\n", | |
"206 Emily Emilie F 2003 25683 659\n", | |
"207 Emily Emilie F 2004 25021 658\n", | |
"208 Emily Emilie F 2005 23925 623\n", | |
"209 Emily Emilie F 2006 21385 630\n", | |
"210 Emily Emilie F 2007 19340 509\n", | |
"211 Emily Emilie F 2008 17419 442\n", | |
"212 Emily Emilie F 2009 15333 401\n", | |
"213 Emily Emilie F 2010 14256 380\n", | |
"214 Emily Emilie F 2011 14228 386\n", | |
"215 Emily Emilie F 2012 13606 377\n", | |
"216 Emily Emilie F 2013 13044 515\n", | |
"217 Emily Emilie M 1989 82 7" | |
] | |
}, | |
"execution_count": 43, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Okay, 35 seconds for that is ridiculous. Much efficiency can be had.\n", | |
"# let's check out the results:\n", | |
"df3.tail(30)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# that's typical for the US dataset in the late 80s, lots of errors" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 50, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[['William', 'Guillaume'],\n", | |
" ['Mary', 'Marie'],\n", | |
" ['Stephen', 'Stephane'],\n", | |
" ['Steven', 'Etienne'],\n", | |
" ['Emily', 'Emilie']]" | |
] | |
}, | |
"execution_count": 50, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Now let's get real. First of all, we don't need df2 at all, we can just go to a list of lists\n", | |
"# that will be faster and easier to keep track of.\n", | |
"name_pairs = df2.values.tolist()\n", | |
"name_pairs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 93, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Wall time: 27 ms\n" | |
] | |
} | |
], | |
"source": [ | |
"%%time\n", | |
"df_final = pd.DataFrame()\n", | |
"for name_en, name_fra in name_pairs:\n", | |
" df_en = df1[df1.name == name_en]\n", | |
" df_en = df_en.rename(columns={'freq': 'freq_en', 'name': 'name_en'})\n", | |
" df_fra = df1[df1.name == name_fra]\n", | |
" df_fra = df_fra.rename(columns={'freq': 'freq_fra', 'name': 'name_fra'})\n", | |
" df_both = pd.merge(df_en, df_fra, on=['sex', 'year'], how='outer')\n", | |
" # replace NaNs in merged names with the names themselves\n", | |
" df_both['name_en'] = df_both['name_en'].fillna(name_en)\n", | |
" df_both['name_fra'] = df_both['name_fra'].fillna(name_fra)\n", | |
" df_final = pd.concat([df_final, df_both])\n", | |
" \n", | |
"df_final = df_final.fillna(0)\n", | |
"df_final = df_final.reset_index(drop=True)\n", | |
"# because NaNs are floats, we have to re-coerce back to integer\n", | |
"for column in ['year', 'freq_en', 'freq_fra']:\n", | |
" df_final[column] = df_final[column].astype(int)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name_en</th>\n", | |
" <th>sex</th>\n", | |
" <th>year</th>\n", | |
" <th>freq_en</th>\n", | |
" <th>name_fra</th>\n", | |
" <th>freq_fra</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1980</td>\n", | |
" <td>215</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1980</td>\n", | |
" <td>25659</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1981</td>\n", | |
" <td>171</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1981</td>\n", | |
" <td>24788</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1982</td>\n", | |
" <td>178</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1982</td>\n", | |
" <td>25594</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1983</td>\n", | |
" <td>186</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>7</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1983</td>\n", | |
" <td>25381</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>8</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1984</td>\n", | |
" <td>158</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1984</td>\n", | |
" <td>24886</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>5</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1985</td>\n", | |
" <td>183</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1985</td>\n", | |
" <td>24607</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1986</td>\n", | |
" <td>165</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>13</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1986</td>\n", | |
" <td>24317</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1987</td>\n", | |
" <td>169</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>15</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1987</td>\n", | |
" <td>24199</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>9</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1988</td>\n", | |
" <td>164</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>17</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1988</td>\n", | |
" <td>24109</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>11</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1989</td>\n", | |
" <td>94</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1989</td>\n", | |
" <td>24674</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>9</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>20</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1990</td>\n", | |
" <td>61</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>21</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1990</td>\n", | |
" <td>24882</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>10</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>22</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1991</td>\n", | |
" <td>56</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>23</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1991</td>\n", | |
" <td>23863</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>8</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>24</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1992</td>\n", | |
" <td>60</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>25</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1992</td>\n", | |
" <td>23059</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>15</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>26</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1993</td>\n", | |
" <td>42</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1993</td>\n", | |
" <td>22207</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>28</th>\n", | |
" <td>William</td>\n", | |
" <td>F</td>\n", | |
" <td>1994</td>\n", | |
" <td>45</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>29</th>\n", | |
" <td>William</td>\n", | |
" <td>M</td>\n", | |
" <td>1994</td>\n", | |
" <td>21487</td>\n", | |
" <td>Guillaume</td>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name_en sex year freq_en name_fra freq_fra\n", | |
"0 William F 1980 215 Guillaume 0\n", | |
"1 William M 1980 25659 Guillaume 7\n", | |
"2 William F 1981 171 Guillaume 0\n", | |
"3 William M 1981 24788 Guillaume 8\n", | |
"4 William F 1982 178 Guillaume 0\n", | |
"5 William M 1982 25594 Guillaume 10\n", | |
"6 William F 1983 186 Guillaume 0\n", | |
"7 William M 1983 25381 Guillaume 8\n", | |
"8 William F 1984 158 Guillaume 0\n", | |
"9 William M 1984 24886 Guillaume 5\n", | |
"10 William F 1985 183 Guillaume 0\n", | |
"11 William M 1985 24607 Guillaume 0\n", | |
"12 William F 1986 165 Guillaume 0\n", | |
"13 William M 1986 24317 Guillaume 7\n", | |
"14 William F 1987 169 Guillaume 0\n", | |
"15 William M 1987 24199 Guillaume 9\n", | |
"16 William F 1988 164 Guillaume 0\n", | |
"17 William M 1988 24109 Guillaume 11\n", | |
"18 William F 1989 94 Guillaume 0\n", | |
"19 William M 1989 24674 Guillaume 9\n", | |
"20 William F 1990 61 Guillaume 0\n", | |
"21 William M 1990 24882 Guillaume 10\n", | |
"22 William F 1991 56 Guillaume 0\n", | |
"23 William M 1991 23863 Guillaume 8\n", | |
"24 William F 1992 60 Guillaume 0\n", | |
"25 William M 1992 23059 Guillaume 15\n", | |
"26 William F 1993 42 Guillaume 0\n", | |
"27 William M 1993 22207 Guillaume 0\n", | |
"28 William F 1994 45 Guillaume 0\n", | |
"29 William M 1994 21487 Guillaume 7" | |
] | |
}, | |
"execution_count": 94, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_final.head(30)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"1318.5185185185185\n" | |
] | |
} | |
], | |
"source": [ | |
"# What is the time savings?\n", | |
"print(35.6 / (27/1000))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [conda env:py35]", | |
"language": "python", | |
"name": "conda-env-py35-py" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment