Skip to content

Instantly share code, notes, and snippets.

@Prooffreader
Last active December 23, 2016 22:36
Show Gist options
  • Save Prooffreader/d052ae788b7cb15354fca9709d45af57 to your computer and use it in GitHub Desktop.
Save Prooffreader/d052ae788b7cb15354fca9709d45af57 to your computer and use it in GitHub Desktop.
baby_names_for_RR
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>sex</th>\n",
" <th>year</th>\n",
" <th>freq</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>610282</th>\n",
" <td>Sullivan</td>\n",
" <td>M</td>\n",
" <td>2004</td>\n",
" <td>177</td>\n",
" </tr>\n",
" <tr>\n",
" <th>167599</th>\n",
" <td>Ketra</td>\n",
" <td>F</td>\n",
" <td>1988</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>412976</th>\n",
" <td>Kacey</td>\n",
" <td>F</td>\n",
" <td>1998</td>\n",
" <td>357</td>\n",
" </tr>\n",
" <tr>\n",
" <th>685649</th>\n",
" <td>Tishaun</td>\n",
" <td>M</td>\n",
" <td>2006</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>325223</th>\n",
" <td>Pawel</td>\n",
" <td>M</td>\n",
" <td>1994</td>\n",
" <td>29</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name sex year freq\n",
"610282 Sullivan M 2004 177\n",
"167599 Ketra F 1988 9\n",
"412976 Kacey F 1998 357\n",
"685649 Tishaun M 2006 7\n",
"325223 Pawel M 1994 29"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df1 = pd.read_pickle(r'E:\\Dropbox\\Baby_names_US\\yob1940.pickle')\n",
"df1 = df1[['name', 'sex', 'year', 'births']] # get rid of extra columns\n",
"df1 = df1[dfo.year >= 1980] # just the '80s+\n",
"df1 = df1.rename(columns={'births': 'freq'}) # to match Roberto's df\n",
"df1 = df1.reset_index(drop=True)\n",
"df1.sample(5)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>En</th>\n",
" <th>Fra</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>William</td>\n",
" <td>Guillaume</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Mary</td>\n",
" <td>Marie</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Stephen</td>\n",
" <td>Stephane</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Steven</td>\n",
" <td>Etienne</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" En Fra\n",
"0 William Guillaume\n",
"1 Mary Marie\n",
"2 Stephen Stephane\n",
"3 Steven Etienne\n",
"4 Emily Emilie"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"name_pairs = [['William', 'Guillaume'],\n",
" ['Mary', 'Marie'],\n",
" ['Stephen', 'Stephane'],\n",
" ['Steven', 'Etienne'],\n",
" ['Emily', 'Emilie']]\n",
"\n",
"df2 = pd.DataFrame(name_pairs, columns=['En', 'Fra'])\n",
"df2 "
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['Andreah',\n",
" 'Arabel',\n",
" 'Draper',\n",
" 'Emilie',\n",
" 'Emily',\n",
" 'Etienne',\n",
" 'Guillaume',\n",
" 'Keyondre',\n",
" 'Kolter',\n",
" 'Marie',\n",
" 'Mary',\n",
" 'Nadea',\n",
" 'Sarit',\n",
" 'Shanell',\n",
" 'Stephane',\n",
" 'Stephen',\n",
" 'Steven',\n",
" 'Tinsley',\n",
" 'William',\n",
" 'Yamir']"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# make a flat set of all the names in df2\n",
"names_in_df2 = set(list(df2.Eng) + list(df2.Fra))\n",
"\n",
"# add a few names just so the df2 don't match 1:1\n",
"extra_names = set(list(df1.sample(10).name))\n",
"\n",
"names_to_keep = names_in_df2\n",
"names_to_keep.update(extra_names)\n",
"\n",
"sorted(list(names_to_keep))"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"928385\n",
"764\n"
]
}
],
"source": [
"# limit df1 to these names\n",
"print(len(df1))\n",
"df1 = df1[df1.name.isin(names_to_keep)]\n",
"print(len(df1))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>sex</th>\n",
" <th>births</th>\n",
" <th>year</th>\n",
" <th>pct</th>\n",
" <th>ranked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>695</th>\n",
" <td>Yamir</td>\n",
" <td>M</td>\n",
" <td>43</td>\n",
" <td>2009</td>\n",
" <td>0.002173</td>\n",
" <td>2815.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>696</th>\n",
" <td>Yamir</td>\n",
" <td>M</td>\n",
" <td>39</td>\n",
" <td>2010</td>\n",
" <td>0.002039</td>\n",
" <td>2982.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>697</th>\n",
" <td>Yamir</td>\n",
" <td>M</td>\n",
" <td>44</td>\n",
" <td>2011</td>\n",
" <td>0.002326</td>\n",
" <td>2711.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>698</th>\n",
" <td>Yamir</td>\n",
" <td>M</td>\n",
" <td>36</td>\n",
" <td>2012</td>\n",
" <td>0.001908</td>\n",
" <td>3133.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>699</th>\n",
" <td>Yamir</td>\n",
" <td>M</td>\n",
" <td>35</td>\n",
" <td>2013</td>\n",
" <td>0.001870</td>\n",
" <td>3130.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name sex births year pct ranked\n",
"695 Yamir M 43 2009 0.002173 2815.5\n",
"696 Yamir M 39 2010 0.002039 2982.0\n",
"697 Yamir M 44 2011 0.002326 2711.0\n",
"698 Yamir M 36 2012 0.001908 3133.5\n",
"699 Yamir M 35 2013 0.001870 3130.0"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# that's a lot shorter. We'll remove about some rows as well so the years aren't always\n",
"# necessarily contiguous, to prevent that being an edge case source of error\n",
"df1 = df1.sample(700).sort_values(['name', 'sex', 'year']).reset_index(drop=True)\n",
"df1.tail()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 35.6 s\n"
]
}
],
"source": [
"%%time\n",
"# here's the way I did it in my earlier pastebin\n",
"# I still can't figure out what went wrong, it seems to work here.\n",
"\n",
"from itertools import product\n",
"\n",
"new_df_rows = []\n",
"\n",
"for idx, row in df2.iterrows():\n",
" for sex, year in product(list(df1.sex.unique()),\n",
" list(range(df1.year.min(), df1.year.max()+1))):\n",
" df_eng = df1[(df1.name == row.En)&(df1.sex == sex)&(df1.year==year)]\n",
" if len(df_eng) == 0:\n",
" freq_eng = 0\n",
" elif len(df_eng) == 1:\n",
" freq_eng = df_eng.freq.iloc[0]\n",
" else:\n",
" raise Exception('name-sex-year is not unique in df1')\n",
" df_fra = df1[(df1.name == row.Fra)&(df1.sex == sex)&(df1.year==year)]\n",
" if len(df_fra) == 0:\n",
" freq_fra = 0\n",
" elif len(df_fra) == 1:\n",
" freq_fra = df_fra.freq.iloc[0]\n",
" else:\n",
" raise Exception('name-sex-year is not unique in df1')\n",
" if freq_eng > 0 and freq_fra > 0:\n",
" new_df_rows.append({'eng': row.En, 'fra': row.Fra,\n",
" 'sex': sex, 'year': year,\n",
" 'eng_freq': freq_eng, 'fra_freq': freq_fra})\n",
"\n",
"df3 = pd.DataFrame(new_df_rows)\n",
"df3 = df3[['eng', 'fra', 'sex', 'year', 'eng_freq', 'fra_freq']] # because dict columns aren't in order"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>eng</th>\n",
" <th>fra</th>\n",
" <th>sex</th>\n",
" <th>year</th>\n",
" <th>eng_freq</th>\n",
" <th>fra_freq</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>188</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1985</td>\n",
" <td>13133</td>\n",
" <td>285</td>\n",
" </tr>\n",
" <tr>\n",
" <th>189</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1986</td>\n",
" <td>13232</td>\n",
" <td>237</td>\n",
" </tr>\n",
" <tr>\n",
" <th>190</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1987</td>\n",
" <td>14711</td>\n",
" <td>277</td>\n",
" </tr>\n",
" <tr>\n",
" <th>191</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1988</td>\n",
" <td>15860</td>\n",
" <td>309</td>\n",
" </tr>\n",
" <tr>\n",
" <th>192</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1989</td>\n",
" <td>17323</td>\n",
" <td>305</td>\n",
" </tr>\n",
" <tr>\n",
" <th>193</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1990</td>\n",
" <td>19356</td>\n",
" <td>365</td>\n",
" </tr>\n",
" <tr>\n",
" <th>194</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1991</td>\n",
" <td>20311</td>\n",
" <td>363</td>\n",
" </tr>\n",
" <tr>\n",
" <th>195</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1992</td>\n",
" <td>21832</td>\n",
" <td>387</td>\n",
" </tr>\n",
" <tr>\n",
" <th>196</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1993</td>\n",
" <td>23590</td>\n",
" <td>455</td>\n",
" </tr>\n",
" <tr>\n",
" <th>197</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1994</td>\n",
" <td>24149</td>\n",
" <td>486</td>\n",
" </tr>\n",
" <tr>\n",
" <th>198</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1995</td>\n",
" <td>24379</td>\n",
" <td>482</td>\n",
" </tr>\n",
" <tr>\n",
" <th>199</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1996</td>\n",
" <td>25148</td>\n",
" <td>519</td>\n",
" </tr>\n",
" <tr>\n",
" <th>200</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1997</td>\n",
" <td>25729</td>\n",
" <td>525</td>\n",
" </tr>\n",
" <tr>\n",
" <th>201</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1998</td>\n",
" <td>26177</td>\n",
" <td>486</td>\n",
" </tr>\n",
" <tr>\n",
" <th>202</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>1999</td>\n",
" <td>26535</td>\n",
" <td>555</td>\n",
" </tr>\n",
" <tr>\n",
" <th>203</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2000</td>\n",
" <td>25952</td>\n",
" <td>562</td>\n",
" </tr>\n",
" <tr>\n",
" <th>204</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2001</td>\n",
" <td>25051</td>\n",
" <td>625</td>\n",
" </tr>\n",
" <tr>\n",
" <th>205</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2002</td>\n",
" <td>24456</td>\n",
" <td>624</td>\n",
" </tr>\n",
" <tr>\n",
" <th>206</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2003</td>\n",
" <td>25683</td>\n",
" <td>659</td>\n",
" </tr>\n",
" <tr>\n",
" <th>207</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2004</td>\n",
" <td>25021</td>\n",
" <td>658</td>\n",
" </tr>\n",
" <tr>\n",
" <th>208</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2005</td>\n",
" <td>23925</td>\n",
" <td>623</td>\n",
" </tr>\n",
" <tr>\n",
" <th>209</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2006</td>\n",
" <td>21385</td>\n",
" <td>630</td>\n",
" </tr>\n",
" <tr>\n",
" <th>210</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2007</td>\n",
" <td>19340</td>\n",
" <td>509</td>\n",
" </tr>\n",
" <tr>\n",
" <th>211</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2008</td>\n",
" <td>17419</td>\n",
" <td>442</td>\n",
" </tr>\n",
" <tr>\n",
" <th>212</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2009</td>\n",
" <td>15333</td>\n",
" <td>401</td>\n",
" </tr>\n",
" <tr>\n",
" <th>213</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2010</td>\n",
" <td>14256</td>\n",
" <td>380</td>\n",
" </tr>\n",
" <tr>\n",
" <th>214</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2011</td>\n",
" <td>14228</td>\n",
" <td>386</td>\n",
" </tr>\n",
" <tr>\n",
" <th>215</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2012</td>\n",
" <td>13606</td>\n",
" <td>377</td>\n",
" </tr>\n",
" <tr>\n",
" <th>216</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>F</td>\n",
" <td>2013</td>\n",
" <td>13044</td>\n",
" <td>515</td>\n",
" </tr>\n",
" <tr>\n",
" <th>217</th>\n",
" <td>Emily</td>\n",
" <td>Emilie</td>\n",
" <td>M</td>\n",
" <td>1989</td>\n",
" <td>82</td>\n",
" <td>7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" eng fra sex year eng_freq fra_freq\n",
"188 Emily Emilie F 1985 13133 285\n",
"189 Emily Emilie F 1986 13232 237\n",
"190 Emily Emilie F 1987 14711 277\n",
"191 Emily Emilie F 1988 15860 309\n",
"192 Emily Emilie F 1989 17323 305\n",
"193 Emily Emilie F 1990 19356 365\n",
"194 Emily Emilie F 1991 20311 363\n",
"195 Emily Emilie F 1992 21832 387\n",
"196 Emily Emilie F 1993 23590 455\n",
"197 Emily Emilie F 1994 24149 486\n",
"198 Emily Emilie F 1995 24379 482\n",
"199 Emily Emilie F 1996 25148 519\n",
"200 Emily Emilie F 1997 25729 525\n",
"201 Emily Emilie F 1998 26177 486\n",
"202 Emily Emilie F 1999 26535 555\n",
"203 Emily Emilie F 2000 25952 562\n",
"204 Emily Emilie F 2001 25051 625\n",
"205 Emily Emilie F 2002 24456 624\n",
"206 Emily Emilie F 2003 25683 659\n",
"207 Emily Emilie F 2004 25021 658\n",
"208 Emily Emilie F 2005 23925 623\n",
"209 Emily Emilie F 2006 21385 630\n",
"210 Emily Emilie F 2007 19340 509\n",
"211 Emily Emilie F 2008 17419 442\n",
"212 Emily Emilie F 2009 15333 401\n",
"213 Emily Emilie F 2010 14256 380\n",
"214 Emily Emilie F 2011 14228 386\n",
"215 Emily Emilie F 2012 13606 377\n",
"216 Emily Emilie F 2013 13044 515\n",
"217 Emily Emilie M 1989 82 7"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Okay, 35 seconds for that is ridiculous. Much efficiency can be had.\n",
"# let's check out the results:\n",
"df3.tail(30)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# that's typical for the US dataset in the late 80s, lots of errors"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[['William', 'Guillaume'],\n",
" ['Mary', 'Marie'],\n",
" ['Stephen', 'Stephane'],\n",
" ['Steven', 'Etienne'],\n",
" ['Emily', 'Emilie']]"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Now let's get real. First of all, we don't need df2 at all, we can just go to a list of lists\n",
"# that will be faster and easier to keep track of.\n",
"name_pairs = df2.values.tolist()\n",
"name_pairs"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Wall time: 27 ms\n"
]
}
],
"source": [
"%%time\n",
"df_final = pd.DataFrame()\n",
"for name_en, name_fra in name_pairs:\n",
" df_en = df1[df1.name == name_en]\n",
" df_en = df_en.rename(columns={'freq': 'freq_en', 'name': 'name_en'})\n",
" df_fra = df1[df1.name == name_fra]\n",
" df_fra = df_fra.rename(columns={'freq': 'freq_fra', 'name': 'name_fra'})\n",
" df_both = pd.merge(df_en, df_fra, on=['sex', 'year'], how='outer')\n",
" # replace NaNs in merged names with the names themselves\n",
" df_both['name_en'] = df_both['name_en'].fillna(name_en)\n",
" df_both['name_fra'] = df_both['name_fra'].fillna(name_fra)\n",
" df_final = pd.concat([df_final, df_both])\n",
" \n",
"df_final = df_final.fillna(0)\n",
"df_final = df_final.reset_index(drop=True)\n",
"# because NaNs are floats, we have to re-coerce back to integer\n",
"for column in ['year', 'freq_en', 'freq_fra']:\n",
" df_final[column] = df_final[column].astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name_en</th>\n",
" <th>sex</th>\n",
" <th>year</th>\n",
" <th>freq_en</th>\n",
" <th>name_fra</th>\n",
" <th>freq_fra</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1980</td>\n",
" <td>215</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1980</td>\n",
" <td>25659</td>\n",
" <td>Guillaume</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1981</td>\n",
" <td>171</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1981</td>\n",
" <td>24788</td>\n",
" <td>Guillaume</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1982</td>\n",
" <td>178</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1982</td>\n",
" <td>25594</td>\n",
" <td>Guillaume</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1983</td>\n",
" <td>186</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1983</td>\n",
" <td>25381</td>\n",
" <td>Guillaume</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1984</td>\n",
" <td>158</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1984</td>\n",
" <td>24886</td>\n",
" <td>Guillaume</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1985</td>\n",
" <td>183</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1985</td>\n",
" <td>24607</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1986</td>\n",
" <td>165</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1986</td>\n",
" <td>24317</td>\n",
" <td>Guillaume</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1987</td>\n",
" <td>169</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1987</td>\n",
" <td>24199</td>\n",
" <td>Guillaume</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1988</td>\n",
" <td>164</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1988</td>\n",
" <td>24109</td>\n",
" <td>Guillaume</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1989</td>\n",
" <td>94</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1989</td>\n",
" <td>24674</td>\n",
" <td>Guillaume</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1990</td>\n",
" <td>61</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1990</td>\n",
" <td>24882</td>\n",
" <td>Guillaume</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1991</td>\n",
" <td>56</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1991</td>\n",
" <td>23863</td>\n",
" <td>Guillaume</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1992</td>\n",
" <td>60</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1992</td>\n",
" <td>23059</td>\n",
" <td>Guillaume</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1993</td>\n",
" <td>42</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1993</td>\n",
" <td>22207</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>William</td>\n",
" <td>F</td>\n",
" <td>1994</td>\n",
" <td>45</td>\n",
" <td>Guillaume</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>William</td>\n",
" <td>M</td>\n",
" <td>1994</td>\n",
" <td>21487</td>\n",
" <td>Guillaume</td>\n",
" <td>7</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name_en sex year freq_en name_fra freq_fra\n",
"0 William F 1980 215 Guillaume 0\n",
"1 William M 1980 25659 Guillaume 7\n",
"2 William F 1981 171 Guillaume 0\n",
"3 William M 1981 24788 Guillaume 8\n",
"4 William F 1982 178 Guillaume 0\n",
"5 William M 1982 25594 Guillaume 10\n",
"6 William F 1983 186 Guillaume 0\n",
"7 William M 1983 25381 Guillaume 8\n",
"8 William F 1984 158 Guillaume 0\n",
"9 William M 1984 24886 Guillaume 5\n",
"10 William F 1985 183 Guillaume 0\n",
"11 William M 1985 24607 Guillaume 0\n",
"12 William F 1986 165 Guillaume 0\n",
"13 William M 1986 24317 Guillaume 7\n",
"14 William F 1987 169 Guillaume 0\n",
"15 William M 1987 24199 Guillaume 9\n",
"16 William F 1988 164 Guillaume 0\n",
"17 William M 1988 24109 Guillaume 11\n",
"18 William F 1989 94 Guillaume 0\n",
"19 William M 1989 24674 Guillaume 9\n",
"20 William F 1990 61 Guillaume 0\n",
"21 William M 1990 24882 Guillaume 10\n",
"22 William F 1991 56 Guillaume 0\n",
"23 William M 1991 23863 Guillaume 8\n",
"24 William F 1992 60 Guillaume 0\n",
"25 William M 1992 23059 Guillaume 15\n",
"26 William F 1993 42 Guillaume 0\n",
"27 William M 1993 22207 Guillaume 0\n",
"28 William F 1994 45 Guillaume 0\n",
"29 William M 1994 21487 Guillaume 7"
]
},
"execution_count": 94,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_final.head(30)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1318.5185185185185\n"
]
}
],
"source": [
"# What is the time savings?\n",
"print(35.6 / (27/1000))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda env:py35]",
"language": "python",
"name": "conda-env-py35-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment