Last active
August 29, 2015 14:03
-
-
Save mjbommar/4cdf9e188c505a328809 to your computer and use it in GitHub Desktop.
Sample estimates of the probability that a name is male (is_male), conditioned on name, name/year, name/state, and name/year/state.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "", | |
"signature": "sha256:93ee95c96f30075d52fc69273cdd5b032909e7e1f4a284aaaeae74d821041049" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Imports\n", | |
"import requests\n", | |
"import glob\n", | |
"import pandas" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Read the headerless all-states names file, then label its columns\n", | |
"name_data = pandas.read_csv('names/all-states', header=None)\n", | |
"name_data.columns = ['state', 'sex', 'year', 'name', 'count']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Flag male rows (sex == M) and keep each row's count only when it is male (0 otherwise)\n", | |
"name_data['is_male'] = (name_data['sex'] == 'M')\n", | |
"name_data['male_count'] = name_data['is_male'] * name_data['count']\n", | |
"\n", | |
"# Normalize names: lowercase, then strip surrounding whitespace\n", | |
"name_data['name'] = name_data['name'].apply(str.lower).apply(str.strip)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 44 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"name_data.head()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>state</th>\n", | |
" <th>sex</th>\n", | |
" <th>year</th>\n", | |
" <th>name</th>\n", | |
" <th>count</th>\n", | |
" <th>is_male</th>\n", | |
" <th>male_count</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> AK</td>\n", | |
" <td> F</td>\n", | |
" <td> 1910</td>\n", | |
" <td> mary</td>\n", | |
" <td> 14</td>\n", | |
" <td> False</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> AK</td>\n", | |
" <td> F</td>\n", | |
" <td> 1910</td>\n", | |
" <td> annie</td>\n", | |
" <td> 12</td>\n", | |
" <td> False</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> AK</td>\n", | |
" <td> F</td>\n", | |
" <td> 1910</td>\n", | |
" <td> anna</td>\n", | |
" <td> 10</td>\n", | |
" <td> False</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> AK</td>\n", | |
" <td> F</td>\n", | |
" <td> 1910</td>\n", | |
" <td> margaret</td>\n", | |
" <td> 8</td>\n", | |
" <td> False</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> AK</td>\n", | |
" <td> F</td>\n", | |
" <td> 1910</td>\n", | |
" <td> helen</td>\n", | |
" <td> 7</td>\n", | |
" <td> False</td>\n", | |
" <td> 0</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows \u00d7 7 columns</p>\n", | |
"</div>" | |
], | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 45, | |
"text": [ | |
" state sex year name count is_male male_count\n", | |
"0 AK F 1910 mary 14 False 0\n", | |
"1 AK F 1910 annie 12 False 0\n", | |
"2 AK F 1910 anna 10 False 0\n", | |
"3 AK F 1910 margaret 8 False 0\n", | |
"4 AK F 1910 helen 7 False 0\n", | |
"\n", | |
"[5 rows x 7 columns]" | |
] | |
} | |
], | |
"prompt_number": 45 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# Probability that a name is male (summed male_count / summed count), by name, name/year, name/state, and name/year/state\n", | |
"prob_male = name_data.groupby('name')['male_count'].sum() / name_data.groupby('name')['count'].sum()\n", | |
"prob_male_year = name_data.groupby(['name', 'year'])['male_count'].sum() / name_data.groupby(['name', 'year'])['count'].sum()\n", | |
"prob_male_state = name_data.groupby(['name', 'state'])['male_count'].sum() / name_data.groupby(['name', 'state'])['count'].sum()\n", | |
"prob_male_year_state = name_data.groupby(['name', 'year', 'state'])['male_count'].sum() / name_data.groupby(['name', 'year', 'state'])['count'].sum()" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"prob_male['michael']" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 57, | |
"text": [ | |
"0.99599974867530949" | |
] | |
} | |
], | |
"prompt_number": 57 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"prob_male_state[('michael', 'CA')]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 58, | |
"text": [ | |
"0.99425400243755246" | |
] | |
} | |
], | |
"prompt_number": 58 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"prob_male_year[('michael', 2000)]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 59, | |
"text": [ | |
"0.99825441850316388" | |
] | |
} | |
], | |
"prompt_number": 59 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"prob_male_year[('michael', 1960)]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 65, | |
"text": [ | |
"0.99675659039524611" | |
] | |
} | |
], | |
"prompt_number": 65 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"prob_male_year[('chris', 1970)]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 71, | |
"text": [ | |
"0.91570959803117313" | |
] | |
} | |
], | |
"prompt_number": 71 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"prob_male_year[('amy', 2000)]" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 73, | |
"text": [ | |
"0.0" | |
] | |
} | |
], | |
"prompt_number": 73 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment