Skip to content

Instantly share code, notes, and snippets.

@brendancol
Created February 13, 2017 05:47
Show Gist options
  • Save brendancol/f5c2a4dbb8b4e2d58c3a967619951dfa to your computer and use it in GitHub Desktop.
Save brendancol/f5c2a4dbb8b4e2d58c3a967619951dfa to your computer and use it in GitHub Desktop.
Ethnicity from baby names NYC
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import pandas as pd
import numpy as np
from functools import partial
def encode_data(df, cols):
    """Label-encode the given columns of ``df`` as integers.

    A separate ``LabelEncoder`` is fitted for every column named in
    ``cols`` and the column is overwritten with its encoded values, so
    the DataFrame passed in is modified in place.

    Args:
        df: DataFrame holding the columns to encode.
        cols: Iterable of column names to encode.

    Returns:
        A ``(df, encoders)`` tuple: the same DataFrame object, and a dict
        mapping each column name to its fitted encoder (needed later to
        turn codes back into the original labels).
    """
    encoders = {}
    for column in cols:
        # ``fit`` returns the encoder itself, so it can be chained.
        encoder = preprocessing.LabelEncoder().fit(df[column])
        df[column] = encoder.transform(df[column])
        encoders[column] = encoder
    return df, encoders
# Load the baby-name records. The script below relies on the columns
# BRTH_YR, GNDR, ETHCTY, NM and CNT being present.
df = pd.read_csv('nyc_babies.csv')


def eth_norm(r, race):
    """Return ``race`` if it occurs in the row's ETHCTY label, else the label."""
    label = r['ETHCTY']
    if race in label:
        return race
    return label


# Collapse every ETHCTY label that contains one of these keywords to the
# bare keyword; labels containing none of them are left as they are.
for keyword in ('ASIAN', 'BLACK', 'WHITE'):
    df['ETHCTY'] = df.apply(partial(eth_norm, race=keyword), axis=1)

# Encode categorical values as ints; keep the encoders for decoding later.
df, encoders = encode_data(df, ['BRTH_YR', 'GNDR', 'ETHCTY', 'NM'])

# Expand the aggregated data: every row is repeated CNT times.
repeated_rows = np.repeat(df.values, df['CNT'].values, axis=0)
rdf = pd.DataFrame(repeated_rows)
rdf.columns = df.columns

# Divide up training and test data: each row goes to the training set
# when its uniform draw is <= .75.
# NOTE(review): the draw is unseeded, so the split (and the printed table)
# differs between runs.
# NOTE(review): rows are duplicated before splitting, so identical feature
# rows can end up in both train and test -- confirm this is intended.
train_mask = np.random.uniform(0, 1, len(rdf)) <= .75
rdf['is_train'] = train_mask
train = rdf[rdf['is_train']]
test = rdf[~rdf['is_train']]

# Create, fit, predict.
features = ['NM', 'GNDR', 'BRTH_YR']
clf = RandomForestClassifier(n_jobs=4)
clf.fit(train[features], train['ETHCTY'])

# Decode predicted and actual integer codes back to ETHCTY labels.
eth_encoder = encoders['ETHCTY']
pred = eth_encoder.inverse_transform(clf.predict(test[features]))
actual_names = eth_encoder.inverse_transform(test['ETHCTY'])

# Cross-tabulate actual vs. predicted labels, then scale each "actual" row
# so that its entries sum to 1.
counts = pd.crosstab(actual_names, pred, rownames=['actual'], colnames=['preds'])
ct = counts.apply(lambda row: row / row.sum(), axis=1)
print(ct)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment