Skip to content

Instantly share code, notes, and snippets.

@brendancol
Created February 13, 2017 05:47
Show Gist options
  • Save brendancol/96e2e08dbab57ff3b1c0375b043b63b6 to your computer and use it in GitHub Desktop.
Save brendancol/96e2e08dbab57ff3b1c0375b043b63b6 to your computer and use it in GitHub Desktop.
Ethnicity from baby names NYC
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import pandas as pd
import numpy as np
from functools import partial
def encode_data(df, cols):
    """Label-encode the given columns of ``df`` as integers.

    Each column named in ``cols`` is overwritten with the integer codes
    produced by a ``LabelEncoder`` fitted on that column's values.

    Args:
        df: DataFrame holding the columns to encode. It is modified in
            place; the same object is also returned.
        cols: Iterable of column names to encode.

    Returns:
        A ``(df, encoders)`` tuple, where ``encoders`` maps each column
        name to the encoder fitted on it, so that codes can later be
        mapped back with ``inverse_transform``.
    """
    encoders = {}
    for col in cols:
        encoder = preprocessing.LabelEncoder()
        # fit_transform is the single-call equivalent of fit() + transform().
        df[col] = encoder.fit_transform(df[col])
        encoders[col] = encoder
    return df, encoders
# load the raw records
df = pd.read_csv('nyc_babies.csv')

# Normalise ethnicity labels: any ETHCTY value that contains one of these
# group names as a substring is replaced by the bare group name; all other
# values are left untouched. Done with a vectorised mask per group instead
# of a row-wise apply. Missing values are skipped (na=False).
for race in ('ASIAN', 'BLACK', 'WHITE'):
    has_race = df['ETHCTY'].str.contains(race, regex=False, na=False)
    df.loc[has_race, 'ETHCTY'] = race

# encode categorical values as ints
df, encoders = encode_data(df, ['BRTH_YR', 'GNDR', 'ETHCTY', 'NM'])

# duplicate row based on count field
# Repeating row positions (rather than df.values) keeps each column's dtype,
# so the encoded columns stay integer even if other columns are not numeric.
row_positions = np.repeat(np.arange(len(df)), df['CNT'].values)
rdf = df.iloc[row_positions].reset_index(drop=True)

# divide up training and test data (roughly 75% / 25%, unseeded)
# NOTE(review): rows are duplicated before the random split, so identical
# (NM, GNDR, BRTH_YR) rows can land in both train and test; the reported
# rates may therefore be optimistic.
rdf['is_train'] = np.random.uniform(0, 1, len(rdf)) <= .75
train, test = rdf[rdf['is_train']], rdf[~rdf['is_train']]

# create, fit, predict
features = ['NM', 'GNDR', 'BRTH_YR']
clf = RandomForestClassifier(n_jobs=4)
clf.fit(train[features], train['ETHCTY'])
pred = clf.predict(test[features])

# map integer codes back to the original ethnicity labels
pred = encoders['ETHCTY'].inverse_transform(pred)
actual_names = encoders['ETHCTY'].inverse_transform(test['ETHCTY'])

# confusion table; normalize='index' makes each "actual" row sum to 1
ct = pd.crosstab(actual_names,
                 pred,
                 rownames=['actual'],
                 colnames=['preds'],
                 normalize='index')
print(ct)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment