Classification of the Adult dataset (from the UCI Machine Learning Repository) with two Theano models: 1. logistic regression, 2. an MLP (multi-layer perceptron).
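Both scripts read the raw data files from the working directory. A minimal sketch for fetching them, assuming the usual UCI repository paths (Python 2, matching the first script; under Python 3 use urllib.request.urlretrieve):

import urllib
base = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/'
for name in ('adult.data', 'adult.test'):
    urllib.urlretrieve(base + name, name)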
#
# adult_LR_classifier.py   date: 10/17/2015
# SGD (Stochastic Gradient Descent) version
# uses 3 features of the dataset
#
import numpy as np
import pandas as pd
import timeit
import theano
import theano.tensor as T
def load_data():
    def to_float(i):
        return float(i)

    def is_rich(labelstr):
        # income label: 1.0 for '>50K', 0.0 for '<=50K', -1.0 as an error code
        if '>50K' in labelstr:
            res = 1.0
        elif '<=50K' in labelstr:
            res = 0.0
        else:
            res = -1.0
        return res
    def to_fami_size(fami_str):
        # map the 'relationship' feature to a rough family-size score
        fami_str = fami_str.strip()
        if fami_str in ('Wife', 'Husband'):
            res = 2.0
        elif fami_str == 'Own-child':
            res = 4.0
        elif fami_str == 'Other-relative':
            res = 2.0
        elif fami_str in ('Not-in-family', 'Unmarried'):
            res = 1.0
        else:
            res = 0.0
        return res
    colnames = ['age', 'wc', 'dmy1', 'educ', 'edu_num', 'marital', 'occup',
                'relat', 'race', 'sex', 'cap_g', 'cap_l', 'hrs', 'native',
                'incom']
    mydf = pd.read_csv('adult.data', header=None, names=colnames)
    mydf.dropna(inplace=True)
    xmat = np.column_stack((mydf['edu_num'].apply(to_float).values,
                            mydf['relat'].apply(to_fami_size).values,
                            mydf['hrs'].apply(to_float).values))
    ymat = mydf['incom'].apply(is_rich).values

    return xmat, ymat   # shapes: xmat [m, 3], ymat [m]
def setup_data(xmat, ymat):
    # store the data into 'shared' variables to be accessible by Theano
    def shared_dataset(xm, ym, borrow=True):
        shared_x = theano.shared(np.asarray(xm, dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(np.asarray(ym, dtype=theano.config.floatX),
                                 borrow=borrow)
        return shared_x, shared_y

    def data_shuffle(xm, ym, siz):
        # shuffle samples and labels with one shared permutation
        idv = np.random.permutation(siz)
        return xm[idv], ym[idv]

    total_len = ymat.shape[0]
    n_features = int(np.size(xmat) / total_len)
    # Random Shuffle
    xmat, ymat = data_shuffle(xmat, ymat, total_len)
    train_len = int(total_len * 0.7)
    test_len = total_len - train_len
    xtr, ytr = shared_dataset(
        (xmat[:train_len]).reshape(train_len, n_features),
        ymat[:train_len])
    xte, yte = shared_dataset(
        (xmat[train_len:]).reshape(test_len, n_features),
        ymat[train_len:])
    rval = [(xtr, ytr), (xte, yte)]
    return rval
if __name__ == "__main__":
    np.random.seed(20151017)
    xmat, ymat = load_data()
    datasets = setup_data(xmat, ymat)
    xtr, ytr = datasets[0]   # training set (shared variables)
    xte, yte = datasets[1]   # test set (shared variables)

    # Declare Theano symbolic variables
    xtr_nrow, xtr_ncol = (xtr.get_value()).shape
    index = T.lscalar()          # index to a [mini]batch
    learning_rate = T.scalar()
    x = T.matrix('x')
    y = T.vector('y')
    w = theano.shared(np.zeros(xtr_ncol), name='w')   # w, b <- all zero
    b = theano.shared(0., name='b')
    print ' Initial model: '
    wi = w.get_value()
    bi = b.get_value()
    np.set_printoptions(precision=4)
    print 'w : ', wi, 'b : ', bi

    myp = T.nnet.sigmoid(T.dot(x, w) + b)
    prediction = myp > 0.5                        # from theano tutorial
    xent = T.nnet.binary_crossentropy(myp, y)
    cost = xent.mean() + 0.01 * (w ** 2).sum()    # L2 regularization
    gw, gb = T.grad(cost, [w, b])
    #############################################
    batch_size = 50
    #############################################
    # Compile
    train_model = theano.function(
        inputs=[index, learning_rate],
        outputs=[cost, prediction],
        updates=((w, w - learning_rate * gw), (b, b - learning_rate * gb)),
        givens=[(x, xtr[index * batch_size:(index + 1) * batch_size]),
                (y, ytr[index * batch_size:(index + 1) * batch_size])],
        allow_input_downcast=True
    )
    predict = theano.function(
        inputs=[],
        outputs=prediction,
        givens=[(x, xte)],
        allow_input_downcast=True
    )
    # Train (Optimization)
    start_time = timeit.default_timer()
    n_epochs = 50
    epoch = 0
    lrate_base = 0.01
    lrate_coef = 20
    n_train_batches = int(ytr.get_value(borrow=True).shape[0] / batch_size)

    while epoch < n_epochs:
        epoch += 1
        for mini_batch_index in range(n_train_batches):
            # decay the learning rate as training proceeds
            l_rate = lrate_base * lrate_coef / (epoch + lrate_coef)
            cost_j, pred = train_model(mini_batch_index, l_rate)
        print 'epoch[%3d] : cost =%f ' % (epoch, cost_j)

    # Print result
    print '\n Final model: '
    wf = w.get_value()
    bf = b.get_value()
    np.set_printoptions(precision=4)
    print 'w : ', wf, 'b : ', bf
    print 'Elapsed time: %10.3f [s]' % (timeit.default_timer() - start_time)

    mypred = (predict()).flatten()
    iv_yte = (yte.get_value()).astype(int)
    accu = (mypred == iv_yte).astype(int)
    accu = accu.sum() * 1.0 / iv_yte.shape[0]
    print 'accuracy = %12.4f ' % accu
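For reference, the learned logistic-regression parameters can also be applied outside Theano. A minimal sketch (not part of the original script) that scores one hand-made example with the final wf and bf printed above; the feature values are illustrative only:

def lr_score(feat, w, b):
    # sigmoid of the linear score, the same form as the Theano graph above
    return 1.0 / (1.0 + np.exp(-(np.dot(feat, w) + b)))

# e.g. education-num 13, family-size code 2.0, 40 hours per week
print lr_score(np.array([13.0, 2.0, 40.0]), wf, bf)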
#
# adult_data_classifier.py   date: 2/4/2016
#
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
# import matplotlib.pyplot as plt
import pandas as pd
import timeit
import theano
import theano.tensor as T


def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)
def load_data(filename='adult.data'):
    '''
    Load the "Adult" data set. It has 14 features and 1 label; the
    parenthesized features below are not used by this script.
    Features:
      1. age: continuous.
     (2.) workclass: Private, Self-emp-not-inc, Self-emp-inc, ...
      3. fnlwgt: continuous.
      4. education: Bachelors, Some-college, 11th, HS-grad, Prof-school,
         Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th,
         Doctorate, 5th-6th, Preschool.
      5. education-num: continuous.
      6. marital-status: Married-civ-spouse, Divorced, Never-married,
         Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
     (7.) occupation: Tech-support, Craft-repair, Other-service, ...
      8. relationship: Wife, Own-child, Husband, Not-in-family,
         Other-relative, Unmarried.
      9. race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
     10. sex: Female, Male.
     11. capital-gain: continuous.
     12. capital-loss: continuous.
     13. hours-per-week: continuous.
    (14.) native-country: United-States, Cambodia, England, ...
    Label:
        >50K, <=50K.
    '''
    def to_float(i):
        # to process continuous data, including integers
        return float(i)

    def is_rich(labelstr):
        # to process the label
        if '>50K' in labelstr:
            res = 1.0
        elif '<=50K' in labelstr:
            res = 0.0
        else:
            res = -1.   # error code
        return res
    def edu_type(edu_str):
        # to process the 'education' feature
        edu_type_names = ['Bachelors', 'Some-college', '11th', 'HS-grad',
            'Prof-school', 'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th',
            '12th', 'Masters', '1st-4th', '10th', 'Doctorate', '5th-6th',
            'Preschool']
        edu_str = edu_str.strip(' ')
        try:
            res = edu_type_names.index(edu_str)
        except ValueError:
            res = -1
        return float(res)

    def marital_status(mar_str):
        # to process the 'marital-status' feature
        mar_type_names = ['Married-civ-spouse', 'Divorced', 'Never-married',
                          'Separated', 'Widowed', 'Married-spouse-absent',
                          'Married-AF-spouse']
        mar_str = mar_str.strip(' ')
        try:
            res = mar_type_names.index(mar_str)
        except ValueError:
            res = -1
        return float(res)
    def to_fami_size(fami_str):
        # to process the 'relationship' feature
        fami_str = fami_str.strip(' ')
        if fami_str in ('Wife', 'Husband'):
            res = 2.0
        elif fami_str == 'Own-child':
            res = 4.0
        elif fami_str == 'Other-relative':
            res = 2.0
        elif fami_str in ('Not-in-family', 'Unmarried'):
            res = 1.0
        else:
            res = -1.   # error code
        return res
    def race_type(race_str):
        # to process the 'race' feature
        race_names = ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
                      'Other', 'Black']
        race_str = race_str.strip(' ')
        try:
            res = race_names.index(race_str)
        except ValueError:
            res = -1
        return float(res)

    def sex_type(sex_str):
        # to process the 'sex' feature
        sex_names = ['Female', 'Male']
        sex_str = sex_str.strip(' ')
        try:
            res = sex_names.index(sex_str)
        except ValueError:
            res = -1
        return float(res)
    colnames = ['age', 'wc', 'flnwgt', 'educ', 'edu_num', 'marital', 'occup',
                'relat', 'race', 'sex', 'cap_g', 'cap_l', 'hrs', 'native',
                'incom']
    mydf = pd.read_csv(filename, header=None, names=colnames)
    mydf.dropna(inplace=True)
    mydf['adclass'] = mydf['incom'].apply(is_rich)
    ymat = mydf['adclass'].values
    xmat = np.zeros((len(ymat), 11))
    # reindex_key[i] gives the xmat column for colnames[i]; -1 = skipped
    reindex_key = [0, -1, 1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1]
    continuous_feat_list = ['age', 'flnwgt', 'edu_num', 'cap_g',
                            'cap_l', 'hrs']
    for i in range(len(colnames) - 1):
        ikey = reindex_key[i]
        if colnames[i] == 'wc':
            pass
        elif colnames[i] == 'educ':
            xmat[:, ikey] = mydf['educ'].apply(edu_type)
        elif colnames[i] == 'marital':
            xmat[:, ikey] = mydf['marital'].apply(marital_status)
        elif colnames[i] == 'occup':
            pass
        elif colnames[i] == 'relat':
            xmat[:, ikey] = mydf['relat'].apply(to_fami_size)
        elif colnames[i] == 'race':
            xmat[:, ikey] = mydf['race'].apply(race_type)
        elif colnames[i] == 'sex':
            xmat[:, ikey] = mydf['sex'].apply(sex_type)
        elif colnames[i] == 'native':
            pass
        elif colnames[i] in continuous_feat_list:
            xmat[:, ikey] = mydf[colnames[i]].values

    return xmat, ymat
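# For reference (comment added for clarity, derived from reindex_key above),
# the 11 columns of the returned xmat are laid out as:
#   0: age, 1: flnwgt, 2: educ, 3: edu_num, 4: marital, 5: relat,
#   6: race, 7: sex, 8: cap_g, 9: cap_l, 10: hrs
# ('wc', 'occup' and 'native' carry a -1 in reindex_key and are skipped).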
def setup_data(xmat, ymat):
    # store the data into 'shared' variables to be accessible by Theano
    def shared_dataset(xm, ym, borrow=True):
        shared_x = theano.shared(np.asarray(xm, dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(np.asarray(ym, dtype=theano.config.floatX),
                                 borrow=borrow)
        return shared_x, shared_y

    def data_shuffle(xm, ym, siz):
        # shuffle samples and labels with one shared permutation
        idv = np.random.permutation(siz)
        return xm[idv], ym[idv]

    total_len = ymat.shape[0]
    n_features = int(np.size(xmat) / total_len)
    # Random Shuffle
    xmat, ymat = data_shuffle(xmat, ymat, total_len)
    xret, yret = shared_dataset(xmat.reshape((total_len, n_features)), ymat)
    return xret, yret
# Hidden Layer
class HiddenLayer(object):
    def __init__(self, input, n_in, n_out):
        self.input = input
        w_h = theano.shared(floatX(np.random.standard_normal([n_in, n_out]))
                            * 0.05)
        b_h = theano.shared(floatX(np.zeros(n_out)))
        self.w = w_h
        self.b = b_h
        self.params = [self.w, self.b]

    def output(self):
        # sigmoid activation; return the expression rather than rebinding
        # self.output, which would shadow this method after the first call
        linarg = T.dot(self.input, self.w) + self.b
        return T.nnet.sigmoid(linarg)
# Read-out Layer
class ReadOutLayerBin(object):
    def __init__(self, input, n_in, n_out):
        self.input = input
        w_o = theano.shared(floatX(np.random.standard_normal([n_in, n_out]))
                            * 0.05)
        b_o = theano.shared(floatX(np.zeros(n_out)))
        self.w = w_o
        self.b = b_o
        self.params = [self.w, self.b]

    def output(self):
        linarg = T.dot(self.input, self.w) + self.b
        return T.nnet.sigmoid(linarg)
# Optimizers - GradientDescent, AdaGrad
class Optimizer(object):
    def __init__(self, params, learning_rate=0.01):
        self.lr = learning_rate
        self.params = params

    def minimize(self, loss):
        # was "for param in params", which silently read a global variable
        self.gradparams = [T.grad(loss, param) for param in self.params]

    def update_learning_rate(self, learning_rate):
        self.lr = learning_rate


class GradientDescentOptimizer(Optimizer):
    def __init__(self, params, learning_rate=0.01):
        super(GradientDescentOptimizer, self).__init__(params, learning_rate)

    def minimize(self, loss):
        super(GradientDescentOptimizer, self).minimize(loss)
        updates = [(param_i, param_i - self.lr * grad_i)
                   for param_i, grad_i in zip(self.params, self.gradparams)]
        return updates

    def update_learning_rate(self, l_rate):
        super(GradientDescentOptimizer, self).update_learning_rate(l_rate)
        updates = [(param_i, param_i - self.lr * grad_i)
                   for param_i, grad_i in zip(self.params, self.gradparams)]
        return updates
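# The header above mentions AdaGrad, but only gradient descent is
# implemented. A minimal sketch of how an AdaGrad variant could reuse the
# same base class; the class name and the eps parameter are illustrative
# additions, not part of the original gist.
class AdagradOptimizer(Optimizer):
    def __init__(self, params, learning_rate=0.01, eps=1.e-6):
        super(AdagradOptimizer, self).__init__(params, learning_rate)
        self.eps = eps
        # one accumulator of squared gradients per parameter
        self.accums = [theano.shared(floatX(np.zeros(p.get_value().shape)))
                       for p in params]

    def minimize(self, loss):
        super(AdagradOptimizer, self).minimize(loss)
        updates = []
        for p, g, a in zip(self.params, self.gradparams, self.accums):
            a_new = a + g ** 2   # running sum of squared gradients
            updates.append((a, a_new))
            # per-parameter step size shrinks as gradients accumulate
            updates.append((p, p - self.lr * g / T.sqrt(a_new + self.eps)))
        return updates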
if __name__ == '__main__':
    np.random.seed(seed=20160204)
    trX, trY = load_data('adult.data')
    teX, teY = load_data('adult.test')
    trXs, trYs = setup_data(trX, trY)
    teXs, teYs = setup_data(teX, teY)

    # Declare Theano symbolic variables
    index = T.lscalar()   # index to a [mini]batch
    x = T.matrix('x')
    y_ = T.vector('y')

    # Define MLP network structure
    h_layer1 = HiddenLayer(input=x, n_in=11, n_out=22)
    h_layer2 = HiddenLayer(input=h_layer1.output(), n_in=22, n_out=20)
    o_layer = ReadOutLayerBin(input=h_layer2.output(), n_in=20, n_out=1)
    params = h_layer1.params + h_layer2.params + o_layer.params

    # Cost function: binary cross-entropy basic term
    hypo = (o_layer.output()).flatten()
    prediction = hypo > 0.5
    iy_ = T.cast(y_, dtype='int32')
    accur = T.mean(T.eq(prediction, iy_))
    xent = -y_ * T.log(hypo) - (1 - y_) * T.log(1 - hypo)
    # Regularization terms (weight decay)
    L2_sqr = ((h_layer1.w ** 2).sum()
              + (h_layer2.w ** 2).sum()
              + (o_layer.w ** 2).sum())
    cost = xent.mean() + 0.01 * L2_sqr

    # Train
    myoptimizer = GradientDescentOptimizer(params, learning_rate=0.01)
    one_update = myoptimizer.minimize(cost)
    #############################################
    batch_size = 50
    #############################################
    # Compile
    train_model = theano.function(
        inputs=[index],
        outputs=[cost, accur],
        updates=one_update,
        givens=[(x, trXs[index * batch_size:(index + 1) * batch_size]),
                (y_, trYs[index * batch_size:(index + 1) * batch_size])],
        allow_input_downcast=True
    )
    accuracy = theano.function(
        inputs=[],
        outputs=accur,
        givens=[(x, teXs), (y_, teYs)],
        allow_input_downcast=True
    )
    # Train (Optimization)
    start_time = timeit.default_timer()
    n_epochs = 50
    epoch = 0
    n_train_batches = int(trY.shape[0] / batch_size)

    while epoch < n_epochs:
        epoch += 1
        for mini_batch_index in range(n_train_batches):
            # accur_j avoids shadowing the symbolic accur compiled above
            cost_j, accur_j = train_model(mini_batch_index)
        print('epoch[%3d] : cost =%8.4f' % (epoch, cost_j))

    elapsed_time = timeit.default_timer() - start_time
    print('Elapsed time: %10.3f [s]' % elapsed_time)
    last_accur = accuracy()
    print('Accuracy = %10.3f ' % last_accur)
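Since the training set already lives on the device as shared variables, training-set accuracy can be reported the same way, e.g. to compare against the test accuracy as an overfitting check. A minimal sketch (train_accuracy is a hypothetical addition, not in the gist):

train_accuracy = theano.function(
    inputs=[],
    outputs=accur,
    givens=[(x, trXs), (y_, trYs)],
    allow_input_downcast=True
)
print('Training accuracy = %10.3f' % train_accuracy())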