Adult Dataset (from the UCI Machine Learning Repository) classification: 1. a Logistic Regression model, 2. an MLP (Multi-layer Perceptron) model.
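Both scripts expect the raw UCI files in the working directory: adult.data for training and, in the second script, adult.test for evaluation. A minimal download sketch, assuming the standard UCI repository URL (this helper is an addition, not part of the gist):

# fetch_adult.py -- helper sketch, not part of the original gist
try:
    from urllib.request import urlretrieve    # Python 3
except ImportError:
    from urllib import urlretrieve            # Python 2

BASE = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/'
for fname in ('adult.data', 'adult.test'):
    urlretrieve(BASE + fname, fname)
    print('downloaded %s' % fname)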
#
# adult_LR_classifier.py   date. 10/17/2015
# SGD (Stochastic Gradient Descent) version
# considers 3 features of the dataset
#
import numpy as np
import numpy.random as rng
import matplotlib.pyplot as plt
import scipy as sp
import scipy.optimize as spo
import pandas as pd
import timeit
import theano
import theano.tensor as T
def load_data():
    def to_float(i):
        return float(i)

    def is_rich(labelstr):
        if '>50K' in labelstr:
            res = 1.0
        elif '<=50K' in labelstr:
            res = 0.0
        else:
            res = -1.0
        return res

    def to_fami_size(fami_str):
        fami_str = str.lstrip(fami_str)
        if fami_str in ('Wife', 'Husband'):
            res = 2.0
        elif fami_str == 'Own-child':
            res = 4.0
        elif fami_str == 'Other-relative':
            res = 2.0
        elif fami_str in ('Not-in-family', 'Unmarried'):
            res = 1.0
        else:
            res = 0.0
        return res

    colnames = ['age', 'wc', 'dmy1', 'educ', 'edu_num', 'marital', 'occup',
                'relat', 'race', 'sex', 'cap_g', 'cap_l', 'hrs', 'native',
                'incom']
    mydf = pd.read_csv('adult.data', header=None, names=colnames)
    mydf.dropna(inplace=True)
    xmat = np.column_stack((mydf['edu_num'].apply(to_float).values,
                            mydf['relat'].apply(to_fami_size).values,
                            mydf['hrs'].apply(to_float).values))
    ymat = mydf['incom'].apply(is_rich).values

    return xmat, ymat    # shapes: xmat [m, 3], ymat [m, ]
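# Example usage (shapes are assumptions based on the standard adult.data
# file, which has roughly 32,000 rows):
#   xmat, ymat = load_data()
#   xmat.shape  -> (m, 3)   # education-num, family size, hours-per-week
#   ymat.shape  -> (m,)     # 1.0 for '>50K', 0.0 for '<=50K'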
def setup_data(xmat, ymat):
    # store the data into 'shared' variables to be accessible by Theano
    def shared_dataset(xm, ym, borrow=True):
        shared_x = theano.shared(np.asarray(xm, dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(np.asarray(ym, dtype=theano.config.floatX),
                                 borrow=borrow)

        return shared_x, shared_y

    def data_shuffle(xm, ym, siz):
        # apply one common random permutation to xm and ym
        idv = np.arange(siz)
        np.random.shuffle(idv)
        x_new = xm[idv]
        y_new = ym[idv]
        return x_new, y_new

    total_len = ymat.shape[0]
    n_features = int(np.size(xmat) / total_len)
    # Random Shuffle
    xmat, ymat = data_shuffle(xmat, ymat, total_len)
    train_len = int(total_len * 0.7)
    test_len = total_len - train_len

    xtr, ytr = shared_dataset(
        (xmat[:train_len]).reshape(train_len, n_features),
        ymat[:train_len])
    xte, yte = shared_dataset(
        (xmat[train_len:]).reshape(test_len, n_features),
        ymat[train_len:])

    rval = [(xtr, ytr), (xte, yte)]
    return rval
if __name__ == "__main__":
    np.random.seed(20151017)

    xmat, ymat = load_data()
    datasets = setup_data(xmat, ymat)
    xtr, ytr = datasets[0]    # training set (70%)
    xte, yte = datasets[1]    # test set (30%)

    # Declare Theano symbolic variables
    xtr_nrow, xtr_ncol = (xtr.get_value()).shape
    index = T.lscalar()       # index to a [mini]batch
    learning_rate = T.scalar()
    x = T.matrix('x')
    y = T.vector('y')
    w = theano.shared(np.zeros(xtr_ncol), name='w')    # w, b <- all zero
    b = theano.shared(0., name='b')

    print ' Initial model: '
    wi = w.get_value()
    bi = b.get_value()
    np.set_printoptions(precision=4)
    print 'w : ', wi, 'b : ', bi

    myp = T.nnet.sigmoid(T.dot(x, w) + b)
    prediction = myp > 0.5                 # from theano tutorial
    xent = T.nnet.binary_crossentropy(myp, y)
    cost = xent.mean() + 0.01 * (w ** 2).sum()    # L2 regularization
    gw, gb = T.grad(cost, [w, b])
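    # SGD update applied by train_model (one mini-batch per call):
    #   w <- w - learning_rate * d(cost)/dw
    #   b <- b - learning_rate * d(cost)/db
    # with cost = mean binary cross-entropy + 0.01 * ||w||^2 (L2 penalty)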
    #############################################
    batch_size = 50    # mini-batch size (a plain Python int, fixed at compile time)
    #############################################

    # Compile
    train_model = theano.function(
        inputs=[index, learning_rate],
        outputs=[cost, prediction],
        updates=((w, w - learning_rate * gw), (b, b - learning_rate * gb)),
        givens=[(x, xtr[index * batch_size:(index + 1) * batch_size]),
                (y, ytr[index * batch_size:(index + 1) * batch_size])],
        allow_input_downcast=True
    )
    predict = theano.function(
        inputs=[],
        outputs=prediction,
        givens=[(x, xte)],
        allow_input_downcast=True
    )
    # Train (Optimization)
    start_time = timeit.default_timer()
    n_epochs = 50
    epoch = 0
    lrate_base = 0.01
    lrate_coef = 20
    n_train_batches = int(ytr.get_value(borrow=True).shape[0] / batch_size)

    while (epoch < n_epochs):
        epoch += 1
        for mini_batch_index in range(n_train_batches):
            # decaying schedule: l_rate = base * coef / (epoch + coef)
            l_rate = lrate_base * lrate_coef / (epoch + lrate_coef)
            cost_j, pred = train_model(mini_batch_index, l_rate)
        print 'epoch[%3d] : cost =%f ' % (epoch, cost_j)

    # Print result
    print '\n Final model: '
    wf = w.get_value()
    bf = b.get_value()
    np.set_printoptions(precision=4)
    print 'w : ', wf, 'b : ', bf
    print 'Elapsed time: %10.3f [s]' % (timeit.default_timer() - start_time)

    # Test-set accuracy
    mypred = (predict()).flatten()
    iv_yte = (yte.get_value()).astype(int)
    accu = (mypred == iv_yte).astype(int)
    accu = accu.sum() * 1.0 / iv_yte.shape[0]
    print 'accuracy = %12.4f ' % accu
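For a quick sanity check of the learned model, the same three-feature data can be fit with scikit-learn's logistic regression. A minimal comparison sketch, assuming scikit-learn is installed and load_data() from the script above is importable (this is an addition, not part of the gist):

# compare_sklearn.py -- sketch, not part of the original gist
from sklearn.linear_model import LogisticRegression

xmat, ymat = load_data()          # 3 features: edu_num, family size, hours
split = int(len(ymat) * 0.7)      # same 70/30 split as setup_data()
clf = LogisticRegression()
clf.fit(xmat[:split], ymat[:split])
print('sklearn accuracy: %.4f' % clf.score(xmat[split:], ymat[split:]))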
#
# adult_data_classifier.py date. 2/4/2016
#
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import numpy as np
import numpy.random as rng
# import matplotlib.pyplot as plt
import pandas as pd
import timeit
import theano
import theano.tensor as T
def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)
def load_data(filename='adult.data'):
    '''
    Load the "Adult" data set: 14 features and 1 label. The features shown
    in parentheses below are not used by this script, leaving 11.
    Features:
      1. age: continuous.
     (2.) workclass: Private, Self-emp-not-inc, Self-emp-inc, ...
      3. fnlwgt: continuous.
      4. education: Bachelors, Some-college, 11th, HS-grad, Prof-school,
         Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th,
         Doctorate, 5th-6th, Preschool.
      5. education-num: continuous.
      6. marital-status: Married-civ-spouse, Divorced, Never-married,
         Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
     (7.) occupation: Tech-support, Craft-repair, Other-service, ...
      8. relationship: Wife, Own-child, Husband, Not-in-family,
         Other-relative, Unmarried.
      9. race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
     10. sex: Female, Male.
     11. capital-gain: continuous.
     12. capital-loss: continuous.
     13. hours-per-week: continuous.
    (14.) native-country: United-States, Cambodia, England, ...
    Label:
        >50K, <=50K.
    '''
    def to_float(i):
        # to process continuous data (including integers)
        return float(i)

    def is_rich(labelstr):
        # to process the label
        if '>50K' in labelstr:
            res = 1.0
        elif '<=50K' in labelstr:
            res = 0.0
        else:
            res = -1.    # set error code
        return res

    def edu_type(edu_str):
        # to process the 'education' feature
        edu_type_names = ['Bachelors', 'Some-college', '11th', 'HS-grad',
                          'Prof-school', 'Assoc-acdm', 'Assoc-voc', '9th',
                          '7th-8th', '12th', 'Masters', '1st-4th', '10th',
                          'Doctorate', '5th-6th', 'Preschool']
        edu_str = edu_str.strip(' ')
        try:
            res = edu_type_names.index(edu_str)
        except ValueError:
            res = -1
        res = float(res)
        return res

    def marital_status(mar_str):
        # to process the 'marital-status' feature
        mar_type_names = ['Married-civ-spouse', 'Divorced', 'Never-married',
                          'Separated', 'Widowed', 'Married-spouse-absent',
                          'Married-AF-spouse']
        mar_str = mar_str.strip(' ')
        try:
            res = mar_type_names.index(mar_str)
        except ValueError:
            res = -1
        res = float(res)
        return res

    def to_fami_size(fami_str):
        # to process the 'relationship' feature
        fami_str = fami_str.strip(' ')
        if fami_str in ('Wife', 'Husband'):
            res = 2.0
        elif fami_str == 'Own-child':
            res = 4.0
        elif fami_str == 'Other-relative':
            res = 2.0
        elif fami_str in ('Not-in-family', 'Unmarried'):
            res = 1.0
        else:
            res = -1.    # set error code
        return res

    def race_type(race_str):
        race_names = ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo',
                      'Other', 'Black']
        race_str = race_str.strip(' ')
        try:
            res = race_names.index(race_str)
        except ValueError:
            res = -1
        res = float(res)
        return res

    def sex_type(sex_str):
        sex_names = ['Female', 'Male']
        sex_str = sex_str.strip(' ')
        try:
            res = sex_names.index(sex_str)
        except ValueError:
            res = -1
        res = float(res)
        return res
    colnames = ['age', 'wc', 'flnwgt', 'educ', 'edu_num', 'marital', 'occup',
                'relat', 'race', 'sex', 'cap_g', 'cap_l', 'hrs', 'native',
                'incom']
    mydf = pd.read_csv(filename, header=None, names=colnames)
    mydf.dropna(inplace=True)
    mydf['adclass'] = mydf['incom'].apply(is_rich)
    ymat = mydf['adclass'].values

    # map each source column to its column index in xmat (-1 = not used)
    xmat = np.zeros((len(ymat), 11))
    reindex_key = [0, -1, 1, 2, 3, 4, -1, 5, 6, 7, 8, 9, 10, -1]
    continuous_feat_list = ['age', 'flnwgt', 'edu_num', 'cap_g',
                            'cap_l', 'hrs']
    for i in range(len(colnames) - 1):
        ikey = reindex_key[i]
        if colnames[i] == 'wc':
            pass
        elif colnames[i] == 'educ':
            xmat[:, ikey] = mydf['educ'].apply(edu_type)
        elif colnames[i] == 'marital':
            xmat[:, ikey] = mydf['marital'].apply(marital_status)
        elif colnames[i] == 'occup':
            pass
        elif colnames[i] == 'relat':
            xmat[:, ikey] = mydf['relat'].apply(to_fami_size)
        elif colnames[i] == 'race':
            xmat[:, ikey] = mydf['race'].apply(race_type)
        elif colnames[i] == 'sex':
            xmat[:, ikey] = mydf['sex'].apply(sex_type)
        elif colnames[i] == 'native':
            pass
        elif colnames[i] in continuous_feat_list:
            xmat[:, ikey] = mydf[(colnames[i])].values

    return xmat, ymat
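# With reindex_key above, the 11 columns of xmat are, in order:
#   age, fnlwgt, education, education-num, marital-status, relationship,
#   race, sex, capital-gain, capital-loss, hours-per-week
# (workclass, occupation and native-country are skipped: ikey == -1)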
def setup_data(xmat, ymat):
    # store the data into 'shared' variables to be accessible by Theano
    def shared_dataset(xm, ym, borrow=True):
        shared_x = theano.shared(np.asarray(xm, dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(np.asarray(ym, dtype=theano.config.floatX),
                                 borrow=borrow)

        return shared_x, shared_y

    def data_shuffle(xm, ym, siz):
        # apply one common random permutation to xm and ym
        idv = np.arange(siz)
        np.random.shuffle(idv)
        x_new = xm[idv]
        y_new = ym[idv]
        return x_new, y_new

    total_len = ymat.shape[0]
    n_features = int(np.size(xmat) / total_len)
    # Random Shuffle
    xmat, ymat = data_shuffle(xmat, ymat, total_len)

    xret, yret = shared_dataset(xmat.reshape((total_len, n_features)), ymat)
    return xret, yret
# Hidden Layer
class HiddenLayer(object):
    def __init__(self, input, n_in, n_out):
        self.input = input
        w_h = theano.shared(floatX(np.random.standard_normal([n_in, n_out]))
                            * 0.05)
        b_h = theano.shared(floatX(np.zeros(n_out)))
        self.w = w_h
        self.b = b_h
        self.params = [self.w, self.b]

    def output(self):
        # note: this rebinds self.output from method to expression, so each
        # layer's output() is called exactly once when the graph is built
        linarg = T.dot(self.input, self.w) + self.b
        self.output = T.nnet.sigmoid(linarg)
        return self.output
# Read-out Layer
class ReadOutLayerBin(object):
    def __init__(self, input, n_in, n_out):
        self.input = input
        w_o = theano.shared(floatX(np.random.standard_normal([n_in, n_out]))
                            * 0.05)
        b_o = theano.shared(floatX(np.zeros(n_out)))
        self.w = w_o
        self.b = b_o
        self.params = [self.w, self.b]

    def output(self):
        linarg = T.dot(self.input, self.w) + self.b
        self.output = T.nnet.sigmoid(linarg)
        return self.output
# Optimizers - GradientDescent, AdaGrad
class Optimizer(object):
    def __init__(self, params, learning_rate=0.01):
        self.lr = learning_rate
        self.params = params

    def minimize(self, loss):
        self.gradparams = [T.grad(loss, param) for param in self.params]

    def update_learning_rate(self, learning_rate):
        self.lr = learning_rate


class GradientDescentOptimizer(Optimizer):
    def __init__(self, params, learning_rate=0.01):
        super(GradientDescentOptimizer, self).__init__(params, learning_rate)

    def minimize(self, loss):
        super(GradientDescentOptimizer, self).minimize(loss)
        updates = [
            (param_i, param_i - self.lr * grad_i)
            for param_i, grad_i in zip(self.params, self.gradparams)
        ]
        return updates

    def update_learning_rate(self, l_rate):
        super(GradientDescentOptimizer, self).update_learning_rate(l_rate)
        updates = [
            (param_i, param_i - self.lr * grad_i)
            for param_i, grad_i in zip(self.params, self.gradparams)
        ]
        return updates
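# The comment above mentions AdaGrad, but only plain gradient descent is
# implemented. A minimal AdaGrad variant in the same style might look like
# this (a sketch, not part of the original gist):
class AdaGradOptimizer(Optimizer):
    def __init__(self, params, learning_rate=0.01, eps=1.e-6):
        super(AdaGradOptimizer, self).__init__(params, learning_rate)
        self.eps = eps
        # one accumulator of squared gradients per parameter
        self.accugrads = [theano.shared(floatX(
            np.zeros(p.get_value(borrow=True).shape))) for p in params]

    def minimize(self, loss):
        super(AdaGradOptimizer, self).minimize(loss)
        updates = []
        for accu_i, param_i, grad_i in zip(self.accugrads, self.params,
                                           self.gradparams):
            accu_new = accu_i + grad_i ** 2
            updates.append((accu_i, accu_new))
            updates.append((param_i,
                            param_i - self.lr * grad_i
                            / T.sqrt(accu_new + self.eps)))
        return updates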
if __name__ == '__main__':
    np.random.seed(seed=20160204)

    trX, trY = load_data('adult.data')
    teX, teY = load_data('adult.test')
    trXs, trYs = setup_data(trX, trY)
    teXs, teYs = setup_data(teX, teY)

    # Declare Theano symbolic variables
    index = T.lscalar()
    x = T.matrix('x')
    y_ = T.vector('y')

    # Define MLP network structure
    h_layer1 = HiddenLayer(input=x, n_in=11, n_out=22)
    h_layer2 = HiddenLayer(input=h_layer1.output(), n_in=22, n_out=20)
    o_layer = ReadOutLayerBin(input=h_layer2.output(), n_in=20, n_out=1)
    params = h_layer1.params + h_layer2.params + o_layer.params

    # Cost Function basic term
    hypo = (o_layer.output()).flatten()
    prediction = hypo > 0.5
    iy_ = T.cast(y_, dtype='int32')
    accur = T.mean(T.eq(prediction, iy_))
    xent = -y_ * T.log(hypo) - (1 - y_) * T.log(1 - hypo)
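    # elementwise binary cross-entropy; equivalent to
    # T.nnet.binary_crossentropy(hypo, y_) used in the first script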
    # Regularization terms (weight decay)
    L2_sqr = ((h_layer1.w ** 2).sum()
              + (h_layer2.w ** 2).sum()
              + (o_layer.w ** 2).sum())
    cost = xent.mean() + 0.01 * L2_sqr

    # Train
    myoptimizer = GradientDescentOptimizer(params, learning_rate=0.01)
    one_update = myoptimizer.minimize(cost)

    #############################################
    batch_size = 50    # mini-batch size (a plain Python int)
    #############################################

    # Compile
    train_model = theano.function(
        inputs=[index],
        outputs=[cost, accur],
        updates=one_update,
        givens=[(x, trXs[index * batch_size:(index + 1) * batch_size]),
                (y_, trYs[index * batch_size:(index + 1) * batch_size])],
        allow_input_downcast=True
    )
    accuracy = theano.function(
        inputs=[],
        outputs=accur,
        givens=[(x, teXs), (y_, teYs)],
        allow_input_downcast=True
    )

    # Train (Optimization)
    start_time = timeit.default_timer()
    n_epochs = 50
    epoch = 0
    n_train_batches = int(trY.shape[0] / batch_size)

    while (epoch < n_epochs):
        epoch += 1
        for mini_batch_index in range(n_train_batches):
            # accur_j avoids shadowing the symbolic 'accur' defined above
            cost_j, accur_j = train_model(mini_batch_index)
        print('epoch[%3d] : cost =%8.4f' % (epoch, cost_j))

    elapsed_time = timeit.default_timer() - start_time
    print('Elapsed time: %10.3f [s]' % elapsed_time)

    # Evaluate on the held-out test set
    last_accur = accuracy()
    print('Accuracy = %10.3f ' % last_accur)
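One practical note: both scripts feed the features to the models on their raw scales, and sigmoid units usually train more easily on standardized inputs. A standardization helper that could be applied to xmat before setup_data (a sketch, not part of the gist):

def standardize(xmat):
    # zero-mean, unit-variance columns; guard against constant columns
    mu = xmat.mean(axis=0)
    sigma = xmat.std(axis=0)
    sigma[sigma == 0.] = 1.
    return (xmat - mu) / sigma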