neural network
from __future__ import division
import numpy as np
import chainer  # only used to load the MNIST data in this script
def softmax(z):
    # z is a vector
    return np.exp(z) / np.sum(np.exp(z))

def sigmoid(x):
    # x can be a vector
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_gradient(x):
    # x can be a vector
    return sigmoid(x) * (1 - sigmoid(x))

def ReLU(x):
    # x can be a vector
    return np.maximum(x, 0)

def ReLU_gradient(x):
    # x can be a vector
    return 1.0 * (x > 0)
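# A few identities worth keeping in mind (sanity notes added here, not in the original gist):
#   softmax(z) returns a probability vector: its entries are positive and sum to 1
#   sigmoid_gradient uses the identity sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))
#   ReLU_gradient is the indicator 1[x > 0] (the derivative at x = 0 is taken as 0 here)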
class myNeuralNetwork(object):
    def __init__(self, unitsPerHiddenLayer=[3], outputFunc="softmax", hiddenLayerFunc="ReLU", learningRate=0.1, epochs=3):
        '''
        Currently, only a single hidden layer is implemented
        '''
        self.hiddenLayerNum = len(unitsPerHiddenLayer)
        self.unitsPerHiddenLayer = unitsPerHiddenLayer

        if outputFunc == "softmax":
            self.outputFunc = softmax
        else:
            raise ValueError("not implemented")

        if hiddenLayerFunc == "sigmoid":
            self.hiddenLayerFunc = sigmoid
            self.hiddenLayerFunc_gradient = sigmoid_gradient
        elif hiddenLayerFunc == "ReLU":
            self.hiddenLayerFunc = ReLU
            self.hiddenLayerFunc_gradient = ReLU_gradient
        else:
            raise ValueError("not implemented")

        self.LR = learningRate
        self.epochs = epochs

        self.w = []  # w[0] and b[0] map the data (a[0]) to z[0]; w[1] and b[1] map a[1] to z[1]
        self.b = []
        self.a = []  # a[0] is the input data (the output of the input layer), a[1] is the output of the first hidden layer
        self.z = []  # z[0] is the pre-activation input of the first hidden layer
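    # Parameter shapes for one hidden layer with L units, input dimension d and k classes
    # (summary added for readability, not in the original gist; see fit() below):
    #   w[0]: (L, d)   b[0]: (L,)   -> hidden layer
    #   w[1]: (k, L)   b[1]: (k,)   -> output layer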
    def fit(self, X, Y):
        d = X.shape[1]   # data feature dimension
        n = X.shape[0]   # number of data samples
        k = len(set(Y))  # number of classes

        # initialize the parameters
        # input layer
        self.a.append(np.zeros(d))

        # hidden layers
        for i in range(self.hiddenLayerNum):
            L = self.unitsPerHiddenLayer[i]
            self.w.append((np.random.rand(L, d) - 0.5) / 3.0)  # -0.5 shifts the initial weights to [-0.5, 0.5]; /3 shrinks them further
            self.b.append((np.random.rand(L) - 0.5) / 3.0)
            self.z.append(np.zeros((L, 1)))
            self.a.append(np.zeros((L, 1)))

        # output layer
        self.w.append((np.random.rand(k, L) - 0.5) / 3.0)
        self.b.append((np.random.rand(k) - 0.5) / 3.0)
        self.z.append(np.zeros(k))

        # start fitting using SGD
        for e in range(self.epochs):
            sampleIndex = np.random.choice(n, n, replace=False)  # visit the samples in a random order each epoch
            for j in sampleIndex:
                f = self.feedForward(X[j])
                delta = self.backPropagation(f, X[j], Y[j])
                # update the parameters (delta holds the negative gradients, so += is a descent step)
                for i in range(self.hiddenLayerNum + 1):  # +1 to also update the output layer
                    self.w[i] += self.LR * delta["w"][i]
                    self.b[i] += self.LR * delta["b"][i]
    def feedForward(self, x):
        # feed-forward pass to evaluate the network on one sample
        self.a[0] = x.T
        for i in range(self.hiddenLayerNum):
            self.z[i] = np.dot(self.w[i], self.a[i]) + self.b[i]  # w[0] shape: (L, d); a[0] shape: (d,) for the first hidden layer
            self.a[i + 1] = self.hiddenLayerFunc(self.z[i])
        self.z[-1] = np.dot(self.w[-1], self.a[-1]) + self.b[-1]  # hidden-layer output to the output layer
        output = self.outputFunc(self.z[-1])  # probability of each class
        return output
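    # Forward pass in equations (notation summary added here, not in the original gist):
    #   z[0] = w[0] a[0] + b[0],   a[1] = g(z[0]),   z[1] = w[1] a[1] + b[1],   f = softmax(z[1])
    # where g is the hidden-layer activation (sigmoid or ReLU) and a[0] = x is the input sample.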
    def backPropagation(self, f, X, y):
        '''
        Back-propagation for one sample, used with stochastic gradient descent.
        Currently only written for a single hidden layer.
        # f is the output of the softmax (a probability vector)
        # X is the sample
        # y is the label, an integer in 0, 1, ..., k-1
        '''
        delta3 = -f  # shape (k,)
        delta3[y] = delta3[y] + 1  # delta3 = onehot(y) - f
        deltaW2 = np.dot(delta3.reshape(-1, 1), self.a[1].reshape(1, -1))  # outer product: (k, 1) dot (1, L) -> (k, L)
        deltab2 = delta3
        delta2 = np.dot(self.w[1].T, delta3) * self.hiddenLayerFunc_gradient(self.z[0])  # shape (L,)
        deltaW1 = np.dot(delta2.reshape(-1, 1), X.reshape(1, -1))
        deltab1 = delta2
        return {"w": [deltaW1, deltaW2], "b": [deltab1, deltab2]}
    def predict(self, X):
        return np.array([np.argmax(self.feedForward(sample)) for sample in X])

    def performanceEval(self, testX, testY):
        return sum(self.predict(testX) == testY) / len(testY)
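# Example usage of the class (illustrative sketch based on test() below, not in the original gist):
#   nn = myNeuralNetwork(unitsPerHiddenLayer=[50], hiddenLayerFunc="ReLU", learningRate=0.05, epochs=3)
#   nn.fit(X=trainX, Y=trainY)          # trainX: (n, d) float array, trainY: (n,) integer labels 0..k-1
#   accuracy = nn.performanceEval(testX, testY)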
def test(uN, f, LR, epochs):
    print("start")
    nn = myNeuralNetwork(unitsPerHiddenLayer=[uN], hiddenLayerFunc=f, learningRate=LR, epochs=epochs)
    train, test = chainer.datasets.get_mnist()
    # train[i][0] is the image, train[i][1] is the label
    trainX = np.array([sample[0].flatten() for sample in train])
    trainY = np.array([sample[1] for sample in train])
    testX = np.array([sample[0].flatten() for sample in test])
    testY = np.array([sample[1] for sample in test])
    nn.fit(X=trainX, Y=trainY)
    print("units Num: %d, hidden layer func: %s, learning rate: %f, epochs: %d, accuracy: %f" % (uN, f, LR, epochs, nn.performanceEval(testX, testY)))
def main():
    # test(50, "sigmoid", 0.07, 1)
    # test(50, "sigmoid", 0.07, 5)
    # test(100, "sigmoid", 0.07, 1)
    # test(100, "sigmoid", 0.07, 5)
    # test(200, "sigmoid", 0.07, 1)
    # test(200, "sigmoid", 0.07, 5)
    # test(50, "ReLU", 0.07, 1)
    # test(50, "ReLU", 0.07, 5)
    # test(100, "ReLU", 0.07, 1)
    # test(100, "ReLU", 0.07, 5)
    # test(200, "ReLU", 0.07, 1)
    # test(200, "ReLU", 0.07, 5)
    # test(50, "sigmoid", 0.05, 5)
    # test(50, "sigmoid", 0.1, 5)
    # test(50, "sigmoid", 0.5, 5)
    # test(50, "ReLU", 0.05, 5)
    # test(50, "ReLU", 0.1, 5)
    # test(50, "ReLU", 0.5, 5)
    # test(50, "ReLU", 0.01, 5)
    test(50, "sigmoid", 0.01, 5)
    test(10, "sigmoid", 0.01, 5)

if __name__ == '__main__':
    main()