Implementation of DenseNet: Densely Connected Convolutional Networks https://arxiv.org/abs/1608.06993 in tensorflow
#!/usr/bin/python
from __future__ import print_function
import os
import numpy as np
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets

# mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
mnist = read_data_sets("/tmp/data/", one_hot=True)

force_gpu = False
debug = False  # histogram_summary ...
# _cpu = '/cpu:0'
default_learning_rate = 0.01
decay_step = 3000
decay_size = 0.95
# dropout = 0.6
dropout = None  # None to disable dropout, non-zero number to enable dropout and set the keep rate
batch_size = 64
_cpu = '/cpu:0'
tensorboard_logs = '/tmp/tensorboard-logs/'
# $(sleep 5; open http://0.0.0.0:6006) & tensorboard --debug --logdir=/tmp/tensorboard-logs/
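
# NOTE: closest_unitary() is referenced by net.dense() below but is not defined in the
# original gist. A minimal sketch (an assumption, not the author's code): project the
# matrix onto the nearest unitary/orthogonal matrix via its singular value decomposition.
def closest_unitary(A):
    # Polar decomposition: A = V * S * Wh; dropping S gives the closest unitary matrix.
    V, _, Wh = np.linalg.svd(A)
    # Cast to float32 so the resulting tf.Variable matches the float32 graph.
    return np.dot(V, Wh).astype(np.float32)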
class net():
    def input_width(self, data):
        return 28 * 28

    def __init__(self, model, data, name=0, learning_rate=default_learning_rate, batch_size=batch_size):
        # device = '/GPU:0' if gpu else '/cpu:0'
        # device = None  # auto
        # print("Using device ", device)
        # with tf.device(device):
        if True:
            self.session = sess = session = tf.Session()
            # self.session = sess = session = tf.Session(config=tf.ConfigProto(log_device_placement=True))
            self.model = model
            self.data = data  # assigned to self.x=net.input via train
            self.batch_size = batch_size
            self.layers = []
            self.last_width = self.input_width(data)
            self.learning_rate = learning_rate
            # if not name: name = model.__name__
            # if name and os.path.exists(name):
            #     return self.load_model(name)
            self.generate_model(model)

    def generate_model(self, model, name=''):
        if not model: return self
        with tf.name_scope('state'):
            self.keep_prob = tf.placeholder(tf.float32)  # 1 for testing! else keep rate for dropout
            self.train_phase = tf.placeholder(tf.bool, name='train_phase')
            self.global_step = tf.Variable(0)  # don't set or feed global_step; the optimizer increments it via minimize()
        with tf.name_scope('data'):
            n_input = 28 * 28
            n_classes = 10
            self.x = x = self.input = tf.placeholder(tf.float32, [None, n_input])
            self.last_layer = x
            self.y = y = self.target = tf.placeholder(tf.float32, [None, n_classes])
            if not force_gpu: tf.image_summary("mnist", tf.reshape(self.x, [-1, 28, 28, 1], "mnist_images"))
        with tf.name_scope('model'):
            model(self)
            if self.last_width != n_classes: self.classifier()  # 10 classes auto

    def add(self, layer):
        self.layers.append(layer)
        self.last_layer = layer
        self.last_shape = layer.get_shape()

    def reshape(self, shape):
        self.last_layer = tf.reshape(self.last_layer, shape)
        self.last_shape = shape
        self.last_width = shape[-1]

    def batchnorm(self):
        from tensorflow.contrib.layers.python.layers import batch_norm as batch_norm
        with tf.name_scope('batchnorm') as scope:
            input = self.last_layer
            # mean, var = tf.nn.moments(input, axes=[0, 1, 2])
            # self.batch_norm = tf.nn.batch_normalization(input, mean, var, offset=1, scale=1, variance_epsilon=1e-6)
            # self.last_layer = self.batch_norm
            train_op = batch_norm(input, is_training=True, center=False, updates_collections=None, scope=scope)
            test_op = batch_norm(input, is_training=False, updates_collections=None, center=False, scope=scope, reuse=True)
            self.add(tf.cond(self.train_phase, lambda: train_op, lambda: test_op))

    # Fully connected layer
    def dense(self, hidden=1024, depth=1, act=tf.nn.tanh, dropout=False, parent=-1):
        if parent == -1: parent = self.last_layer
        shape = self.last_layer.get_shape()
        if shape and len(shape) > 2:
            self.last_width = int(shape[1] * shape[2] * shape[3])
            print("reshaping ", shape, "to", self.last_width)
            parent = tf.reshape(parent, [-1, self.last_width])
        width = hidden
        while depth > 0:
            with tf.name_scope('Dense_{:d}'.format(hidden)) as scope:
                print("Dense ", self.last_width, width)
                nr = len(self.layers)
                if self.last_width == width:
                    U = closest_unitary(np.random.rand(self.last_width, width) / (self.last_width + width))
                    weights = tf.Variable(U, name="weights_dense_" + str(nr))
                else:
                    weights = tf.Variable(tf.random_uniform([self.last_width, width], minval=-1. / width, maxval=1. / width), name="weights_dense")
                bias = tf.Variable(tf.random_uniform([width], minval=-1. / width, maxval=1. / width), name="bias_dense")
                dense1 = tf.matmul(parent, weights, name='dense_' + str(nr)) + bias
                tf.histogram_summary('dense_' + str(nr), dense1)
                tf.histogram_summary('weights_' + str(nr), weights)
                tf.histogram_summary('bias_' + str(nr), bias)
                tf.histogram_summary('dense_' + str(nr) + '/sparsity', tf.nn.zero_fraction(dense1))
                tf.histogram_summary('weights_' + str(nr) + '/sparsity', tf.nn.zero_fraction(weights))
                if act: dense1 = act(dense1)
                # if norm: dense1 = self.norm(dense1, lsize=1)  # SHAPE!
                if dropout: dense1 = tf.nn.dropout(dense1, self.keep_prob)
                self.layers.append(dense1)
                self.last_layer = parent = dense1
                self.last_width = width
            depth = depth - 1
        self.last_shape = [-1, width]  # dense

    # Convolution layer
    def conv(self, shape, act=tf.nn.relu, pool=True, dropout=False, norm=True, name=None):  # True; why is dropout bad in tensorflow??
        with tf.name_scope('conv'):
            print("input shape ", self.last_shape)
            print("conv shape ", shape)
            width = shape[-1]
            # filters = tf.Variable(tf.random_uniform(shape, minval=-1. / width, maxval=1. / width), name="filters")
            filters = tf.Variable(tf.random_normal(shape))  # positive weights help with image classification
            _bias = tf.Variable(tf.random_normal([shape[-1]]))
            conv1 = tf.nn.bias_add(tf.nn.conv2d(self.last_layer, filter=filters, strides=[1, 1, 1, 1], padding='SAME'), _bias)
            if debug: tf.histogram_summary('conv_' + str(len(self.layers)), conv1)
            if act: conv1 = act(conv1)
            if pool: conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
            if norm: conv1 = tf.nn.lrn(conv1, depth_radius=4, bias=1.0, alpha=0.001 / 9.0, beta=0.75)
            if debug: tf.histogram_summary('norm_' + str(len(self.layers)), conv1)
            if dropout: conv1 = tf.nn.dropout(conv1, self.keep_prob)
            print("output shape ", conv1.get_shape())
            self.add(conv1)

    def classifier(self, classes=10):
        """ Define loss and optimizer """
        with tf.name_scope('prediction'):  # prediction
            if self.last_width != classes:
                # print("Automatically adding dense prediction")
                self.dense(hidden=classes, act=False, dropout=False)
        with tf.name_scope('classifier'):
            y_ = self.target
            manual_cost_formula = False  # True
            if manual_cost_formula:
                # prediction = y = self.last_layer = tf.nn.softmax(self.last_layer)
                # self.cost = cross_entropy = -tf.reduce_sum(y_ * tf.log(y + 1e-10))  # against NaN!
                prediction = y = tf.nn.log_softmax(self.last_layer)
                self.cost = cross_entropy = -tf.reduce_sum(y_ * y)
            elif classes > 100:
                print("using sampled_softmax_loss")
                y = prediction = self.last_layer
                self.cost = tf.reduce_mean(tf.nn.sampled_softmax_loss(y, y_))  # for big vocab
            else:
                y = prediction = self.last_layer
                self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y, y_))  # prediction, target
            with tf.device(_cpu): tf.scalar_summary('cost', self.cost)
            # self.cost = tf.Print(self.cost, [self.cost], "debug cost : ")
            # learning_scheme = self.learning_rate
            learning_scheme = tf.train.exponential_decay(self.learning_rate, self.global_step, decay_step, decay_size)
            # pass global_step so it is incremented each step and the decay actually takes effect
            self.optimizer = tf.train.AdamOptimizer(learning_scheme).minimize(self.cost, global_step=self.global_step)
            # Evaluate model
            correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(self.target, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
            if not force_gpu: tf.scalar_summary('accuracy', self.accuracy)
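
    # DenseNet composite layer: BatchNorm -> 3x3 conv producing nOutChannels (= growthRate) new
    # feature maps, which are concatenated with the layer's input along the channel axis, so
    # every layer in a block sees the feature maps of all preceding layers.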
    def addLayer(self, nChannels, nOutChannels, dropout):
        ident = self.last_layer
        self.batchnorm()
        # self.add(tf.nn.relu(ident))  # nChannels ?
        self.conv([3, 3, nChannels, nOutChannels], pool=False, dropout=dropout, norm=tf.nn.relu)  # None
        concat = tf.concat(3, [ident, self.last_layer])
        print("concat ", concat.get_shape())
        self.add(concat)
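
    # Transition layer between dense blocks: BatchNorm -> ReLU -> 1x1 conv (here keeping the
    # channel count), followed by 2x2 max pooling to halve the spatial resolution.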
    def addTransition(self, nChannels, nOutChannels, dropout):
        self.batchnorm()
        self.add(tf.nn.relu(self.last_layer))
        self.conv([1, 1, nChannels, nOutChannels], pool=True, dropout=dropout, norm=None)  # pool (2, 2)
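
    # Builds the full DenseNet for 28x28 MNIST: an initial 3x3 conv, then three dense blocks of
    # N layers each (depth = 3N + 4), separated by transition layers; each layer adds growthRate
    # channels. Ends with BatchNorm -> ReLU -> pooling before the dense classifier.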
    def buildDenseConv(self):
        depth = 3 * 1 + 4
        if (depth - 4) % 3: raise Exception("Depth must be 3N + 4! (4, 7, 10, ...)")  # layers in each denseblock
        N = (depth - 4) // 3
        # channels before entering the first denseblock
        # set it to be comparable with the growth rate?
        nChannels = 16
        growthRate = 12
        self.conv([3, 3, 1, nChannels])  # prepare 16 filters with 3x3 view -> 28x28 just as input

        for i in range(N):  # 1st block
            self.addLayer(nChannels, growthRate, dropout)
            nChannels = nChannels + growthRate
        self.addTransition(nChannels, nChannels, dropout)

        for i in range(N):  # 2nd block
            self.addLayer(nChannels, growthRate, dropout)
            nChannels = nChannels + growthRate
        self.addTransition(nChannels, nChannels, dropout)

        for i in range(N):  # 3rd block
            self.addLayer(nChannels, growthRate, dropout)
            nChannels = nChannels + growthRate
        # no transition, but densely connected layers.

        self.batchnorm()
        self.add(tf.nn.relu(self.last_layer))
        # self.add(tf.nn.max_pool(self.last_layer, ksize=[1, 2, 2, 1], strides=[1, 1, 1, 1], padding='SAME'))
        # self.reshape([-1, nChannels * 4 * 4])
        self.add(tf.nn.max_pool(self.last_layer, ksize=[1, 4, 4, 1], strides=[1, 2, 2, 1], padding='SAME'))
        self.reshape([-1, nChannels * 4])

    def next_batch(self, batch_size=10):
        return self.data.train.next_batch(batch_size)

    def train(self, steps=-1, dropout=None, display_step=10, test_step=200):  # epochs=-1,
        steps = 9999999 if steps == -1 else steps
        session = self.session
        # with tf.device(_cpu):
        # import tensorflow.contrib.layers as layers
        # t = tf.verify_tensor_all_finite(t, msg)
        tf.add_check_numerics_ops()
        self.summaries = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(tensorboard_logs, session.graph)
        if not dropout: dropout = 1.  # keep all
        x = self.x
        y = self.y
        keep_prob = self.keep_prob
        session.run([tf.initialize_all_variables()])
        step = 1  # show first
        while step < steps:
            # print("step %d \r" % step, end=' ')
            batch_xs, batch_ys = self.next_batch(self.batch_size)
            # tf.train.shuffle_batch_join(example_list, batch_size, capacity=min_queue_size + batch_size * 16, min_queue_size)
            # Fit training using batch data
            feed_dict = {x: batch_xs, y: batch_ys, keep_prob: dropout, self.train_phase: True}
            loss, _ = session.run([self.cost, self.optimizer], feed_dict=feed_dict)
            if step % test_step == 0: self.test(step)
            if step % display_step == 0:
                # Calculate batch accuracy and loss
                feed = {x: batch_xs, y: batch_ys, keep_prob: 1., self.train_phase: False}
                acc = session.run(self.accuracy, feed_dict=feed)
                # acc, summary = session.run([self.accuracy, self.summaries], feed_dict=feed)
                # self.summary_writer.add_summary(summary, step)  # only test summaries for smoother curve
                print("\rStep {:d} Loss= {:.6f} Accuracy= {:.3f}".format(step, loss, acc), end=' ')
                if str(loss) == "nan": return print("\nLoss gradient explosion, exiting!!!")  # restore!
            step += 1
        print("\nOptimization Finished!")
        self.test(step, number=10000)  # final test

    def inputs(self, data):
        self.inputs, self.labels = load_data()  # load_data() is not defined in this gist; this helper is unused

    def test(self, step, number=400):  # 256
        session = sess = self.session
        run_metadata = tf.RunMetadata()
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # Calculate accuracy for the first `number` mnist test images
        test_labels = self.data.test.labels[:number]
        test_images = self.data.test.images[:number]
        feed_dict = {self.x: test_images, self.y: test_labels, self.keep_prob: 1., self.train_phase: False}
        accuracy, summary = self.session.run([self.accuracy, self.summaries], feed_dict=feed_dict)
        # accuracy, summary = session.run([self.accuracy, self.summaries], feed_dict, run_options, run_metadata)
        print('\t' * 3 + "Test Accuracy:", accuracy)
        # self.summary_writer.add_run_metadata(run_metadata, 'step #%03d' % step)
        self.summary_writer.add_summary(summary, global_step=step)


def dense(net):  # best with lr ~0.001
    # type: (layer.net) -> None
    # net.batchnorm()  # start lower, else no effect
    # net.dense(400, act=None)  # ~95%; we can do better:
    net.dense(400, act=tf.nn.tanh)  # 0.996 YAY, only 0.985 on full set, Step 5000 flat
    return  # 0.957 without any model!!


def alex(net):
    # type: (layer.net) -> None
    print("Building Alex-net")
    net.reshape(shape=[-1, 28, 28, 1])  # Reshape input pictures
    # net.batchnorm()
    net.conv([3, 3, 1, 64])
    net.conv([3, 3, 64, 128])
    net.conv([3, 3, 128, 256])
    net.dense(1024, act=tf.nn.relu)
    net.dense(1024, act=tf.nn.relu)
    # OH, it does converge!!


def denseConv(net):
    # type: (layer.net) -> None
    print("Building dense-net")
    net.reshape(shape=[-1, 28, 28, 1])  # Reshape input picture
    # net.batchnorm()
    # net.conv([3, 3, 1, 64])
    net.buildDenseConv()
    net.classifier()  # 10 classes auto


# _net = net(dense, data=mnist, learning_rate=0.01)  # 'mnist' baseline
# _net = net(alex, data=mnist, learning_rate=0.001)  # 'mnist'
_net = net(model=denseConv, data=mnist, learning_rate=0.001)
# _net.train(50000, dropout=keep_rate, display_step=1, test_step=1)  # debug
_net.train(50000, dropout=dropout, display_step=1, test_step=20)  # gpu