Random projection -> binary hashing
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
from tensorflow.examples.tutorials.mnist import input_data
from sklearn.utils import shuffle
import numpy as np
import os

tf.app.flags.DEFINE_string('logdir', '/tmp/test', 'location for saved embeddings')
tf.app.flags.DEFINE_string('datadir', '/tmp/mnist', 'location for data')
tf.app.flags.DEFINE_integer('batchsize', 500, 'batch size.')
tf.app.flags.DEFINE_integer('bits', 10, 'num of bits')
FLAGS = tf.app.flags.FLAGS

# TODO how to optimise the weights to give us evenly sized bins?
# TODO save visualiser/embedding and port into (my) website?
# https://arxiv.org/abs/1511.05212
# https://arxiv.org/abs/1701.02815
def batch(ims, labels, batchsize):
    # yields shuffled minibatches; only the first 10000 shuffled examples are
    # used, matching the (10000, ...) shapes assumed downstream
    ims, labels = shuffle(ims, labels, random_state=0)
    for i in range(10000//batchsize):
        yield (i, ims[i*batchsize:(i+1)*batchsize, ...],
               labels[i*batchsize:(i+1)*batchsize, ...])
# TODO. how can we 'design' W to have properties a priori?
# - use orthogonal random values? (see the sketch below)
# - use a tensor decomposition to help compress the number of variables?
# - could even quantise for more compression?

# class TT():
#     def __init__(self, ?):
#         self.weights = [tf.get_variable(shape=[a, b, c], name='W_'+str(i)) for i in range(10)]
#
#     def matmul(self, x):
#         # because everything is linear we can do the multiplication without
#         # constructing the tensor
#         for w in self.weights:
#             ?
#
#     def construct():
#         pass
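# A minimal sketch (not used by the pipeline below) of the 'orthogonal random
# values' idea from the TODO above: draw a Gaussian matrix and orthonormalise
# its columns with a QR decomposition, so the projection directions are exactly
# orthogonal rather than only approximately so. The function name and the
# numpy-based construction are assumptions, not part of the original gist.
def orthogonal_projection(dim_in, bits, seed=0):
    # assumes bits <= dim_in so the reduced QR has orthonormal columns
    rng = np.random.RandomState(seed)
    gaussian = rng.randn(dim_in, bits)
    q, _ = np.linalg.qr(gaussian)  # q has shape (dim_in, bits)
    return q.astype(np.float32)
# e.g. a fixed (non-learnable) projection: W = tf.constant(orthogonal_projection(784, FLAGS.bits))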
def simhash(x):
    W = tf.get_variable(shape=[784, FLAGS.bits], name='W')
    # TODO use TT decomposition and do mat.vec mul in TT format
    # (for very big inputs or large feature spaces)
    # what if feature space ~= num of datapoints. -> onehot encoding
    # this is memory? but learnable? how much work do we need to do to
    # even up the buckets? will need to regularise for sparsity?
    # TODO Because this is so cheap we could fit to entire dataset?
    # want;
    # - equal sized buckets
    # - local neighborhood preserved
    # - ?
    z = tf.matmul(x, W)
    return tf.cast(tf.greater(z, 0), tf.float32)
# TODO what are some other super simple algorithms?
# - hash a single global feature into multiple buckets. e.g. avg pixel
# -
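# A quick numpy sanity check (a sketch, separate from the TF pipeline): with
# random Gaussian hyperplanes, the probability that a single simhash bit
# differs between two vectors is angle(u, v) / pi, so the normalised Hamming
# distance between codes estimates the angle. The function name is an
# assumption; it is not called anywhere in this gist.
def simhash_sanity_check(bits=2048, dim=784, seed=0):
    rng = np.random.RandomState(seed)
    W = rng.randn(dim, bits)
    u, v = rng.randn(dim), rng.randn(dim)
    code_u, code_v = (u.dot(W) > 0), (v.dot(W) > 0)
    hamming = np.mean(code_u != code_v)
    angle = np.arccos(u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v)))
    print('normalised hamming {:.3f} vs angle/pi {:.3f}'.format(hamming, angle / np.pi))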
def get_embeddings(ims, labels):
    # preserves cosine distance
    x = tf.placeholder(shape=[None, 784], dtype=tf.float32)
    y = tf.placeholder(shape=[None], dtype=tf.int32)
    h = simhash(x)

    H = []; L = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i, batch_ims, batch_labels in batch(ims, labels, FLAGS.batchsize):
            print('step {}'.format(i), end='', flush=True)
            H.append(sess.run(h, feed_dict={x: batch_ims, y: batch_labels}))
            L.append(batch_labels)
    return np.vstack(H), np.vstack(L).reshape((10000))
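# A hypothetical diagnostic (not called anywhere in this gist) for the TODO
# above about evenly sized bins: treat each row of bits as an integer bucket
# id and report how evenly the 2**FLAGS.bits buckets are occupied.
def bucket_histogram(embeddings):
    powers = 2 ** np.arange(embeddings.shape[1])
    bucket_ids = embeddings.astype(np.int64).dot(powers)
    _, counts = np.unique(bucket_ids, return_counts=True)
    print('{} non-empty buckets; sizes min {} / mean {:.1f} / max {}'.format(
        len(counts), counts.min(), counts.mean(), counts.max()))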
def save_embeddings(embeddings, labels):
    """
    Args:
        embeddings: A numpy array of shape (10000, features) and type float32.
        labels: a numpy array of int32's. (10000,)
    """
    tf.reset_default_graph()
    with tf.Session() as sess:
        embed_var = tf.Variable(embeddings, name='embeddings')
        sess.run(embed_var.initializer)
        saver = tf.train.Saver(var_list=[embed_var])
        os.makedirs(FLAGS.logdir, exist_ok=True)
        fname = saver.save(sess, os.path.join(FLAGS.logdir, 'model.ckpt'),
                           write_meta_graph=False)
        print('saved to {}'.format(fname))
        print(' {}'.format(embed_var.get_shape()))

        summary_writer = tf.summary.FileWriter(FLAGS.logdir)
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = embed_var.name
        embedding.metadata_path = os.path.join(FLAGS.logdir, 'metadata.tsv')
        # summary_writer.add_graph(sess.graph)
        projector.visualize_embeddings(summary_writer, config)

    # write labels.
    with open(os.path.join(FLAGS.logdir, 'metadata.tsv'), 'w') as metadata_file:
        metadata_file.write('Name\tClass\n')
        print('labels', labels.shape)
        for i, L in enumerate(labels):
            metadata_file.write('%06d\t%s\n' % (i, L))
def main(_):
    print('Get data')
    mnist = input_data.read_data_sets(FLAGS.datadir, one_hot=False)
    ims = np.reshape(mnist.train.images, [-1, 28*28]).astype(np.float32)
    labels = np.reshape(mnist.train.labels, [-1]).astype(np.int32)

    print('Get embeddings')
    embeddings, associated_labels = get_embeddings(ims, labels)

    print('Save embeddings')
    save_embeddings(embeddings, associated_labels)


if __name__ == '__main__':
    tf.app.run()
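# Example usage (the filename and flag values below are illustrative, not
# from the gist; any flags not passed fall back to the defaults above):
#   python simhash_embeddings.py --bits=16 --batchsize=500 --logdir=/tmp/test
# then inspect the hashed MNIST embeddings in the TensorBoard projector:
#   tensorboard --logdir=/tmp/test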