Random projection -> binary hashing
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
from tensorflow.examples.tutorials.mnist import input_data
from sklearn.utils import shuffle
import numpy as np
import os
tf.app.flags.DEFINE_string('logdir', '/tmp/test', 'location for saved embeddings')
tf.app.flags.DEFINE_string('datadir', '/tmp/mnist', 'location for data')
tf.app.flags.DEFINE_integer('batchsize', 500, 'batch size.')
tf.app.flags.DEFINE_integer('bits', 10, 'num of bits')
FLAGS = tf.app.flags.FLAGS
# TODO how to optimise the weights to give us evenly sized bins?
# TODO save visualiser/embedding and port into (my) website?
# https://arxiv.org/abs/1511.05212
# https://arxiv.org/abs/1701.02815
def batch(ims, labels, batchsize):
    # yield minibatches covering the first 10000 examples of a shuffled copy of the data
    ims, labels = shuffle(ims, labels, random_state=0)
    for i in range(10000//batchsize):
        yield (i, ims[i*batchsize:(i+1)*batchsize, ...],
               labels[i*batchsize:(i+1)*batchsize, ...])
# TODO. how can we 'design' W to have properties a priori?
# - use orthogonal random values? (a sketch of this follows below)
# - use a tensor decomposition to help compress the number of variables?
# - could even quantise for more compression?
# class TT():
#     def __init__(self, ?):
#         self.weights = [tf.get_variable(shape=[a, b, c], name='W_'+str(i)) for i in range(10)]
#
#     def matmul(self, x):
#         # because everything is linear we can do the multiplication without
#         # constructing the tensor
#         for w in self.weights:
#             ?
#
#     def construct():
#         pass
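
# One of the TODO ideas above, sketched out: make the columns of the random
# projection orthogonal by taking the Q factor of a QR decomposition of a
# Gaussian matrix. This is an illustrative sketch, not part of the original
# gist; `orthogonal_projection` is a hypothetical helper name.
def orthogonal_projection(n_in, n_bits, seed=0):
    """Return an [n_in, n_bits] float32 matrix with orthonormal columns."""
    rng = np.random.RandomState(seed)
    gaussian = rng.randn(n_in, n_bits)
    q, _ = np.linalg.qr(gaussian)  # reduced QR: q is [n_in, n_bits] with orthonormal columns
    return q.astype(np.float32)
# It could replace the default-initialised W in simhash below, e.g.
#   W = tf.get_variable(name='W', initializer=orthogonal_projection(784, FLAGS.bits))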
def simhash(x):
    # project onto FLAGS.bits random hyperplanes and keep only the sign of each projection
    W = tf.get_variable(shape=[784, FLAGS.bits], name='W')
    # TODO use TT decomposition and do mat.vec mul in TT format
    # (for very big inputs or large feature spaces)
    # what if feature space ~= num of datapoints. -> onehot encoding
    # this is memory? but learnable? how much work do we need to do to
    # even up the buckets? will need to regularise for sparsity?
    # TODO Because this is so cheap we could fit to entire dataset?
    # want:
    # - equal sized buckets
    # - local neighborhood preserved (see the check sketched below)
    # - ?
    z = tf.matmul(x, W)
    return tf.cast(tf.greater(z, 0), tf.float32)
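
# The sign-of-random-projection hash above is the classic random-hyperplane
# SimHash: for two inputs, each bit disagrees with probability
# angle(x, y) / pi, so the normalised Hamming distance between codes
# estimates angular (cosine) distance. A rough numpy check of that claim
# (a sketch, not part of the original gist; `check_simhash_property` is a
# hypothetical helper):
def check_simhash_property(n_bits=10000, dim=784, seed=0):
    rng = np.random.RandomState(seed)
    x, y = rng.randn(dim), rng.randn(dim)
    angle = np.arccos(np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))
    W = rng.randn(dim, n_bits)
    hx, hy = x.dot(W) > 0, y.dot(W) > 0
    print('angle/pi: {:.3f}  normalised hamming: {:.3f}'.format(
        angle / np.pi, np.mean(hx != hy)))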
# TODO what are some other super simple algorithms?
# - hash a single global feature into multiple buckets, e.g. avg pixel (see the sketch below)
# -
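
# A sketch of the 'avg pixel' idea above (hypothetical, not in the original
# gist): bucket each image by its mean intensity and return a one-hot code
# over the buckets, so images with similar overall brightness collide.
def mean_pixel_hash(x, n_buckets=16):
    """x: [batch, 784] floats in [0, 1]. Returns [batch, n_buckets] one-hot codes."""
    mean = tf.reduce_mean(x, axis=1)                     # [batch]
    bucket = tf.cast(tf.floor(mean * n_buckets), tf.int32)
    bucket = tf.clip_by_value(bucket, 0, n_buckets - 1)  # guard the mean == 1.0 edge case
    return tf.one_hot(bucket, depth=n_buckets)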
def get_embeddings(ims, labels):
    # preserves cosine distance
    x = tf.placeholder(shape=[None, 784], dtype=tf.float32)
    y = tf.placeholder(shape=[None], dtype=tf.int32)
    h = simhash(x)

    H = []; L = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i, batch_ims, batch_labels in batch(ims, labels, FLAGS.batchsize):
            print('step {}'.format(i), end='', flush=True)
            H.append(sess.run(h, feed_dict={x: batch_ims, y: batch_labels}))
            L.append(batch_labels)
    return np.vstack(H), np.vstack(L).reshape((10000))
def save_embeddings(embeddings, labels):
"""
Args:
embeddings: A numpy array of shape (10000, features) and type float32.
labels: a numpy array of int32's. (10000,)
"""
tf.reset_default_graph()
with tf.Session() as sess:
embed_var = tf.Variable(embeddings, name='embeddings')
sess.run(embed_var.initializer)
saver = tf.train.Saver(var_list=[embed_var])
os.makedirs(FLAGS.logdir, exist_ok=True)
fname = saver.save(sess, os.path.join(FLAGS.logdir, 'model.ckpt'),
write_meta_graph=False)
print('saved to {}'.format(fname))
print(' {}'.format(embed_var.get_shape()))
summary_writer = tf.train.SummaryWriter(FLAGS.logdir)
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = embed_var.name
embedding.metadata_path = os.path.join(FLAGS.logdir, 'metadata.tsv')
# summary_writer.add_graph(sess.graph)
projector.visualize_embeddings(summary_writer, config)
# write labels.
with open(os.path.join(FLAGS.logdir, 'metadata.tsv'), 'w') as metadata_file:
metadata_file.write('Name\tClass\n')
print('labels', labels.shape)
for i, L in enumerate(labels):
metadata_file.write('%06d\t%s\n' % (i, L))
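
# Once the checkpoint, projector config and metadata.tsv are written, the
# embedding can be browsed interactively with the TensorBoard projector, e.g.
#   tensorboard --logdir /tmp/test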
def main(_):
    print('Get data')
    mnist = input_data.read_data_sets(FLAGS.datadir, one_hot=False)
    ims = np.reshape(mnist.train.images, [-1, 28*28]).astype(np.float32)
    labels = np.reshape(mnist.train.labels, [-1]).astype(np.int32)

    print('Get embeddings')
    embeddings, associated_labels = get_embeddings(ims, labels)

    print('Save embeddings')
    save_embeddings(embeddings, associated_labels)
if __name__ == '__main__':
    tf.app.run()