Random projection -> binary hashing
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
from tensorflow.examples.tutorials.mnist import input_data
from sklearn.utils import shuffle
import numpy as np
import os

tf.app.flags.DEFINE_string('logdir', '/tmp/test', 'location for saved embeddings')
tf.app.flags.DEFINE_string('datadir', '/tmp/mnist', 'location for data')
tf.app.flags.DEFINE_integer('batchsize', 500, 'batch size.')
tf.app.flags.DEFINE_integer('bits', 10, 'num of bits')
FLAGS = tf.app.flags.FLAGS

# TODO how to optimise the weights to give us evenly sized bins?
# TODO save visualiser/embedding and port into (my) website?
# https://arxiv.org/abs/1511.05212
# https://arxiv.org/abs/1701.02815
def batch(ims, labels, batchsize):
    # yields shuffled minibatches; only the first 10000 shuffled examples are
    # used, matching the (10000, ...) shapes assumed downstream
    ims, labels = shuffle(ims, labels, random_state=0)
    for i in range(10000//batchsize):
        yield (i, ims[i*batchsize:(i+1)*batchsize, ...],
               labels[i*batchsize:(i+1)*batchsize, ...])
# TODO. how can we 'design' W to have properties a priori?
# - use orthogonal random values? (see the sketch below)
# - use a tensor decomposition to help compress the number of variables?
# - could even quantise for more compression?

# class TT():
#     def __init__(self, ?):
#         self.weights = [tf.get_variable(shape=[a, b, c], name='W_'+str(i)) for i in range(10)]
#
#     def matmul(self, x):
#         # because everything is linear we can do the multiplication without
#         # constructing the tensor
#         for w in self.weights:
#             ?
#
#     def construct():
#         pass
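# A minimal sketch (not used by the pipeline below) of the 'orthogonal random
# values' idea from the TODO above: draw a Gaussian matrix and orthonormalise
# its columns with a QR decomposition, so the projection directions are exactly
# orthogonal rather than only approximately so. The function name and the
# numpy-based construction are assumptions, not part of the original gist.
def orthogonal_projection(dim_in, bits, seed=0):
    # assumes bits <= dim_in so the reduced QR has orthonormal columns
    rng = np.random.RandomState(seed)
    gaussian = rng.randn(dim_in, bits)
    q, _ = np.linalg.qr(gaussian)  # q has shape (dim_in, bits)
    return q.astype(np.float32)
# e.g. a fixed (non-learnable) projection: W = tf.constant(orthogonal_projection(784, FLAGS.bits))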
def simhash(x):
    W = tf.get_variable(shape=[784, FLAGS.bits], name='W')
    # TODO use TT decomposition and do mat.vec mul in TT format
    # (for very big inputs or large feature spaces)
    # what if feature space ~= num of datapoints. -> onehot encoding
    # this is memory? but learnable? how much work do we need to do to
    # even up the buckets? will need to regularise for sparsity?
    # TODO Because this is so cheap we could fit to entire dataset?
    # want;
    # - equal sized buckets
    # - local neighborhood preserved
    # - ?
    z = tf.matmul(x, W)
    return tf.cast(tf.greater(z, 0), tf.float32)
# TODO what are some other super simple algorithms?
# - hash a single global feature into multiple buckets. e.g. avg pixel
# -
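# A quick numpy sanity check (a sketch, separate from the TF pipeline): with
# random Gaussian hyperplanes, the probability that a single simhash bit
# differs between two vectors is angle(u, v) / pi, so the normalised Hamming
# distance between codes estimates the angle. The function name is an
# assumption; it is not called anywhere in this gist.
def simhash_sanity_check(bits=2048, dim=784, seed=0):
    rng = np.random.RandomState(seed)
    W = rng.randn(dim, bits)
    u, v = rng.randn(dim), rng.randn(dim)
    code_u, code_v = (u.dot(W) > 0), (v.dot(W) > 0)
    hamming = np.mean(code_u != code_v)
    angle = np.arccos(u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v)))
    print('normalised hamming {:.3f} vs angle/pi {:.3f}'.format(hamming, angle / np.pi))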
def get_embeddings(ims, labels):
    # preserves cosine distance
    x = tf.placeholder(shape=[None, 784], dtype=tf.float32)
    y = tf.placeholder(shape=[None], dtype=tf.int32)
    h = simhash(x)

    H = []; L = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i, batch_ims, batch_labels in batch(ims, labels, FLAGS.batchsize):
            print('step {}'.format(i), end='', flush=True)
            H.append(sess.run(h, feed_dict={x: batch_ims, y: batch_labels}))
            L.append(batch_labels)
    return np.vstack(H), np.vstack(L).reshape((10000))
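# A hypothetical diagnostic (not called anywhere in this gist) for the TODO
# above about evenly sized bins: treat each row of bits as an integer bucket
# id and report how evenly the 2**FLAGS.bits buckets are occupied.
def bucket_histogram(embeddings):
    powers = 2 ** np.arange(embeddings.shape[1])
    bucket_ids = embeddings.astype(np.int64).dot(powers)
    _, counts = np.unique(bucket_ids, return_counts=True)
    print('{} non-empty buckets; sizes min {} / mean {:.1f} / max {}'.format(
        len(counts), counts.min(), counts.mean(), counts.max()))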
def save_embeddings(embeddings, labels):
    """
    Args:
        embeddings: A numpy array of shape (10000, features) and type float32.
        labels: a numpy array of int32's. (10000,)
    """
    tf.reset_default_graph()
    with tf.Session() as sess:
        embed_var = tf.Variable(embeddings, name='embeddings')
        sess.run(embed_var.initializer)
        saver = tf.train.Saver(var_list=[embed_var])
        os.makedirs(FLAGS.logdir, exist_ok=True)
        fname = saver.save(sess, os.path.join(FLAGS.logdir, 'model.ckpt'),
                           write_meta_graph=False)
        print('saved to {}'.format(fname))
        print(' {}'.format(embed_var.get_shape()))

        summary_writer = tf.summary.FileWriter(FLAGS.logdir)
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = embed_var.name
        embedding.metadata_path = os.path.join(FLAGS.logdir, 'metadata.tsv')
        # summary_writer.add_graph(sess.graph)
        projector.visualize_embeddings(summary_writer, config)

    # write labels.
    with open(os.path.join(FLAGS.logdir, 'metadata.tsv'), 'w') as metadata_file:
        metadata_file.write('Name\tClass\n')
        print('labels', labels.shape)
        for i, L in enumerate(labels):
            metadata_file.write('%06d\t%s\n' % (i, L))
def main(_):
    print('Get data')
    mnist = input_data.read_data_sets(FLAGS.datadir, one_hot=False)
    ims = np.reshape(mnist.train.images, [-1, 28*28]).astype(np.float32)
    labels = np.reshape(mnist.train.labels, [-1]).astype(np.int32)

    print('Get embeddings')
    embeddings, associated_labels = get_embeddings(ims, labels)

    print('Save embeddings')
    save_embeddings(embeddings, associated_labels)


if __name__ == '__main__':
    tf.app.run()
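# Example usage (the filename and flag values below are illustrative, not
# from the gist; any flags not passed fall back to the defaults above):
#   python simhash_embeddings.py --bits=16 --batchsize=500 --logdir=/tmp/test
# then inspect the hashed MNIST embeddings in the TensorBoard projector:
#   tensorboard --logdir=/tmp/test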