Example of two equivalent methods of embedding categorical features on the channel dimension
""" | |
Pekka Aalto 2017 | |
This snippet tries to explain by example what deepmind means | |
in https://arxiv.org/abs/1708.04782 | |
about embedding on channel axis being equivalent to | |
one-hot-encoding followed by 1x1 conv. | |
They write: | |
"We embed all feature layers containing categorical values | |
into a continuous space which is equivalent to using | |
a one-hot encoding in the channel dimension | |
followed by a 1 ⇥ 1 convolution." | |
However, there are still two possible ways to do this. Either | |
1) concatenating | |
or | |
2) summing | |
the embeddings of each feature on channel dimension. | |
As mentioned in the deepmind-paper, both can be done as | |
- embedding lookup | |
or | |
- one-hot -> 1x1 | |
However, we still don't know if they eventually concatenated or summed the embeddings. | |
""" | |
import tensorflow as tf
from tensorflow.contrib import layers
import numpy as np
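
# Why the equivalence holds at a single pixel: multiplying a one-hot row
# by a weight matrix W of shape [n_categories, embedding_dim] selects one
# row of W, which is exactly an embedding lookup. A 1x1 convolution applies
# the same matrix multiply independently at every (h, w) position.
# A minimal numpy sketch of that identity (the names below are illustrative):
_W = np.random.rand(5, 4)   # 5 categories, embedding dim 4
_cat = 3                    # some category id
_one_hot = np.eye(5)[_cat]  # one-hot row vector
assert np.allclose(np.dot(_one_hot, _W), _W[_cat])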
#### CONSTANTS
embedding_dim = 8
H = W = 32
n_cat_by_feature = [3, 10, 2]  # 3 categorical features, e.g. player_relative, unit_type, is_creep

# used to give every method identical initial weights for each category
initial_emb_weights = [np.random.rand(n, embedding_dim) for n in n_cat_by_feature]

# the actual features
features = [
    tf.placeholder(shape=[H, W], dtype="int32", name="feat%d" % i)
    for i, _ in enumerate(n_cat_by_feature)
]
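# note: there is no batch dimension here; each placeholder is a single
# [H, W] grid of integer category ids, which is enough for this comparison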
# 1.1) embed on channel -> concat on channel
embedded1 = []
for f, n, w in zip(features, n_cat_by_feature, initial_emb_weights):
    e = layers.embed_sequence(
        f,
        vocab_size=n,
        embed_dim=embedding_dim,
        initializer=tf.constant_initializer(w)
    )
    embedded1.append(e)
out11 = tf.concat(embedded1, axis=2)
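# out11 shape: [H, W, len(n_cat_by_feature) * embedding_dim] = [32, 32, 24]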
# 1.2) one-hot on channel -> 1x1 conv separately -> concat on channel
embedded2 = []
for f, n, w in zip(features, n_cat_by_feature, initial_emb_weights):
    one_hot = layers.one_hot_encoding(f, num_classes=n)
    conv_out = layers.conv2d(
        inputs=one_hot,
        num_outputs=embedding_dim,
        weights_initializer=tf.constant_initializer(w),
        kernel_size=1,
        stride=1,
        activation_fn=None  # the default relu would break the equivalence for weights with negative entries
    )
    embedded2.append(conv_out)
out12 = tf.concat(embedded2, axis=2)
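# out12 has the same shape as out11 and, with identical constant weights,
# zero biases and no activation, matches it numerically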
# 2.1) sum embeddings on channel instead of concatenating
out21 = tf.add_n(embedded1)
# 2.2) one-hot on channel -> concat on channel -> 1x1 conv
one_hotted_features = tf.concat([
    layers.one_hot_encoding(f, num_classes=n)
    for f, n in zip(features, n_cat_by_feature)
], axis=2)
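# stacking the per-feature weight matrices along the input-channel axis
# makes a single 1x1 conv compute W1[f1] + W2[f2] + W3[f3] in one matmul:
# the concatenated one-hot vector selects one row from each stacked block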
out22 = layers.conv2d(
    inputs=one_hotted_features,
    num_outputs=embedding_dim,
    weights_initializer=tf.constant_initializer(np.concatenate(initial_emb_weights)),
    kernel_size=1,
    stride=1,
    activation_fn=None  # keep the conv purely linear, as in 1.2)
)
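# out21 and out22 shape: [H, W, embedding_dim] = [32, 32, 8]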
print(out11)
print(out12)
print(out21)
print(out22)
# let's try it:
feed_dict = {
    ph: np.random.randint(n, size=(H, W)) for ph, n in zip(features, n_cat_by_feature)
}
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
a11, a12, a21, a22 = sess.run([out11, out12, out21, out22], feed_dict=feed_dict)
# the results are indeed the same
assert np.all(np.abs(a11 - a12) < 1e-6)
assert np.all(np.abs(a21 - a22) < 1e-6)
# and the results are not trivially zero
assert np.abs(a11).sum() > 1.0
assert np.abs(a21).sum() > 1.0
print("Phew, it indeed worked. Good bye.")