A survey of TensorFlow feature columns
from tensorflow.keras import layers
import tensorflow as tf
import numpy as np

# Categorical column over integer ids in [0, 1000000); out-of-range ids map to 0.
video_id = tf.feature_column.categorical_column_with_identity(
    key="video_id", num_buckets=1000000, default_value=0
)

# Example inputs, keyed by feature name. Sparse tensors represent variable-length
# features; dense tensors represent fixed-shape numeric or string features.
features = {
    "video_id": tf.sparse.from_dense([[2, 85, 0, 0, 0], [33, 78, 2, 73, 1]]),
    "year": tf.constant([[1.0, 2.0], [3.0, 4.0]]),
    "number_str": tf.sparse.from_dense(
        [["2", "85", "", "", ""], ["33", "78", "2", "73", "1"]]
    ),
    "keywords": tf.constant(
        [
            ["Tensorflow", "Keras", "RNN"],
            ["LSTM", "CNN", "Tensorflow"],
        ]
    ),
    "latitude": tf.constant([33.7]),
    "longitude": tf.constant([-84.3]),
}

print(tf.sparse.from_dense([[2, 85, 0, 0, 0], [33, 78, 2, 73, 1]]))
# The private _transform_feature helpers expose the raw transformation each
# column applies to its input.
print(video_id._transform_feature(features))
print(video_id._transform_input_tensor(features["video_id"]))

# Numeric column for "year", plus a bucketized version with boundaries 0, 1, 2
# (producing 4 buckets: (-inf, 0), [0, 1), [1, 2), [2, +inf)).
numeric_feature_column = tf.feature_column.numeric_column("year")
numeric_feature_column._transform_feature(features)
bucketized_feature_column = tf.feature_column.bucketized_column(
    source_column=numeric_feature_column, boundaries=[0, 1, 2]
)
bucketized_feature_column._transform_feature(
    {numeric_feature_column: numeric_feature_column._transform_feature(features)}
)

# Categorical column that looks up string values in an explicit vocabulary ("0".."99").
categorical_column = tf.feature_column.categorical_column_with_vocabulary_list(
    "number_str", [str(i) for i in range(100)]
)
print(categorical_column._transform_feature(features))
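
# An additional sketch: for large vocabularies, the list is usually read from a file
# instead of an in-memory list. "vocab.txt" is hypothetical and does not ship here:
# categorical_from_file = tf.feature_column.categorical_column_with_vocabulary_file(
#     key="number_str", vocabulary_file="vocab.txt", vocabulary_size=100
# )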

# Categorical column that hashes arbitrary strings into 10000 buckets.
keywords = tf.feature_column.categorical_column_with_hash_bucket("keywords", 10000)
print(keywords._transform_feature(features))

# Multi-hot (indicator) representation of the vocabulary-list column.
indicator_column = tf.feature_column.indicator_column(categorical_column)
print(
    indicator_column._transform_feature(
        {categorical_column: categorical_column._transform_feature(features)}
    )
)

# Bucketize latitude and longitude into 100 bins each (99 boundaries per axis).
latitude_buckets = list(np.linspace(33.641336, 33.887157, 99))
la_fc = tf.feature_column.numeric_column("latitude")
latitude_fc = tf.feature_column.bucketized_column(la_fc, latitude_buckets)
# Do the same bucketization for longitude as done for latitude.
longitude_buckets = list(np.linspace(-84.558798, -84.287259, 99))
lo_fc = tf.feature_column.numeric_column("longitude")
longitude_fc = tf.feature_column.bucketized_column(lo_fc, longitude_buckets)
# Create a feature cross of latitude_fc x longitude_fc.
fc_san_francisco_boxed = tf.feature_column.crossed_column(
    keys=[latitude_fc, longitude_fc], hash_bucket_size=10000
)
print(
    fc_san_francisco_boxed._transform_feature(
        {
            latitude_fc: latitude_fc._transform_feature(
                {la_fc: la_fc._transform_feature(features)}
            ),
            longitude_fc: longitude_fc._transform_feature(
                {lo_fc: lo_fc._transform_feature(features)}
            ),
        }
    )
)
print(latitude_fc._transform_feature({la_fc: la_fc._transform_feature(features)}))
print(longitude_fc._transform_feature({lo_fc: lo_fc._transform_feature(features)}))

# Cross two raw feature keys directly; the crossed values are hashed into 5000 buckets.
crossed_column = tf.feature_column.crossed_column(["keywords", "year"], 5000)
print(crossed_column._transform_feature(features))
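
# An additional sketch: a crossed column is itself categorical, so before feeding it
# to a DNN it is typically wrapped in an indicator or embedding column. Shown here
# with the latitude/longitude cross defined above.
crossed_indicator = tf.feature_column.indicator_column(fc_san_francisco_boxed)
crossed_embedding = tf.feature_column.embedding_column(
    fc_san_francisco_boxed, dimension=8
)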

# Trainable 10-dimensional embedding on top of the vocabulary-list column.
embedding_column = tf.feature_column.embedding_column(
    categorical_column=categorical_column,
    dimension=10,
)
print(
    embedding_column._transform_feature(
        {categorical_column: categorical_column._transform_feature(features)}
    )
)


def print_output(feature_column, feature):
    """Run a feature column through a DenseFeatures layer and print its output."""
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(feature).numpy())


print_output(embedding_column, features)
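
# An additional sketch: the same helper can inspect other dense-compatible columns
# through the public DenseFeatures path (assumes TF 2.x eager execution).
print_output(bucketized_feature_column, features)
print_output(indicator_column, features)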