Skip to content

Instantly share code, notes, and snippets.

@ntakouris
Created July 8, 2020 13:29
Show Gist options
  • Save ntakouris/ebbbd79829229a034bec017644b32bb0 to your computer and use it in GitHub Desktop.
Save ntakouris/ebbbd79829229a034bec017644b32bb0 to your computer and use it in GitHub Desktop.
def preprocessing_fn(inputs):
"""Preprocess input columns into transformed columns."""
# Since we are modifying some features and leaving others unchanged, we
# start by setting `outputs` to a copy of `inputs.
outputs = inputs.copy()
# Scale numeric columns to have range [0, 1].
for key in NUMERIC_FEATURE_KEYS:
outputs[key] = tft.scale_to_0_1(outputs[key])
for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
# This is a SparseTensor because it is optional. Here we fill in a default
# value when it is missing.
sparse = tf.sparse.SparseTensor(outputs[key].indices, outputs[key].values,
[outputs[key].dense_shape[0], 1])
dense = tf.sparse.to_dense(sp_input=sparse, default_value=0.)
# Reshaping from a batch of vectors of size 1 to a batch to scalars.
dense = tf.squeeze(dense, axis=1)
outputs[key] = tft.scale_to_0_1(dense)
# For all categorical columns except the label column, we generate a
# vocabulary but do not modify the feature. This vocabulary is instead
# used in the trainer, by means of a feature column, to convert the feature
# from a string to an integer id.
for key in CATEGORICAL_FEATURE_KEYS:
tft.vocabulary(inputs[key], vocab_filename=key)
# For the label column we provide the mapping from string to index.
table_keys = ['>50K', '<=50K']
initializer = tf.lookup.KeyValueTensorInitializer(
keys=table_keys,
values=tf.cast(tf.range(len(table_keys)), tf.int64),
key_dtype=tf.string,
value_dtype=tf.int64)
table = tf.lookup.StaticHashTable(initializer, default_value=-1)
outputs[LABEL_KEY] = table.lookup(outputs[LABEL_KEY])
return outputs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment