Created
July 8, 2020 13:29
-
-
Save ntakouris/ebbbd79829229a034bec017644b32bb0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def preprocessing_fn(inputs):
    """Preprocess input columns into transformed columns.

    Args:
        inputs: Mapping from feature key to the raw feature value. It must
            support `.copy()` and lookup by key. Entries listed in
            `OPTIONAL_NUMERIC_FEATURE_KEYS` are read through `.indices`,
            `.values` and `.dense_shape`, i.e. they are treated as sparse
            tensors.

    Returns:
        A copy of `inputs` in which:
          * every key in `NUMERIC_FEATURE_KEYS` is scaled to [0, 1];
          * every key in `OPTIONAL_NUMERIC_FEATURE_KEYS` is densified
            (missing entries become 0.), squeezed along axis 1 and then
            scaled to [0, 1];
          * `LABEL_KEY` is replaced by its int64 index in
            `['>50K', '<=50K']`, or -1 for any other value.
        Categorical features are returned unchanged; only a vocabulary is
        generated for each of them as a side effect.
    """
    # Since we are modifying some features and leaving others unchanged, we
    # start by setting `outputs` to a copy of `inputs`.
    outputs = inputs.copy()

    # Scale numeric columns to have range [0, 1].
    for key in NUMERIC_FEATURE_KEYS:
        outputs[key] = tft.scale_to_0_1(outputs[key])

    for key in OPTIONAL_NUMERIC_FEATURE_KEYS:
        # This is a SparseTensor because it is optional. Here we fill in a
        # default value when it is missing. The sparse tensor is rebuilt with
        # an explicit dense shape of [batch_size, 1] so that the densified
        # result has a fixed second dimension.
        sparse = tf.sparse.SparseTensor(outputs[key].indices,
                                        outputs[key].values,
                                        [outputs[key].dense_shape[0], 1])
        dense = tf.sparse.to_dense(sp_input=sparse, default_value=0.)
        # Reshaping from a batch of vectors of size 1 to a batch of scalars.
        dense = tf.squeeze(dense, axis=1)
        outputs[key] = tft.scale_to_0_1(dense)

    # For all categorical columns except the label column, we generate a
    # vocabulary but do not modify the feature. This vocabulary is instead
    # used in the trainer, by means of a feature column, to convert the
    # feature from a string to an integer id.
    for key in CATEGORICAL_FEATURE_KEYS:
        tft.vocabulary(inputs[key], vocab_filename=key)

    # For the label column we provide the mapping from string to index.
    # The index of each string in `table_keys` is its integer id; any label
    # not present in the table maps to the default value -1.
    table_keys = ['>50K', '<=50K']
    initializer = tf.lookup.KeyValueTensorInitializer(
        keys=table_keys,
        values=tf.cast(tf.range(len(table_keys)), tf.int64),
        key_dtype=tf.string,
        value_dtype=tf.int64)
    table = tf.lookup.StaticHashTable(initializer, default_value=-1)
    outputs[LABEL_KEY] = table.lookup(outputs[LABEL_KEY])

    return outputs
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment