# Imports required by this snippet.
import numpy as np
import tensorflow as tf


# Class that handles the scaling operation.
class Scaler:

    def __init__(self, scaler):
        self.scaler = scaler
        # For backward compatibility with earlier versions of scikit-learn.
        self.scaler.clip = False
    def transform_using_scaler(self, data_x):
        # Convert from tensor to numpy.
        data_x = data_x.numpy()
        # Because batching is applied before scaling, the input has shape
        # (BATCH_SIZE, 1, NO_OF_FEATURES). The following squeeze reduces it to
        # (BATCH_SIZE, NO_OF_FEATURES).
        data_x = np.squeeze(data_x, axis=1)
        # Finally, transform the data using the fitted scaler.
        scaled_x = self.scaler.transform(data_x)
        return scaled_x
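
# A minimal sketch (an assumption, not part of the original gist) of how a
# fitted `min_max_scaler_train` could be produced beforehand with
# scikit-learn, assuming the training .npy files fit in memory:
#
#     from sklearn.preprocessing import MinMaxScaler
#     min_max_scaler_train = MinMaxScaler()
#     min_max_scaler_train.fit(np.concatenate(
#         [np.load(f).reshape(-1, NO_OF_FEATURES) for f in training_files]))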

# Initialize the scaling object.
scale_obj = Scaler(min_max_scaler_train)
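
# The code below calls `npy_header_offset`, which is not defined in this gist.
# The helper here is a hedged sketch of one common way to compute the number
# of header bytes in a .npy file; it assumes the standard NumPy file format
# (versions 1.x and 2.x).
def npy_header_offset(npy_path):
    with open(npy_path, 'rb') as f:
        # Magic string, then one byte each for the major and minor version.
        if f.read(6) != b'\x93NUMPY':
            raise ValueError('Not a valid .npy file: %s' % npy_path)
        version_major, _version_minor = f.read(2)
        # Version 1.x stores the header length in 2 bytes, 2.x in 4 bytes,
        # both little-endian.
        header_len_size = 2 if version_major == 1 else 4
        header_len = int.from_bytes(f.read(header_len_size), 'little')
        # Total offset = magic + version bytes + length field + header itself.
        return f.tell() + header_len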

# Get the header bytes to skip in every numpy file.
npy_file = training_files[0]
dtype = tf.float64
header_offset = npy_header_offset(npy_file)

# We will use the header bytes in the code below.
dataset_train = tf.data.FixedLengthRecordDataset(training_files,
                                                 NO_OF_FEATURES * dtype.size,
                                                 header_bytes=header_offset)

# Convert the raw binary data to a tensor of dimension (1, NO_OF_FEATURES).
dataset_train = dataset_train.map(lambda s: tf.reshape(
    tf.io.decode_raw(s, dtype), (1, NO_OF_FEATURES)))

# Make a batch of these tensors.
dataset_train = dataset_train.batch(BATCH_SIZE)

# The result is a tensor of dimension (BATCH_SIZE, 1, NO_OF_FEATURES).
# TensorFlow has no built-in min-max scaling function, so we wrap our own
# function in tf.py_function. Note that the input is an array and the output
# is also an array.
dataset_train = dataset_train.map(lambda x: tf.py_function(
    scale_obj.transform_using_scaler,
    [x],
    [tf.float32]))
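
# Note (an assumption, not part of the original gist): tf.py_function erases
# static shape information, so downstream Keras layers may see an unknown
# shape. One hedged way to restore it, given that the scaler returns
# (BATCH_SIZE, NO_OF_FEATURES), is:
#
#     dataset_train = dataset_train.map(
#         lambda x: tf.ensure_shape(x, (None, NO_OF_FEATURES)))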

# Time for caching.
dataset_train = dataset_train.cache()

# For an autoencoder there is no y (label), so we make a tuple of only X
# (the input). This is a memory-consuming operation, which is why caching is
# applied before it rather than after.
dataset_train = dataset_train.map(lambda x: (x, x))

# Apply prefetch so that the next batch is already available by the time the
# processing of the current batch is over.
dataset_train = dataset_train.prefetch(1)

# Repeat the dataset so that training does not run out of samples after a
# single pass over the data.
dataset_train = dataset_train.repeat()

# Repeat the whole process for validation.
npy_file = validation_files[0]
dtype = tf.float64
header_offset = npy_header_offset(npy_file)

dataset_valid = tf.data.FixedLengthRecordDataset(validation_files,
                                                 NO_OF_FEATURES * dtype.size,
                                                 header_bytes=header_offset)
dataset_valid = dataset_valid.map(lambda s: tf.reshape(
    tf.io.decode_raw(s, dtype), (1, NO_OF_FEATURES)))
dataset_valid = dataset_valid.batch(BATCH_SIZE)
dataset_valid = dataset_valid.map(lambda x: tf.py_function(
    scale_obj.transform_using_scaler,
    [x],
    [tf.float32]))
dataset_valid = dataset_valid.cache()
dataset_valid = dataset_valid.map(lambda x: (x, x))
dataset_valid = dataset_valid.prefetch(1)
dataset_valid = dataset_valid.repeat()
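
# A minimal sketch (an assumption, not part of the original gist) of how the
# two repeated datasets could be consumed. `autoencoder`, `n_train_samples`
# and `n_valid_samples` are hypothetical names; because repeat() makes the
# datasets infinite, explicit step counts are required:
#
#     import math
#     autoencoder.fit(dataset_train,
#                     validation_data=dataset_valid,
#                     steps_per_epoch=math.ceil(n_train_samples / BATCH_SIZE),
#                     validation_steps=math.ceil(n_valid_samples / BATCH_SIZE),
#                     epochs=10)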