Created
June 29, 2019 04:46
-
-
Save jackyyeh5111/91b5b53a1d28479fc50f22990c2f2a7a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2017 The TensorFlow Authors. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# ============================================================================== | |
"""Preprocess images and bounding boxes for "classification" not detection. | |
We perform two sets of operations in the preprocessing stage:
(a) operations that are applied to both training and testing data,
(b) operations that are applied only to training data for the purpose of
data augmentation.
A preprocessing function receives a set of inputs,
e.g. an image, a filename and bounding boxes,
performs an operation on them, and returns them.
Some examples are: randomly cropping the image, randomly mirroring the image,
randomly changing the brightness, contrast, hue and
randomly jittering the bounding boxes.
The preprocess function receives an image, a filename and bboxes tensors.
The image is a rank 4 tensor: [1, height, width, channels] with
dtype=tf.float32. The groundtruth_boxes is a rank 2 tensor: [N, 4] where
in each row there is a box with [ymin xmin ymax xmax].
Boxes are in normalized coordinates, meaning
their coordinate values range in [0, 1].
Important Note: In tensor_dict, images is a rank 4 tensor, but preprocessing
functions receive a rank 3 tensor for processing the image. Thus, inside the
preprocess function we squeeze the image to become a rank 3 tensor and then
we pass it to the functions. At the end of the preprocess we expand the image
back to rank 4.
""" | |
from __future__ import absolute_import | |
from __future__ import division | |
from __future__ import print_function | |
import functools | |
import sys | |
import tensorflow as tf | |
from tensorflow.python.ops import control_flow_ops | |
import tensorflow as tf | |
# Short alias for the TF-Slim package under tf.contrib.
slim = tf.contrib.slim

# Per-channel constants in R, G, B order.
# NOTE(review): presumably dataset channel means on the [0, 255] pixel scale;
# confirm against the callers that pass them as `means`.
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94

# Side-length bounds; not referenced by the functions visible in this file.
_RESIZE_SIDE_MIN = 256
_RESIZE_SIDE_MAX = 512
# TODO(mttang): This method is needed because the current
# tf.image.rgb_to_grayscale method does not support quantization. Replace with
# tf.image.rgb_to_grayscale after quantization support is added.
def _rgb_to_grayscale(images, name=None):
  """Converts one or more images from RGB to grayscale.

  Outputs a tensor of the same `DType` and rank as `images`. The size of the
  last dimension of the output is 1, containing the grayscale value of the
  pixels.

  Args:
    images: The RGB tensor to convert. Last dimension must have size 3 and
      should contain RGB values.
    name: A name for the operation (optional).

  Returns:
    The converted grayscale image(s).
  """
  with tf.name_scope(name, 'rgb_to_grayscale', [images]) as name:
    images = tf.convert_to_tensor(images, name='images')
    # Remember the original dtype so the result can be converted back.
    orig_dtype = images.dtype
    flt_image = tf.image.convert_image_dtype(images, tf.float32)

    # Reference for converting between RGB and grayscale.
    # https://en.wikipedia.org/wiki/Luma_%28video%29
    rgb_weights = [0.2989, 0.5870, 0.1140]
    # Reduce over the last (channel) axis, whatever the rank of `images` is.
    channel_axis = tf.expand_dims(tf.rank(images) - 1, 0)
    # `keepdims` replaces the deprecated `keep_dims` spelling.
    gray_float = tf.reduce_sum(
        flt_image * rgb_weights, channel_axis, keepdims=True)
    gray_float.set_shape(images.get_shape()[:-1].concatenate([1]))
    return tf.image.convert_image_dtype(gray_float, orig_dtype, name=name)
def normalize_image(image, original_minval, original_maxval, target_minval,
                    target_maxval):
  """Linearly rescales pixel values from one range to another.

  Maps values from [original_minval, original_maxval] onto
  [target_minval, target_maxval].

  Args:
    image: rank 3 float32 tensor containing 1
      image -> [height, width, channels].
    original_minval: current image minimum value.
    original_maxval: current image maximum value.
    target_minval: target image minimum value.
    target_maxval: target image maximum value.

  Returns:
    image: float32 image with the same shape as the input image.
  """
  with tf.name_scope('NormalizeImage', values=[image]):
    src_low = float(original_minval)
    src_high = float(original_maxval)
    dst_low = float(target_minval)
    dst_high = float(target_maxval)
    # Ratio between the widths of the target and source ranges.
    gain = (dst_high - dst_low) / (src_high - src_low)

    shifted = tf.subtract(tf.cast(image, dtype=tf.float32), src_low)
    return tf.add(tf.multiply(shifted, gain), dst_low)
def _mean_image_subtraction(image, means):
  """Centers a single image by subtracting a per-channel mean.

  For example:
    means = [123.68, 116.779, 103.939]
    image = _mean_image_subtraction(image, means)

  Note that the rank of `image` must be known.

  Args:
    image: a tensor of size [height, width, C].
    means: a C-vector of values to subtract from each channel.

  Returns:
    the centered image.

  Raises:
    ValueError: If the rank of `image` is unknown, if `image` has a rank other
      than three or if the number of channels in `image` doesn't match the
      number of values in `means`.
  """
  if image.get_shape().ndims != 3:
    raise ValueError('Input must be of size [height, width, C>0]')
  channel_count = image.get_shape().as_list()[-1]
  if len(means) != channel_count:
    raise ValueError('len(means) must match the number of channels')

  # Split into single-channel planes, shift each one, then stitch them back.
  planes = tf.split(axis=2, num_or_size_splits=channel_count, value=image)
  centered = [plane - mean for plane, mean in zip(planes, means)]
  return tf.concat(axis=2, values=centered)
def _mean_images_subtraction(images, means):
  """Centers a batch of images by subtracting a per-channel mean.

  For example:
    means = [123.68, 116.779, 103.939]
    images = _mean_images_subtraction(images, means)

  Note that the rank of `images` must be known.

  Args:
    images: a tensor of size [batch, height, width, C].
    means: a C-vector of values to subtract from each channel.

  Returns:
    the centered images.

  Raises:
    ValueError: If the rank of `images` is unknown, if `images` has a rank
      other than four or if the number of channels in `images` doesn't match
      the number of values in `means`.
  """
  if images.get_shape().ndims != 4:
    raise ValueError('Input must be of size [batch, height, width, C>0]')
  channel_count = images.get_shape().as_list()[-1]
  if len(means) != channel_count:
    raise ValueError('len(means) must match the number of channels')

  # Split along the channel axis, shift each plane, then re-assemble.
  planes = tf.split(axis=3, num_or_size_splits=channel_count, value=images)
  centered = [plane - mean for plane, mean in zip(planes, means)]
  return tf.concat(axis=3, values=centered)
def random_horizontal_flip(image, seed=None):
  """Randomly mirrors the image left-to-right.

  The probability of flipping the image is 50%.

  Args:
    image: rank 3 float32 tensor with shape [height, width, channels].
    seed: random seed

  Returns:
    image: image which is the same shape as input image.
  """
  # One uniform draw decides the outcome: flip when it exceeds 0.5.
  should_flip = tf.greater(tf.random_uniform([], seed=seed), 0.5)
  return tf.cond(
      should_flip,
      lambda: tf.image.flip_left_right(image),
      lambda: image)
def random_vertical_flip(image, seed=None):
  """Randomly flips the image upside down.

  The probability of flipping the image is 50%.

  Args:
    image: rank 3 float32 tensor with shape [height, width, channels].
    seed: random seed

  Returns:
    image: image which is the same shape as input image.
  """
  def _flip_image(image):
    return tf.image.flip_up_down(image)

  # One uniform draw decides the outcome: flip when it exceeds 0.5.
  do_a_flip_random = tf.random_uniform([], seed=seed)
  do_a_flip_random = tf.greater(do_a_flip_random, 0.5)

  image = tf.cond(do_a_flip_random, lambda: _flip_image(image),
                  lambda: image)
  # Hand the (possibly flipped) tensor back to the caller.
  return image
def random_rotation90(image, seed=None):
  """Randomly rotates the image 90 degrees counter-clockwise.

  The probability of rotating the image is 50%. This can be combined with
  random_horizontal_flip and random_vertical_flip to produce an output with a
  uniform distribution of the eight possible 90 degree rotation / reflection
  combinations.

  Args:
    image: rank 3 float32 tensor with shape [height, width, channels].
    seed: random seed

  Returns:
    image: either the input image or the result of `tf.image.rot90` on it.
  """
  # One uniform draw decides the outcome: rotate when it exceeds 0.5.
  should_rotate = tf.greater(tf.random_uniform([], seed=seed), 0.5)
  return tf.cond(
      should_rotate,
      lambda: tf.image.rot90(image),
      lambda: image)
def random_pixel_value_scale(image,
                             minval=0.9,
                             maxval=1.1,
                             seed=None):
  """Multiplies every value of the image by its own random factor.

  Each element of the image tensor gets an independent factor drawn
  uniformly between minval and maxval; the product is clipped to [0, 255].

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
      with pixel values varying between [0, 255].
    minval: lower ratio of scaling pixel values.
    maxval: upper ratio of scaling pixel values.
    seed: random seed.

  Returns:
    image: image which is the same shape as input image.
  """
  with tf.name_scope('RandomPixelValueScale', values=[image]):
    per_value_gain = tf.random_uniform(
        tf.shape(image), minval=minval, maxval=maxval,
        dtype=tf.float32, seed=seed)
    scaled = tf.multiply(image, per_value_gain)
    return tf.clip_by_value(scaled, 0.0, 255.0)
def _augment_only_rgb_channels(image, augment_function):
  """Applies `augment_function` to the first three channels only.

  Args:
    image: tensor indexed as [height, width, channels].
    augment_function: callable applied to the `image[:, :, :3]` slice.

  Returns:
    The augmented slice concatenated with any remaining channels
    (`image[:, :, 3:]`) along the last axis.
  """
  augmented_rgb = augment_function(image[:, :, :3])
  untouched_channels = image[:, :, 3:]
  return tf.concat([augmented_rgb, untouched_channels], -1)
def random_adjust_brightness(image,
                             max_delta=0.1,
                             seed=None):
  """Applies a random brightness shift to the RGB channels.

  The shift is drawn uniformly from [-max_delta, max_delta) and applied on
  the [0, 1] scale; the result is rescaled and clipped to [0, 255].

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
      with pixel values varying between [0, 255].
    max_delta: how much to change the brightness. A value between [0, 1).
    seed: random seed.

  Returns:
    image: image which is the same shape as input image.
  """
  with tf.name_scope('RandomAdjustBrightness', values=[image]):
    brightness_shift = tf.random_uniform(
        [], -max_delta, max_delta, seed=seed)

    def _shift_brightness(rgb):
      shifted = tf.image.adjust_brightness(rgb / 255, brightness_shift) * 255
      return tf.clip_by_value(
          shifted, clip_value_min=0.0, clip_value_max=255.0)

    return _augment_only_rgb_channels(image, _shift_brightness)
def random_adjust_contrast(image,
                           min_delta=0.9,
                           max_delta=1.1,
                           seed=None):
  """Applies a random contrast factor to the RGB channels.

  The factor is drawn uniformly from [min_delta, max_delta) and applied on
  the [0, 1] scale; the result is rescaled and clipped to [0, 255].

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
      with pixel values varying between [0, 255].
    min_delta: lower bound of the contrast factor.
    max_delta: upper bound of the contrast factor. The factor is multiplied
      to the current contrast of the image.
    seed: random seed.

  Returns:
    image: image which is the same shape as input image.
  """
  with tf.name_scope('RandomAdjustContrast', values=[image]):
    factor = tf.random_uniform([], min_delta, max_delta, seed=seed)

    def _scale_contrast(rgb):
      adjusted = tf.image.adjust_contrast(rgb / 255, factor) * 255
      return tf.clip_by_value(
          adjusted, clip_value_min=0.0, clip_value_max=255.0)

    return _augment_only_rgb_channels(image, _scale_contrast)
def random_adjust_hue(image,
                      max_delta=0.02,
                      seed=None):
  """Applies a random hue shift to the RGB channels.

  The shift is drawn uniformly from [-max_delta, max_delta) and applied on
  the [0, 1] scale; the result is rescaled and clipped to [0, 255].

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
      with pixel values varying between [0, 255].
    max_delta: bound on the magnitude of the random hue shift.
    seed: random seed.

  Returns:
    image: image which is the same shape as input image.
  """
  with tf.name_scope('RandomAdjustHue', values=[image]):
    hue_shift = tf.random_uniform([], -max_delta, max_delta, seed=seed)

    def _shift_hue(rgb):
      adjusted = tf.image.adjust_hue(rgb / 255, hue_shift) * 255
      return tf.clip_by_value(
          adjusted, clip_value_min=0.0, clip_value_max=255.0)

    return _augment_only_rgb_channels(image, _shift_hue)
def random_adjust_saturation(image,
                             min_delta=0.8,
                             max_delta=1.25,
                             seed=None):
  """Applies a random saturation factor to the RGB channels.

  The factor is drawn uniformly from [min_delta, max_delta) and applied on
  the [0, 1] scale; the result is rescaled and clipped to [0, 255].

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
      with pixel values varying between [0, 255].
    min_delta: lower bound of the saturation factor.
    max_delta: upper bound of the saturation factor. The factor is multiplied
      to the current saturation of the image.
    seed: random seed.

  Returns:
    image: image which is the same shape as input image.
  """
  with tf.name_scope('RandomAdjustSaturation', values=[image]):
    factor = tf.random_uniform([], min_delta, max_delta, seed=seed)

    def _scale_saturation(rgb):
      adjusted = tf.image.adjust_saturation(rgb / 255, factor) * 255
      return tf.clip_by_value(
          adjusted, clip_value_min=0.0, clip_value_max=255.0)

    return _augment_only_rgb_channels(image, _scale_saturation)
def random_add_PCA_noise(image,
                         max_delta=0.1,
                         seed=None):
  """Applies a random brightness shift to the RGB channels.

  NOTE(review): despite the name, no PCA-based noise is computed here; the
  body performs the same brightness jitter as random_adjust_brightness.
  Confirm whether a PCA implementation was intended.

  The shift is drawn uniformly from [-max_delta, max_delta) and applied on
  the [0, 1] scale; the result is rescaled and clipped to [0, 255].

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
      with pixel values varying between [0, 255].
    max_delta: how much to change the brightness. A value between [0, 1).
    seed: random seed.

  Returns:
    image: image which is the same shape as input image.
  """
  with tf.name_scope('RandomAdjustBrightness', values=[image]):
    brightness_shift = tf.random_uniform(
        [], -max_delta, max_delta, seed=seed)

    def _shift_brightness(rgb):
      shifted = tf.image.adjust_brightness(rgb / 255, brightness_shift) * 255
      return tf.clip_by_value(
          shifted, clip_value_min=0.0, clip_value_max=255.0)

    return _augment_only_rgb_channels(image, _shift_brightness)
def random_distort_color(image, color_ordering=0):
  """Randomly distorts color.

  Applies a fixed sequence of random color adjustments selected by
  `color_ordering`. Each adjustment clips its output to [0, 255].

    0: brightness, saturation, hue, contrast (strong ranges).
    1: brightness, contrast, saturation, hue (strong ranges).
    2: brightness, contrast (mild ranges).
    3: brightness only.
    4: brightness, contrast, then random_add_PCA_noise.

  Args:
    image: rank 3 float32 tensor contains 1 image -> [height, width, channels]
      with pixel values varying between [0, 255].
    color_ordering: Python int, a type of distortion
      (valid values: 0, 1, 2, 3, 4).

  Returns:
    image: image which is the same shape as input image.

  Raises:
    ValueError: if color_ordering is not in {0, 1, 2, 3, 4}.
  """
  with tf.name_scope('RandomDistortColor', values=[image]):
    if color_ordering == 0:
      image = random_adjust_brightness(image, max_delta=32. / 255.)
      image = random_adjust_saturation(image, min_delta=0.5, max_delta=1.5)
      image = random_adjust_hue(image, max_delta=0.2)
      image = random_adjust_contrast(image, min_delta=0.5, max_delta=1.5)
    elif color_ordering == 1:
      image = random_adjust_brightness(image, max_delta=32. / 255.)
      image = random_adjust_contrast(image, min_delta=0.5, max_delta=1.5)
      image = random_adjust_saturation(image, min_delta=0.5, max_delta=1.5)
      image = random_adjust_hue(image, max_delta=0.2)
    elif color_ordering == 2:
      image = random_adjust_brightness(image, max_delta=0.1)
      image = random_adjust_contrast(image, min_delta=0.8, max_delta=1.1)
    elif color_ordering == 3:
      image = random_adjust_brightness(image, max_delta=0.1)
    elif color_ordering == 4:
      image = random_adjust_brightness(image, max_delta=0.1)
      image = random_adjust_contrast(image, min_delta=0.8, max_delta=1.1)
      image = random_add_PCA_noise(image, max_delta=0.1)
    else:
      # Message lists every ordering the branches above actually accept.
      raise ValueError('color_ordering must be in {0, 1, 2, 3, 4}')
    return image
def image_to_float(image):
  """Casts image pixel values to float32. Used in Faster R-CNN.

  Args:
    image: input image which might be in tf.uint8 or some other format.

  Returns:
    image: image in tf.float32 format.
  """
  with tf.name_scope('ImageToFloat', values=[image]):
    return tf.cast(image, dtype=tf.float32)
def _get_image_info(image):
  """Returns the first three entries of the image's dynamic shape.

  Args:
    image: tensor indexed as [height, width, channels].

  Returns:
    A tuple (image_height, image_width, num_channels) of scalar tensors.
  """
  return tuple(tf.shape(image)[axis] for axis in range(3))
def _largest_size_at_least(height, width, smallest_side):
  """Computes a new shape whose longer side equals `smallest_side`.

  The scale factor is `smallest_side` divided by the longer of the two sides,
  so the original aspect ratio is preserved (up to rounding).

  Args:
    height: an int32 scalar tensor indicating the current height.
    width: an int32 scalar tensor indicating the current width.
    smallest_side: A python integer or scalar `Tensor` giving the length of
      the longer side after resize.

  Returns:
    new_height: an int32 scalar tensor indicating the new height.
    new_width: an int32 scalar tensor indicating the new width.
  """
  target_int = tf.convert_to_tensor(smallest_side, dtype=tf.int32)

  height_f = tf.to_float(height)
  width_f = tf.to_float(width)
  target = tf.to_float(target_int)

  # Divide by whichever side is longer.
  ratio = tf.cond(tf.greater(height_f, width_f),
                  lambda: target / height_f,
                  lambda: target / width_f)
  scaled_height = tf.to_int32(tf.rint(height_f * ratio))
  scaled_width = tf.to_int32(tf.rint(width_f * ratio))
  return scaled_height, scaled_width
def _smallest_size_at_least(height, width, smallest_side):
  """Computes a new shape whose shorter side equals `smallest_side`.

  The scale factor is `smallest_side` divided by the shorter of the two
  sides, so the original aspect ratio is preserved (up to rounding).

  Args:
    height: an int32 scalar tensor indicating the current height.
    width: an int32 scalar tensor indicating the current width.
    smallest_side: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    new_height: an int32 scalar tensor indicating the new height.
    new_width: an int32 scalar tensor indicating the new width.
  """
  target_int = tf.convert_to_tensor(smallest_side, dtype=tf.int32)

  height_f = tf.to_float(height)
  width_f = tf.to_float(width)
  target = tf.to_float(target_int)

  # Divide by whichever side is shorter.
  ratio = tf.cond(tf.greater(height_f, width_f),
                  lambda: target / width_f,
                  lambda: target / height_f)
  scaled_height = tf.to_int32(tf.rint(height_f * ratio))
  scaled_width = tf.to_int32(tf.rint(width_f * ratio))
  return scaled_height, scaled_width
def _small_aspect_preserving_resize(image, smallest_side):
  """Resizes an image so its longer side equals `smallest_side`.

  The aspect ratio is preserved. Because the longer side is the one matched
  to `smallest_side`, the result is the "small" variant of
  _aspect_preserving_resize.

  Args:
    image: A 3-D image `Tensor`.
    smallest_side: A python integer or scalar `Tensor` giving the length of
      the longer side after resize.

  Returns:
    resized_image: A 3-D tensor containing the resized image; its static
      shape is set to [None, None, 3].
  """
  smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)

  shape = tf.shape(image)
  height = shape[0]
  width = shape[1]
  new_height, new_width = _largest_size_at_least(height, width, smallest_side)

  # resize_bilinear works on batches, so add and later remove a batch axis.
  image = tf.expand_dims(image, 0)
  resized_image = tf.image.resize_bilinear(image, [new_height, new_width],
                                           align_corners=False)
  # Squeeze only the batch axis: an unrestricted squeeze would also drop a
  # height or width of 1 and break the rank-3 contract.
  resized_image = tf.squeeze(resized_image, axis=[0])
  resized_image.set_shape([None, None, 3])
  return resized_image
def _aspect_preserving_resize(image, smallest_side):
  """Resizes an image so its shorter side equals `smallest_side`.

  The aspect ratio is preserved.

  Args:
    image: A 3-D image `Tensor`.
    smallest_side: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    resized_image: A 3-D tensor containing the resized image; its static
      shape is set to [None, None, 3].
  """
  smallest_side = tf.convert_to_tensor(smallest_side, dtype=tf.int32)

  shape = tf.shape(image)
  height = shape[0]
  width = shape[1]
  new_height, new_width = _smallest_size_at_least(height, width, smallest_side)

  # resize_bilinear works on batches, so add and later remove a batch axis.
  image = tf.expand_dims(image, 0)
  resized_image = tf.image.resize_bilinear(image, [new_height, new_width],
                                           align_corners=False)
  # Squeeze only the batch axis: an unrestricted squeeze would also drop a
  # height or width of 1 and break the rank-3 contract.
  resized_image = tf.squeeze(resized_image, axis=[0])
  resized_image.set_shape([None, None, 3])
  return resized_image
def subtract_channel_mean(image, means=None):
  """Normalizes an image by subtracting a mean from each channel.

  Args:
    image: A 3D tensor of shape [height, width, channels]
    means: float list containing a mean for each channel. Required; the
      `None` default only exists for signature compatibility.

  Returns:
    normalized_images: a tensor of shape [height, width, channels]

  Raises:
    ValueError: if `means` is None, if `image` is not a 3D tensor, or if the
      number of means is not equal to the number of channels.
  """
  # Fail with a clear message instead of a TypeError from len(None) below.
  if means is None:
    raise ValueError('means must be specified')
  with tf.name_scope('SubtractChannelMean', values=[image, means]):
    if len(image.get_shape()) != 3:
      raise ValueError('Input must be of size [height, width, channels]')
    if len(means) != image.get_shape()[-1]:
      raise ValueError('len(means) must match the number of channels')
    # [[means]] has shape [1, 1, channels] and broadcasts over height/width.
    return image - [[means]]
def one_hot_encoding(labels, num_classes=None):
  """Encodes a set of class labels as a single multi-hot vector.

  Example usage:
    labels = tf.constant([1, 4], dtype=tf.int32)
    one_hot = one_hot_encoding(labels, num_classes=5)
    one_hot.eval()    # evaluates to [0, 1, 0, 0, 1]

  Args:
    labels: An integer tensor of shape [None] corresponding to the labels.
      May be empty, in which case the result is all zeros.
    num_classes: Number of classes in the dataset.

  Returns:
    onehot_labels: a tensor of shape [num_classes] with a 1 at every index
      present in `labels` and 0 elsewhere.

  Raises:
    ValueError: if num_classes is not specified.
  """
  with tf.name_scope('OneHotEncoding', values=[labels]):
    if num_classes is None:
      raise ValueError('num_classes must be specified')

    # The sentinel below is int64; cast so int32 labels (as in the example
    # above) concatenate cleanly. A no-op for labels that are already int64.
    labels = tf.cast(labels, tf.int64)
    # Prepend -1, which one-hot encodes to an all-zeros row, so the reduction
    # below is well defined even when `labels` is empty.
    labels = tf.concat([tf.constant([-1], dtype=tf.int64), labels], axis=0)
    labels = tf.one_hot(labels, num_classes, 1, 0)
    return tf.reduce_max(labels, 0)  # [num_classes]
def rgb_to_gray(image):
  """Converts a 3 channel RGB image to a 1 channel grayscale image.

  Thin public wrapper around _rgb_to_grayscale.

  Args:
    image: Rank 3 float32 tensor containing 1 image -> [height, width, 3]
      with pixel values varying between [0, 1].

  Returns:
    image: A single channel grayscale image -> [height, width, 1].
  """
  grayscale = _rgb_to_grayscale(image)
  return grayscale
def convert_class_logits_to_softmax(multiclass_scores, temperature=1.0):
  """Divides logits by a temperature and applies softmax.

  Args:
    multiclass_scores: float32 tensor of shape
      [num_instances, num_classes] representing the score for each box for
      each class. Must hold logits.
    temperature: Scale factor to use prior to applying softmax. Larger
      temperatures give more uniform distributions after softmax.

  Returns:
    multiclass_scores: float32 tensor of shape
      [num_instances, num_classes] with scaling and softmax applied.
  """
  scaled_logits = tf.divide(
      multiclass_scores, temperature, name='scale_logits')
  return tf.nn.softmax(scaled_logits, name='softmax')
def center_pad_to_square(image):
  """Pads the image to a square whose side is max(height, width).

  Uses tf.image.resize_image_with_crop_or_pad with both target dimensions set
  to the longer side of the input.

  Args:
    image: an image of shape [height, width, channels].

  Returns:
    image of shape [new_length, new_length, channels]
  """
  image_rank = tf.rank(image)
  rank_check = tf.Assert(
      tf.equal(image_rank, 3),
      ['Wrong rank for tensor %s [expected] [actual]',
       image.name, 3, image_rank])
  # Reading the shape is gated on the rank assertion.
  with tf.control_dependencies([rank_check]):
    dims = tf.shape(image)

  side = tf.math.maximum(dims[0], dims[1])
  padded = tf.image.resize_image_with_crop_or_pad(image, side, side)
  return tf.reshape(padded, tf.stack([side, side, dims[2]]))
def _is_bbox_not_in_crop(bboxes, image_height, image_width,
                         offset_height, offset_width,
                         crop_height, crop_width):
  """Flags bounding boxes that lie entirely outside a crop window.

  A box is flagged when its ymin/xmin (in pixels) is beyond the bottom/right
  edge of the crop, or its ymax/xmax is before the top/left edge.

  Args:
    bboxes: rank 2 float32 tensor containing the bounding boxes with shape
      [num_instances, 4].
      Boxes are in normalized form meaning their coordinates vary
      between [0, 1].
      Each row is in the form of [ymin, xmin, ymax, xmax].
    image_height: scalar height of the uncropped image, used to convert
      normalized y coordinates to pixels.
    image_width: scalar width of the uncropped image, used to convert
      normalized x coordinates to pixels.
    offset_height: a scalar tensor indicating the height offset.
    offset_width: a scalar tensor indicating the width offset.
    crop_height: the height of the cropped image.
    crop_width: the width of the cropped image.

  Returns:
    A rank 1 bool tensor, True for boxes that do not intersect the crop.
  """
  img_w = tf.cast(image_width, tf.float32)
  img_h = tf.cast(image_height, tf.float32)
  top = tf.cast(offset_height, tf.float32)
  left = tf.cast(offset_width, tf.float32)
  crop_h = tf.cast(crop_height, tf.float32)
  crop_w = tf.cast(crop_width, tf.float32)
  bottom = top + crop_h
  right = left + crop_w

  # Box starts past the bottom or right edge of the crop.
  starts_after_crop = tf.math.logical_or(
      bboxes[:, 0] * img_h > bottom, bboxes[:, 1] * img_w > right)
  # Box ends before the top or left edge of the crop.
  ends_before_crop = tf.math.logical_or(
      bboxes[:, 2] * img_h < top, bboxes[:, 3] * img_w < left)
  outside = tf.math.logical_or(starts_after_crop, ends_before_crop)
  return tf.reshape(outside, [-1])
def _crop(image, bboxes, labels, offset_height, offset_width,
          crop_height, crop_width):
  """Crops the given image using the provided offsets and sizes.

  A label is discarded if its bbox lies entirely outside the cropped region.
  Note that the method doesn't assume we know the input image size but it does
  assume we know the input image rank.

  Args:
    image: an image of shape [height, width, channels].
    bboxes: rank 2 float32 tensor containing the bounding boxes with shape
      [num_instances, 4].
      Boxes are in normalized form meaning their coordinates vary
      between [0, 1].
      Each row is in the form of [ymin, xmin, ymax, xmax].
    labels: tensor whose dimension 0 matches dimension 0 of `bboxes`; entry i
      is the label of box i.
    offset_height: a scalar tensor indicating the height offset.
    offset_width: a scalar tensor indicating the width offset.
    crop_height: the height of the cropped image.
    crop_width: the width of the cropped image.

  Returns:
    A tuple (cropped image, labels of the boxes that intersect the crop).

  Raises:
    InvalidArgumentError: if the rank is not 3 or if the image dimensions are
      less than the crop size.
  """
  original_shape = tf.shape(image)

  rank_assertion = tf.Assert(
      tf.equal(tf.rank(image), 3),
      ['Rank of image must be equal to 3.'])
  with tf.control_dependencies([rank_assertion]):
    cropped_shape = tf.stack([crop_height, crop_width, original_shape[2]])

  size_assertion = tf.Assert(
      tf.logical_and(
          tf.greater_equal(original_shape[0], crop_height),
          tf.greater_equal(original_shape[1], crop_width)),
      ['Crop size greater than the image size.'])

  offsets = tf.to_int32(tf.stack([offset_height, offset_width, 0]))

  dimension_assertion = tf.Assert(
      tf.equal(tf.shape(bboxes)[0], tf.shape(labels)[0]),
      ['Dimension 0 of bboxes and labels should not be different.'])

  # Use tf.slice instead of crop_to_bounding box as it accepts tensors to
  # define the crop size.
  with tf.control_dependencies([size_assertion, dimension_assertion]):
    image = tf.slice(image, offsets, cropped_shape)

  image_height = original_shape[0]
  image_width = original_shape[1]

  def _labels_inside_crop():
    # Keep only the labels whose box intersects the crop window.
    outside = _is_bbox_not_in_crop(
        bboxes, image_height, image_width,
        offset_height, offset_width, crop_height, crop_width)
    return tf.boolean_mask(labels, tf.math.logical_not(outside))

  # With no boxes at all there is nothing to prune; pass labels through.
  kept_labels = tf.cond(tf.equal(tf.size(bboxes), 0),
                        lambda: labels,
                        _labels_inside_crop)
  return tf.reshape(image, cropped_shape), kept_labels
def _random_crop(image, bboxes, labels, min_crop_ratio=0.999999):
  """Takes a random crop of the image and prunes labels outside it.

  The crop height and width are drawn independently and uniformly from
  [floor(side * min_crop_ratio), side], and the crop offset is drawn
  uniformly from all positions that keep the crop inside the image.

  Args:
    image: an image tensor of shape [height, width, channels].
    bboxes: rank 2 float32 tensor of normalized [ymin, xmin, ymax, xmax]
      boxes with shape [num_instances, 4].
    labels: tensor whose dimension 0 matches dimension 0 of `bboxes`.
    min_crop_ratio: Python number in (0, 1]; the ratio of
      minimum_crop_side_length / side_length.

  Returns:
    The cropped image and the labels (the label of any bbox which is not in
    the cropped image is pruned out).

  Raises:
    InvalidArgumentError: at graph run time, if the rank of `image` is not 3.
  """
  with tf.name_scope('RandomCrop', values=[image, bboxes, labels]):
    image_rank = tf.rank(image)
    rank_assert = tf.Assert(
        tf.equal(image_rank, 3),
        ['Wrong rank for tensor %s [expected] [actual]',
         image.name, 3, image_rank])

    with tf.control_dependencies([rank_assert]):
      image_shape = tf.shape(image)
    image_height = image_shape[0]
    image_width = image_shape[1]

    # Force float32 so an integer ratio (e.g. 1) multiplies cleanly below.
    min_crop_ratio = tf.constant(min_crop_ratio, dtype=tf.float32)

    def _random_side(full_length):
      """Draws a crop length in [floor(full_length * ratio), full_length]."""
      shortest = tf.cast(
          tf.cast(full_length, dtype=tf.float32) * min_crop_ratio,
          dtype=tf.int32)
      # Integer tf.random_uniform excludes maxval, so add 1 to make the
      # full-size crop reachable and to keep the range non-empty when
      # min_crop_ratio is 1.0.
      return tf.random_uniform(
          [], minval=shortest, maxval=full_length + 1, dtype=tf.int32)

    # Use tf.random_uniform and not numpy.random.rand as doing the former
    # would generate random numbers at graph eval time, unlike the latter
    # which generates random numbers at graph definition time.
    crop_height = _random_side(image_height)
    crop_width = _random_side(image_width)

    # Offsets range over [0, side - crop]; the +1 again accounts for the
    # exclusive upper bound.
    with tf.control_dependencies([rank_assert]):
      max_offset_height = tf.reshape(image_height - crop_height + 1, [])
    with tf.control_dependencies([rank_assert]):
      max_offset_width = tf.reshape(image_width - crop_width + 1, [])
    offset_height = tf.random_uniform(
        [], maxval=max_offset_height, dtype=tf.int32)
    offset_width = tf.random_uniform(
        [], maxval=max_offset_width, dtype=tf.int32)

    return _crop(image, bboxes, labels, offset_height, offset_width,
                 crop_height, crop_width)
def _get_camera_int_id(filename):
  """Maps the camera token of a filename to an integer camera group id.

  The filename is split on '--' and the 4th token (index 3) is compared
  against the known camera names. Several cameras share one group id:
  Camera0 -> 0, Camera1 -> 1, Camera2 -> 2, Camera3..Camera6 -> 3 and
  Camera7..Camera10 -> 4.

  Args:
    filename: tf.string tensor with shape (), which represents filename.

  Returns:
    camera_int_id: tf.int32 tensor with shape ().
  """
  # (camera token, group id) in matching order.
  camera_groups = (
      (b'Camera0', 0),
      (b'Camera1', 1),
      (b'Camera2', 2),
      (b'Camera3', 3),
      (b'Camera4', 3),
      (b'Camera5', 3),
      (b'Camera6', 3),
      (b'Camera7', 4),
      (b'Camera8', 4),
      (b'Camera9', 4),
      (b'Camera10', 4),
  )

  def _group_constant(group_id):
    """Returns a callable producing the int32 constant for `group_id`."""
    # A factory (rather than a bare lambda in the loop) binds `group_id` per
    # pair and avoids the late-binding closure pitfall.
    return lambda: tf.constant(
        group_id, dtype=tf.int32, name='camera_mask_id')

  tokens = tf.strings.split(tf.expand_dims(filename, 0), sep='--')
  camera_id = tokens.values[3]

  # A list of pairs keeps the predicate order deterministic and does not rely
  # on tensors being usable as dictionary keys.
  pred_fn_pairs = [
      (tf.equal(camera_id, camera_name), _group_constant(group_id))
      for camera_name, group_id in camera_groups
  ]
  # NOTE(review): no explicit default is supplied, so the handling of an
  # unknown camera token is left to tf.case -- confirm this is intended.
  camera_int_id = tf.case(pred_fn_pairs, default=None, exclusive=True)
  return camera_int_id
def _mix_up_augmentation(images, labels, filenames, seed=None):
  """Applies mix-up to the samples of a batch whose camera group id is 1.

  Samples whose filename maps to camera group id 1 (see
  `_get_camera_int_id`) are separated from the rest of the batch. If there is
  more than one such sample, each of them is, with probability 0.5, replaced
  by the equal-weight average of itself and another sample drawn from a
  random permutation of that subset; the labels are averaged the same way.

  The returned batch is the concatenation of the (possibly mixed) camera-1
  samples followed by all remaining samples, so the row order generally
  differs from the input order and no longer lines up with `filenames`.

  Args:
    images: rank 4 float32 tensor contains
      N image -> [N, height, width, 3].
    labels: rank 2 float32 tensor containing
      the (multilabel) onehot labels -> [N, num_classes].
    filenames: rank 1 string tensor contains N string.
    seed: optional Python integer forwarded to the random ops that decide
      which samples are mixed and with which partner.

  Returns:
    The preprocessed images and labels, with the same shapes as the inputs.
  """
  def _mix_up(images, labels, seed=None):
    """Mixes each sample with a randomly chosen partner with p = 0.5."""
    num_img = tf.shape(images)[0]
    do_mix = tf.greater(tf.random_uniform([num_img], seed=seed), 0.5)
    rand_idx = tf.reshape(
        tf.random_shuffle(tf.range(num_img), seed=seed), [num_img, 1])
    shuffled_imgs = tf.gather_nd(images, rand_idx)
    shuffled_labs = tf.gather_nd(labels, rand_idx)
    mixed_imgs = 0.5 * images + 0.5 * shuffled_imgs
    mixed_labs = 0.5 * labels + 0.5 * shuffled_labs
    # `do_mix` is a per-sample vector: it selects whole rows.
    images = tf.where(do_mix, mixed_imgs, images)
    labels = tf.where(do_mix, mixed_labs, labels)
    return images, labels

  camera_ids = tf.map_fn(_get_camera_int_id, filenames,
                         dtype=tf.int32, back_prop=False)
  tobe_mixed_idxs = tf.equal(camera_ids, 1)
  notbe_mixed_idxs = tf.math.logical_not(tobe_mixed_idxs)
  tobe_mixed_imgs = tf.boolean_mask(images, tobe_mixed_idxs)
  tobe_mixed_labs = tf.boolean_mask(labels, tobe_mixed_idxs)
  notbe_mixed_imgs = tf.boolean_mask(images, notbe_mixed_idxs)
  notbe_mixed_labs = tf.boolean_mask(labels, notbe_mixed_idxs)

  # Mixing needs at least two candidates; otherwise pass them through.
  mixed_images, mixed_labels = tf.cond(
      tf.greater(tf.shape(tobe_mixed_imgs)[0],
                 tf.constant(1, dtype=tf.int32)),
      lambda: _mix_up(tobe_mixed_imgs, tobe_mixed_labs, seed=seed),
      lambda: (tobe_mixed_imgs, tobe_mixed_labs))

  # Reshape to the input shapes to restore the batch layout after masking.
  images = tf.reshape(
      tf.concat([mixed_images, notbe_mixed_imgs], axis=0), tf.shape(images))
  labels = tf.reshape(
      tf.concat([mixed_labels, notbe_mixed_labs], axis=0), tf.shape(labels))
  return images, labels
def _square_image_preprocess(image, filename, resize_side):
  """Square image specific preprocessor.

  The image is passed through `_small_aspect_preserving_resize` and
  `center_pad_to_square`. Images whose camera group id (derived from the
  filename by `_get_camera_int_id`) is <= 2 are additionally passed through
  `random_rotation90` followed by `random_horizontal_flip`.

  For resize function, we apply the function used in research/slim.
  align_corners are set to false. However, in TFOD, align_corners is set to
  true.

  Args:
    image: an image of shape [height, width, channels].
    filename: tf.string tensor with shape (), which represents filename.
    resize_side: a python int indicating the resized side length of the image.

  Returns:
    image: Image shape will be [resize_side, resize_side, 3].
  """
  with tf.name_scope('SquareImagePreprocess',
                     values=[image, filename, resize_side]):
    # Resize, then pad to square.
    resized = _small_aspect_preserving_resize(image, resize_side)
    squared = center_pad_to_square(resized)

    # Rotate/flip according to the camera encoded in the filename.
    camera_int_id = _get_camera_int_id(filename)
    return tf.cond(
        camera_int_id <= tf.constant(2, dtype=tf.int32),
        lambda: random_horizontal_flip(random_rotation90(squared)),
        lambda: squared)
def _non_square_image_preprocess(image, output_height, output_width):
  """Non-square image specific preprocessor.

  For resize function, we apply the function used in research/slim.
  align_corners are set to false. However, in TFOD, align_corners is set to
  true.
  In research/slim, the resize function is aspect ratio preserving. But we
  haven't considered this case yet: the image is resized directly to
  [output_height, output_width] with bilinear interpolation.

  Args:
    image: an image of shape [height, width, channels].
    output_height: a python int indicating the height of the resized image.
    output_width: a python int indicating the width of the resized image.

  Returns:
    image: Image shape will be [output_height, output_width, channels].
  """
  with tf.name_scope('NonSquareImagePreprocess', values=[image]):
    # resize_bilinear works on batches, so add and then remove a batch axis.
    batched = tf.expand_dims(image, 0)
    resized = tf.image.resize_bilinear(
        batched, [output_height, output_width], align_corners=False)
    # Squeeze only the batch axis: an axis-less squeeze would also drop a
    # height, width or channel dimension that happens to be of size 1.
    return tf.squeeze(resized, axis=[0])
def _label_hacking(onehot_label, filename):
  """Hook for experimental label modifications; currently a pass-through.

  Both experiments below are disabled (they are kept as inert string
  literals), so the label is returned unchanged.

  Args:
    onehot_label: A `Tensor` of [num_classes] representing the one hot
      multi-label.
    filename: A `Tensor` string. Only used by the disabled experiments.

  Returns:
    The (currently unmodified) onehot label.
  """
  with tf.name_scope('LabelHacking', values=[onehot_label, filename]):
    # Disabled experiment 1: modify the label to force rough and thrumark to
    # appear at the same time. These two classes become the same.
    '''
    max_val = tf.reduce_max(onehot_label[-2:],keepdims=True)
    onehot_label = tf.concat([onehot_label[:-2], max_val, max_val],axis=0)
    '''
    # Disabled experiment 2: modify the label to force break under normal
    # light as stain if cam_id <2 or >3.
    '''
    def _break_to_stain(onehot_label):
    stain_label = tf.reduce_max(onehot_label[1:3],keepdims=True)
    break_label = tf.constant([0])
    dummy_label = tf.constant([0])
    onehot_label=tf.concat(
    [dummy_label, break_label,stain_label, onehot_label[3:]],axis=0)
    return onehot_label
    cam_id = _get_camera_int_id(filename)
    onehot_label = tf.cond(tf.logical_or(cam_id < 2, cam_id > 3),
    lambda: _break_to_stain(onehot_label),
    lambda: onehot_label)
    '''
    return onehot_label
def preprocess_for_train_easy(image, bboxes, labels, filename, num_classes,
                              output_height, output_width, use_PCA_noise):
  """Preprocesses the given image for training with light augmentation.

  The image is resized to [output_height, output_width], randomly flipped
  left/right and has the per-channel means subtracted. When the target is
  square the resize goes through `_small_aspect_preserving_resize` and
  `center_pad_to_square`; otherwise a plain bilinear resize is used.

  Args:
    image: A `Tensor` representing an image of arbitrary size.
    bboxes: unused by this function.
    labels: the labels, converted with `one_hot_encoding`.
    filename: unused by this function.
    num_classes: number of classes passed to `one_hot_encoding`.
    output_height: The height of the image after preprocessing (python int).
    output_width: The width of the image after preprocessing (python int).
    use_PCA_noise: currently ignored; no PCA noise is added here.

  Returns:
    A tuple (image, label): the preprocessed image and its one-hot label.
  """
  def _resize_square():
    return center_pad_to_square(
        _small_aspect_preserving_resize(image, output_height))

  def _resize_non_square():
    resized = tf.image.resize_bilinear(
        tf.expand_dims(image, 0), [output_height, output_width],
        align_corners=False)
    # Squeeze only the batch axis so a size-1 height/width is preserved.
    return tf.squeeze(resized, axis=[0])

  resized_image = tf.cond(
      tf.constant(output_height == output_width, dtype=tf.bool),
      _resize_square,
      _resize_non_square)
  resized_image.set_shape([output_height, output_width, 3])
  resized_image = tf.to_float(resized_image)

  label = one_hot_encoding(labels, num_classes=num_classes)

  resized_image = tf.image.random_flip_left_right(resized_image)
  resized_image = _mean_image_subtraction(
      resized_image, [_R_MEAN, _G_MEAN, _B_MEAN])
  return resized_image, label
def add_PCA_noise(image, mean=0.0, stddev=0.1):
  """Adds colour noise along the principal components of the pixel colours.

  The pixels are treated as a list of 3-vectors. Their channels are
  standardised, the 3x3 covariance of the standardised pixels is decomposed
  with an SVD, and Gaussian noise (drawn independently for every pixel and
  component, scaled by the corresponding singular value) is projected back
  onto the principal directions and added to the original pixel values. The
  result is clipped to [0, 255].

  Args:
    image: image tensor whose last dimension holds 3 channels.
    mean: mean of the Gaussian noise coefficients.
    stddev: standard deviation of the Gaussian noise coefficients.

  Returns:
    A float32 tensor with the same shape as `image`.
  """
  def tf_cov(x):
    # ref: https://stackoverflow.com/questions/47709854/how-to-get-covariance-matrix-in-tensorflow?rq=1
    mean_x = tf.reduce_mean(x, axis=0, keepdims=True)
    mx = tf.matmul(tf.transpose(mean_x), mean_x)
    vx = tf.matmul(tf.transpose(x), x) / tf.cast(tf.shape(x)[0], tf.float32)
    return vx - mx

  with tf.name_scope('AddPCANoise', values=[image]):
    original_shape = tf.shape(image)
    pixels = tf.cast(tf.reshape(image, [-1, 3]), tf.float32)

    centered = pixels - tf.reduce_mean(pixels, axis=0)
    channel_std = tf.keras.backend.std(pixels, axis=0)
    # Guard against a constant channel (std == 0), which would otherwise turn
    # the whole image into NaN.
    normalized = centered / tf.maximum(channel_std, 1e-8)

    cov_matrix = tf_cov(normalized)  # covariance matrix
    # For the symmetric covariance matrix the singular values are the eigen
    # values and the columns of `eig_vecs` are the eigen vectors.
    eig_vals, eig_vecs, _ = tf.linalg.svd(cov_matrix)

    rand = tf.random_normal(tf.shape(pixels), mean=mean, stddev=stddev)
    # Each row of `rand * eig_vals` holds per-component coefficients; the
    # noise is sum_i coeff_i * eig_vecs[:, i], i.e. coefficients @ eig_vecs^T.
    delta = tf.matmul(rand * tf.expand_dims(eig_vals, axis=0), eig_vecs,
                      transpose_b=True)

    noisy = tf.clip_by_value(pixels + delta,
                             clip_value_min=0.0, clip_value_max=255.0)
    # Restore the caller's layout instead of returning the flattened pixels.
    return tf.reshape(noisy, original_shape)
def preprocess_for_train(image, bboxes, labels, filename, num_classes,
                         output_height, output_width):
  """Preprocesses the given image and labels for training.

  Steps: random crop (which also prunes labels), one-hot encoding of the
  remaining labels, `_label_hacking`, square / non-square resizing, colour
  distortion and mean subtraction.

  Args:
    image: A `Tensor` representing an image of arbitrary size.
    bboxes: bounding boxes, passed to `_random_crop`.
    labels: labels, passed to `_random_crop`.
    filename: filename tensor, passed to `_label_hacking` and
      `_square_image_preprocess`.
    num_classes: number of classes passed to `one_hot_encoding`.
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.

  Returns:
    A tuple (image, label): the preprocessed image and its one-hot label.
  """
  # Random crop yields both the cropped image and the labels still visible.
  cropped, kept_labels = _random_crop(image, bboxes, labels)
  onehot = one_hot_encoding(kept_labels, num_classes=num_classes)
  onehot = _label_hacking(onehot, filename)

  # Resize, rotate, flip according to the characteristic of the image.
  # If the image is a square image, then do rotation according to cameraID.
  is_square = tf.constant(output_height == output_width, dtype=tf.bool)

  def _square_branch():
    return _square_image_preprocess(cropped, filename, output_height)

  def _non_square_branch():
    return _non_square_image_preprocess(cropped, output_height, output_width)

  resized = tf.cond(is_square, _square_branch, _non_square_branch)

  # Adjust color.
  distorted = random_distort_color(tf.to_float(resized), color_ordering=2)
  distorted.set_shape([output_height, output_width, 3])

  normalized = _mean_image_subtraction(
      distorted, [_R_MEAN, _G_MEAN, _B_MEAN])
  return normalized, onehot
def preprocess_for_eval(image, bboxes, labels, filename, num_classes,
                        output_height, output_width):
  """Preprocesses the given image and labels for evaluation.

  The image is resized to [output_height, output_width] and has the
  per-channel means subtracted; no random augmentation is applied. When the
  target is square the resize goes through `_small_aspect_preserving_resize`
  and `center_pad_to_square`; otherwise a plain bilinear resize is used.

  Args:
    image: A `Tensor` representing an image of arbitrary size.
    bboxes: unused by this function.
    labels: the labels, converted with `one_hot_encoding`.
    filename: filename tensor, passed to `_label_hacking`.
    num_classes: number of classes passed to `one_hot_encoding`.
    output_height: The height of the image after preprocessing (python int).
    output_width: The width of the image after preprocessing (python int).

  Returns:
    A tuple (image, label): the preprocessed image and its one-hot label.
  """
  def _resize_square():
    return center_pad_to_square(
        _small_aspect_preserving_resize(image, output_height))

  def _resize_non_square():
    resized = tf.image.resize_bilinear(
        tf.expand_dims(image, 0), [output_height, output_width],
        align_corners=False)
    # Squeeze only the batch axis so a size-1 height/width is preserved.
    return tf.squeeze(resized, axis=[0])

  resized_image = tf.cond(
      tf.constant(output_height == output_width, dtype=tf.bool),
      _resize_square,
      _resize_non_square)
  resized_image.set_shape([output_height, output_width, 3])
  resized_image = tf.to_float(resized_image)

  label = one_hot_encoding(labels, num_classes=num_classes)
  label = _label_hacking(label, filename)
  return _mean_image_subtraction(
      resized_image, [_R_MEAN, _G_MEAN, _B_MEAN]), label
def preprocess_for_freezing(image):
  """Mean-subtracts `image`; used by `preprocess_image` when freezing.

  NOTE(review): this calls `_mean_images_subtraction` (plural), while the
  train/eval paths call `_mean_image_subtraction` -- confirm that the plural
  helper exists and is the intended one.

  Args:
    image: the image tensor to normalise.

  Returns:
    The result of `_mean_images_subtraction` for the RGB channel means.
  """
  channel_means = [_R_MEAN, _G_MEAN, _B_MEAN]
  return _mean_images_subtraction(image, channel_means)
def preprocess_image(image, output_height, output_width,
                     bboxes, labels, filename, num_classes,
                     use_more_augmentation=True, is_training=False,
                     is_freezing=False, use_PCA_noise=False):
  """Dispatches to the preprocessing routine matching the requested mode.

  Mode selection:
    * is_training and use_more_augmentation -> `preprocess_for_train`
    * is_training only                      -> `preprocess_for_train_easy`
    * not is_training and is_freezing       -> `preprocess_for_freezing`
    * otherwise                             -> `preprocess_for_eval`

  Args:
    image: the image tensor to preprocess.
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.
    bboxes: rank 2 float32 tensor containing
      the bounding boxes -> [N, 4].
      Boxes are in normalized form meaning
      their coordinates vary between [0, 1].
      Each row is in the form
      of [ymin, xmin, ymax, xmax].
    labels: the labels that belong to `bboxes`.
    filename: tf.string tensor holding the filename of the image.
    num_classes: number of classes used for the one-hot labels.
    use_more_augmentation: selects between the two training routines; only
      consulted when `is_training` is true.
    is_training: `True` if we're preprocessing the image for training and
      `False` otherwise.
    is_freezing: only consulted when `is_training` is false.
    use_PCA_noise: forwarded to `preprocess_for_train_easy` only.

  Returns:
    The return value of the selected routine. The freezing path returns the
    preprocessed image only; the other paths return an (image, label) tuple.
  """
  # TODO check if image is rank 4 or 3
  if not is_training:
    if is_freezing:
      return preprocess_for_freezing(image)
    return preprocess_for_eval(image, bboxes, labels, filename, num_classes,
                               output_height, output_width)

  if use_more_augmentation:
    return preprocess_for_train(image, bboxes, labels, filename, num_classes,
                                output_height, output_width)
  return preprocess_for_train_easy(image, bboxes, labels, filename,
                                   num_classes, output_height, output_width,
                                   use_PCA_noise)
def batch_preprocess_fn(images, labels, filenames, is_training=True):
  """Batch-level preprocessing: mix-up augmentation for training batches.

  Args:
    images: rank 4 float32 tensor contains
      N image -> [N, height, width, 3].
    labels: rank 2 float32 tensor containing
      the (multilabel) onehot labels -> [N, num_classes].
    filenames: rank 1 string tensor contains N string.
    is_training: python bool. When false the batch is returned untouched,
      since mix-up is a training-only augmentation.

  Returns:
    The preprocessed images and labels.
  """
  if not is_training:
    return images, labels
  return _mix_up_augmentation(images, labels, filenames)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment