rmsander · August 7, 2021 02:24
diff --git a/uniform_same_class_selection_load_batches.py b/uniform_same_class_selection_load_batches.py
 import numpy as np
 import tensorflow as tf

 def vector_same_class_sample_from_file(Y, y_sample, base_paths_by_class=None, X=None, num_classes=2):
  """Function for generating equal-class Mixup pairs in a vectorized fashion.
  Essentially, what we'll do is separate X into different sets according to the
  class of the corresponding index in Y. Then, given y_sample, a set of classes
  of the first set of images/vectors that we use for Mixup, we pair the same classes
  together appropriately by only sampling corresponding same-class images/vectors
  from X using appropriate indexing.
  
  Unlike the other vector_same_class_sample function, this function loads batches of
  images at a time from file (i.e. is cognizant of large file constraints).
  
  NOTE: Either base_paths_by_class or X must be specified.

  Parameters:
    Y (tf.Tensor): TensorFlow tensor representing the target values used for
      training the model. Expects shape (N, 1) if sing binary classification, and
      (N, M) if multiclass classification, where M is the number of channels.
    y_sample (tf.Tensor): A sample of labels of shape (B, 1) - binary classification
      or (B, M) - multiclass classification, where B is the batch size and M is
      the number of classes considered.
    base_paths_by_class (list): A list of strings, where each string element provides
      the path to the given set of images. Assumes that each image can be indexed using
      a numeric index, e.g. i = 1. Defaults to None.
    X (tf.Tensor): TensorFlow tensor representing the input values used for
      prediction. Expects shape (N, W, H, C), where N is the total dataset size, W is
      the width, H is the height, and C is the channels if we are using images, or
      (N, D) if using vectors, where D is the dimensionality of the vector. Defaults
      to None.
    num_classes (int): The total number of classes in this classification problem.
      Defaults to 2 (binary classification).

  Returns:
    X_sample (tf.Tensor): A batch of inputs corresponding to the second set of
      inputs to be used for Mixup. This should have shape (B,) + X.shape.
  """
  # Create as NumPy arrays for easier manipulation
  Y_np = Y.numpy()
  y_sample_np = y_sample.numpy()
  
  # If X provided, convert to NumPy file
  if X is not None:
    X_np = X.numpy()
  
  # Create datasets
  D_idx = {}
  for i in range(num_classes):
  
    # Now take "slice of dataset" using these indices
    D_idx[i] = np.argwhere(Y_np == i)

  # Initalize output array
  output_shape = (y_sample.numpy().shape[0],) + X_np.shape[1:]
  X_sample = np.zeros(output_shape)

  # Now sample from datasets
  for j in range(num_classes):

    # Classes from sample
    idx_class_sample = np.argwhere(y_sample_np == j).reshape(-1)  # Samples where y = j
    N = idx_class_sample.size  # Number of these we need to sample from X

    # Indices to sample from
    idx_choice = D_idx[j].reshape(-1)  # Look up eligible indices

    # Now sample this many samples from X 
    sampled_idx = np.random.choice(idx_choice, size=N, replace=True)  # Sample uniformly (iid) from "eligible" indices
    
    # From this, construct applicable filenmaes to load from file
    # TODO: load images from file as a numpy array or tiff
    if base_paths_by_class is not None:
      file_names = [base_paths_by_class[j].format(sample) for sample in sampled_idx]
      X_sample[idx_class_sample] = np.concatenate([cv.imread(f) for f in file_names])
    
    # Assumes we are given the whole dataset
    else:
      X_sample[idx_class_sample] = X_np[sampled_idx]  # Set output array for matching labels with sampled "eligible" indices
  
  return tf.convert_to_tensor(X_sample)

 # Test

 # Create datasets
 Y1 = tf.convert_to_tensor(np.zeros(500))
 Y2 = tf.convert_to_tensor(np.ones(500))
 Y3 = 2 * tf.convert_to_tensor(np.ones(500))
 Y = tf.concat([Y1, Y2, Y3], axis=0)

 X1 = tf.convert_to_tensor(np.zeros((500, 10)))
 X2 = tf.convert_to_tensor(np.ones((500, 10)))
 X3 = 2 * tf.convert_to_tensor(np.ones((500, 10)))
 X = tf.concat([X1, X2, X3], axis=0)

 # Now randomly sample from X and Y
 idx_sample = np.random.choice(1500, size=32).reshape((-1, 1))
 y_sample = tf.gather_nd(Y, idx_sample)

 # Now run function above
 X_sample = vector_same_class_sample_from_file(Y, y_sample, num_classes=3, X=X)

 print("Y SAMPLE: {}".format(y_sample))

 print("X SAMPLE: {}".format(X_sample))

 # Should pass this test
 assert np.all(np.mean(X_sample, axis=1) == y_sample)
 print("TEST PASSED!")
	import numpy as np
	import tensorflow as tf

	def vector_same_class_sample_from_file(Y, y_sample, base_paths_by_class=None, X=None, num_classes=2):
	"""Function for generating equal-class Mixup pairs in a vectorized fashion.
	Essentially, what we'll do is separate X into different sets according to the
	class of the corresponding index in Y. Then, given y_sample, a set of classes
	of the first set of images/vectors that we use for Mixup, we pair the same classes
	together appropriately by only sampling corresponding same-class images/vectors
	from X using appropriate indexing.

	Unlike the other vector_same_class_sample function, this function loads batches of
	images at a time from file (i.e. is cognizant of large file constraints).

	NOTE: Either base_paths_by_class or X must be specified.

	Parameters:
	Y (tf.Tensor): TensorFlow tensor representing the target values used for
	training the model. Expects shape (N, 1) if sing binary classification, and
	(N, M) if multiclass classification, where M is the number of channels.
	y_sample (tf.Tensor): A sample of labels of shape (B, 1) - binary classification
	or (B, M) - multiclass classification, where B is the batch size and M is
	the number of classes considered.
	base_paths_by_class (list): A list of strings, where each string element provides
	the path to the given set of images. Assumes that each image can be indexed using
	a numeric index, e.g. i = 1. Defaults to None.
	X (tf.Tensor): TensorFlow tensor representing the input values used for
	prediction. Expects shape (N, W, H, C), where N is the total dataset size, W is
	the width, H is the height, and C is the channels if we are using images, or
	(N, D) if using vectors, where D is the dimensionality of the vector. Defaults
	to None.
	num_classes (int): The total number of classes in this classification problem.
	Defaults to 2 (binary classification).

	Returns:
	X_sample (tf.Tensor): A batch of inputs corresponding to the second set of
	inputs to be used for Mixup. This should have shape (B,) + X.shape.
	"""
	# Create as NumPy arrays for easier manipulation
	Y_np = Y.numpy()
	y_sample_np = y_sample.numpy()

	# If X provided, convert to NumPy file
	if X is not None:
	X_np = X.numpy()

	# Create datasets
	D_idx = {}
	for i in range(num_classes):

	# Now take "slice of dataset" using these indices
	D_idx[i] = np.argwhere(Y_np == i)

	# Initalize output array
	output_shape = (y_sample.numpy().shape[0],) + X_np.shape[1:]
	X_sample = np.zeros(output_shape)

	# Now sample from datasets
	for j in range(num_classes):

	# Classes from sample
	idx_class_sample = np.argwhere(y_sample_np == j).reshape(-1) # Samples where y = j
	N = idx_class_sample.size # Number of these we need to sample from X

	# Indices to sample from
	idx_choice = D_idx[j].reshape(-1) # Look up eligible indices

	# Now sample this many samples from X
	sampled_idx = np.random.choice(idx_choice, size=N, replace=True) # Sample uniformly (iid) from "eligible" indices

	# From this, construct applicable filenmaes to load from file
	# TODO: load images from file as a numpy array or tiff
	if base_paths_by_class is not None:
	file_names = [base_paths_by_class[j].format(sample) for sample in sampled_idx]
	X_sample[idx_class_sample] = np.concatenate([cv.imread(f) for f in file_names])

	# Assumes we are given the whole dataset
	else:
	X_sample[idx_class_sample] = X_np[sampled_idx] # Set output array for matching labels with sampled "eligible" indices

	return tf.convert_to_tensor(X_sample)

	# Test

	# Create datasets
	Y1 = tf.convert_to_tensor(np.zeros(500))
	Y2 = tf.convert_to_tensor(np.ones(500))
	Y3 = 2 * tf.convert_to_tensor(np.ones(500))
	Y = tf.concat([Y1, Y2, Y3], axis=0)

	X1 = tf.convert_to_tensor(np.zeros((500, 10)))
	X2 = tf.convert_to_tensor(np.ones((500, 10)))
	X3 = 2 * tf.convert_to_tensor(np.ones((500, 10)))
	X = tf.concat([X1, X2, X3], axis=0)

	# Now randomly sample from X and Y
	idx_sample = np.random.choice(1500, size=32).reshape((-1, 1))
	y_sample = tf.gather_nd(Y, idx_sample)

	# Now run function above
	X_sample = vector_same_class_sample_from_file(Y, y_sample, num_classes=3, X=X)

	print("Y SAMPLE: {}".format(y_sample))

	print("X SAMPLE: {}".format(X_sample))

	# Should pass this test
	assert np.all(np.mean(X_sample, axis=1) == y_sample)
	print("TEST PASSED!")