import numpy as np
# make a minibatch of time, batch, features
# time length 7
# batch size 2
# feature dimension 4:
# 1:4, 11:14, 21:24, 31:34, etc for the first minibatch element
# 6:9, 16:19, 26:29, etc for the second minibatch element
n_features = 4
n_timesteps = 7
base_mb1_features = np.arange(n_features) + 1
time_mb1_features = 10 * np.arange(n_timesteps)[:, None] + base_mb1_features[None]
base_mb2_features = np.arange(n_features) + 5 + 1
time_mb2_features = 10 * np.arange(n_timesteps)[:, None] + base_mb2_features[None]
data = np.concatenate((time_mb1_features[:, None], time_mb2_features[:, None]), axis=1)
time_len = data.shape[0]
minibatch_size = data.shape[1]
features = data.shape[2]
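# quick sanity check: these values follow directly from the construction above
assert data.shape == (n_timesteps, minibatch_size, n_features)
assert data[0, 0, 0] == 1 and data[1, 0, 0] == 11
assert data[0, 1, 0] == 6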
# for each sequence in the minibatch, we will make an autoregressive mask and equivalent targets for each step
# the new assumption is that the "feature" dimension is the one for autoregression
# which should be more natural compared to the previous example
# 1, 2, 3, 4 -> in: 0, 0, 0, 0 ; target: 1
# 1, 2, 3, 4 -> in: 1, 0, 0, 0 ; target: 2
# 1, 2, 3, 4 -> in: 1, 2, 0, 0 ; target: 3
# 1, 2, 3, 4 -> in: 1, 2, 3, 0 ; target: 4
# accomplished using np.triu with k=1
# [[1, 1, 1]     [[0, 1, 1]
#  [1, 1, 1]  ->  [0, 0, 1]
#  [1, 1, 1]]     [0, 0, 0]]
mask_array = np.triu(np.ones((features, features)), k=1)
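# sanity check: column j of this mask keeps exactly the first j features,
# matching the desired autoregressive inputs listed above
assert np.array_equal(mask_array[:, 0], [0, 0, 0, 0])
assert np.array_equal(mask_array[:, 3], [1, 1, 1, 0])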
# now we have a (4, 4) mask to broadcast against the (7, 2, 4) data
# (each desired input above is a column of this mask times the feature vector)
#
# results in 4, 4, 7, 2 which is basically features, "feature time" (autoregressive), time, minibatch
masked_and_copied = mask_array[:, :, None, None] * data.transpose(2, 0, 1)[:, None]
# there are still extra 0s at the end of each masked input, but we leave them alone for now
# now we transpose it to
# "feature_time", time, minibatch, features
masked_and_copied = masked_and_copied.transpose(1, 2, 3, 0)
# at the 0th timestep, 0th minibatch element, looping over the "autoregressive" axis (axis 0) we see
# masked_and_copied[0, 0, 0] = [0, 0, 0, 0]
# masked_and_copied[1, 0, 0] = [1, 0, 0, 0]
# masked_and_copied[2, 0, 0] = [1, 2, 0, 0]
# masked_and_copied[3, 0, 0] = [1, 2, 3, 0]
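# the same statements, as executable checks
assert masked_and_copied.shape == (n_features, n_timesteps, minibatch_size, n_features)
assert np.array_equal(masked_and_copied[0, 0, 0], [0, 0, 0, 0])
assert np.array_equal(masked_and_copied[1, 0, 0], [1, 0, 0, 0])
assert np.array_equal(masked_and_copied[2, 0, 0], [1, 2, 0, 0])
assert np.array_equal(masked_and_copied[3, 0, 0], [1, 2, 3, 0])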
ar_data = masked_and_copied.copy()
ar_data_shape = ar_data.shape
# make the targets
ar_targets = data.transpose(2, 0, 1)[..., None]
ar_targets_shape = ar_targets.shape
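# targets line up with the masked inputs: autoregressive step j predicts feature j
assert ar_targets.shape == (n_features, n_timesteps, minibatch_size, 1)
assert ar_targets[0, 0, 0, 0] == 1 and ar_targets[3, 0, 0, 0] == 4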
# we rearrange (and un-rearrange) with this function to get the "normal" training layout of minibatch, features
def ar_minibatch_conversion(arr, original_shape=None, inverse=False):
    # expects
    # "feature_time", time, minibatch, features
    # (we keep the all-zero first step; it could be skipped if unwanted)
    if not inverse:
        # ar, t, mb, f -> t, mb * ar, f
        # mb-major ordering keeps each element's masked steps contiguous,
        # so we can later do a reshape/structured sum to average them
        shp = arr.shape
        arr = arr.transpose(1, 2, 0, 3)
        arr = arr.reshape(shp[1], shp[2] * shp[0], shp[3])
        # now we have t, mb * ar, f
        # *each* of these can be fed through a network, effectively running all the steps in parallel (assuming no hidden state is passed)
        return arr
    else:
        # invert the forward procedure, original_shape is REQUIRED here
        shp = original_shape
        arr = arr.reshape(shp[1], shp[2], shp[0], shp[3])
        arr = arr.transpose(2, 0, 1, 3)
        return arr
flat_data = ar_minibatch_conversion(ar_data)
orig_data = ar_minibatch_conversion(flat_data, ar_data_shape, inverse=True)
flat_targets = ar_minibatch_conversion(ar_targets)
orig_targets = ar_minibatch_conversion(flat_targets, ar_targets_shape, inverse=True)
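# the conversion is lossless: forward then inverse is an exact round trip
assert np.array_equal(orig_data, ar_data)
assert np.array_equal(orig_targets, ar_targets)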
# normally you would do something like step_preds = f(flat_data)
# per_step_loss = (step_preds - flat_targets) ** 2
# loss = per_step_loss.sum() or loss = per_step_loss.mean()
# loss.backward()
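# illustrative only: a random linear map standing in for a real network f,
# just to show the shapes of the loss computation sketched above
rng = np.random.RandomState(0)
W = rng.randn(n_features, 1)
step_preds = flat_data.dot(W)  # (time, mb * ar, 1)
per_step_loss = (step_preds - flat_targets) ** 2
loss = per_step_loss.mean()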
for i in range(n_features):
    print("flat")
    # show that the flat axis is blocked per minibatch element:
    # indices 0:4 are the autoregressive steps of the first element
    print(flat_data[:, i])
    print(flat_targets[:, i])
# this should match the previous
# this setup is available if you wanted to do more structured losses than just an average per step
for i in range(n_features):
    print("orig")
    print(orig_data[i, :, 0])
    print(orig_targets[i, :, 0])
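# sketch of the structured-loss idea mentioned above: map the per-step losses
# back to (ar, time, mb, 1) and weight the autoregressive steps
# (the weights here are arbitrary, purely for illustration)
per_step_loss_orig = ar_minibatch_conversion(per_step_loss, ar_targets_shape, inverse=True)
ar_weights = np.linspace(0.5, 1.0, n_features)[:, None, None, None]
structured_loss = (ar_weights * per_step_loss_orig).mean()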