Last active
September 6, 2021 18:58
-
-
Save rmsander/3cf3f70c615a6a46fcfd96681ec55ae9 to your computer and use it in GitHub Desktop.
Functions and script to generate a reward surface with state and action inputs. Configured for the Pendulum-v0 Gym environment.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Functions and script to generate a reward surface with state and | |
action inputs. Configured for the Pendulum-v0 Gym environment.""" | |
import gym | |
import matplotlib.pyplot as plt | |
import numpy as np | |
# Number of points per axis; the full grid therefore holds SAMPLES**3 points
SAMPLES = 50
# Seed for NumPy's global random number generator
SEED = 42
# (low, high) ranges, ordered as theta, theta-dot, action
BOUNDS = [(-np.pi, np.pi), (-8, 8), (-2, 2)]

# Seed the global NumPy RNG so that random sampling is reproducible
np.random.seed(SEED)
def sample_uniformly(N, bounds):
    """Draw N**3 i.i.d. uniform state-action samples and compute their rewards.

    Args:
        N: Per-axis sample count; N**3 points are drawn in total.
        bounds: Exactly three (low, high) pairs, unpacked in order as the
            ranges of the first state, the second state and the action.

    Returns:
        A tuple (X, Y). X has shape (N**3, 3) with columns (s1, s2, a);
        Y has shape (N**3,) and holds compute_reward for each row of X.
    """
    (low_s1, high_s1), (low_s2, high_s2), (low_a, high_a) = bounds
    num_samples = N ** 3

    # Keep the draw order s1, s2, a so that a given seed of the global NumPy
    # RNG keeps producing the same samples.
    s1 = np.random.uniform(low=low_s1, high=high_s1, size=num_samples)
    s2 = np.random.uniform(low=low_s2, high=high_s2, size=num_samples)
    a = np.random.uniform(low=low_a, high=high_a, size=num_samples)

    # One row per sample, one column per dimension
    X = np.stack((s1, s2, a), axis=-1)

    # compute_reward unpacks its argument into three components and applies
    # only elementwise arithmetic, so handing it the three column arrays
    # scores every sample in one call instead of N**3 Python-level calls.
    Y = compute_reward((s1, s2, a))
    return X, Y
def plot_points(X, Y, title="Analytic Reward Over State-Action Space"):
    """Show a 3D scatter plot of state-action points coloured by reward.

    Args:
        X: 2D array whose columns 0, 1 and 2 are plotted on the theta,
            theta-dot and action axes respectively.
        Y: Per-point values that determine the marker colours.
        title: Title placed on the 3D axes.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.set_xlabel('Theta (th)')
    ax.set_ylabel('Theta dot (thdot)')
    ax.set_zlabel('Action (u)')

    # Name the colormap explicitly. plt.hot() returns None, so passing its
    # result as cmap only worked because the call also switched the global
    # default colormap as a side effect.
    img = ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=Y, cmap="hot")
    fig.colorbar(img, orientation="horizontal", pad=0.03)

    # Title the 3D axes directly, and do so before tight_layout so the
    # layout pass can make room for it.
    ax.set_title(title)
    fig.tight_layout()
    plt.show()
def angle_normalize(x):
    """Wrap an angle in radians into the interval [-pi, pi).

    Works on scalars and, elementwise, on NumPy arrays.
    """
    full_turn = 2 * np.pi
    # Shift by pi so the modulo lands in [0, 2*pi), then shift back
    shifted = x + np.pi
    return (shifted % full_turn) - np.pi
def compute_reward(x):
    """Analytic function to compute reward given states and action.

    Args:
        x: Sequence or array that unpacks into exactly three components
            (th, thdot, u). The components may be scalars or equally
            shaped NumPy arrays, since only elementwise arithmetic is used.

    Returns:
        -(angle_normalize(th)**2 + 0.1 * thdot**2 + 0.001 * u**2), with the
        same shape as the components.
    """
    # The docstring must be the first statement of the body; placed after
    # the unpacking it was a no-op string and __doc__ stayed None.
    th, thdot, u = x
    return -(angle_normalize(th) ** 2 + .1 * thdot ** 2 + .001 * (u ** 2))
def generate_lin_space_samples(N, bounds):
    """Compute the reward on a regular N x N x N grid over the given bounds.

    Each axis holds N evenly spaced values that start at its lower bound;
    the upper bound itself is excluded (half-open interval).

    Args:
        N: Number of grid points per axis.
        bounds: Exactly three (low, high) pairs, unpacked in order as the
            ranges of the first state, the second state and the action.

    Returns:
        A tuple (X, Y). X has shape (N**3, 3) with columns (s1, s2, a) and
        rows ordered so that the first column varies slowest; Y has shape
        (N**3,) and holds compute_reward for each row of X.
    """
    (low_s1, high_s1), (low_s2, high_s2), (low_a, high_a) = bounds

    # np.linspace takes the point count directly. Slicing np.mgrid with a
    # floating-point step derives the count from ceil((high - low) / step),
    # which rounding error can push to N + 1; X would then have more rows
    # than the N**3 rewards computed for it.
    s1_axis = np.linspace(low_s1, high_s1, N, endpoint=False)
    s2_axis = np.linspace(low_s2, high_s2, N, endpoint=False)
    a_axis = np.linspace(low_a, high_a, N, endpoint=False)

    # "ij" indexing keeps the axis order (s1, s2, a), so flattening yields
    # the same row order as reshaping the mgrid output did.
    grid = np.meshgrid(s1_axis, s2_axis, a_axis, indexing="ij")
    X = np.stack(grid, axis=-1).reshape(-1, 3)

    # compute_reward unpacks its argument into three components and applies
    # only elementwise arithmetic, so the (3, N**3) transpose scores the
    # whole grid in a single call.
    Y = compute_reward(X.T)
    return X, Y
def main():
    """Compute the reward on the configured grid and display it."""
    grid_points, grid_rewards = generate_lin_space_samples(SAMPLES, BOUNDS)
    plot_points(grid_points, grid_rewards)


# Run only when executed as a script, not when imported
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment