Keras Actor Critic in TensorFlow.net
using System;
using System.Collections.Generic;
using NumSharp;
using Tensorflow.Keras.Layers;
using Tensorflow.Keras.Losses;
using Tensorflow.Keras.Optimizers;
using Tensorflow.Keras.Utils;
using static Tensorflow.Binding;
using static Tensorflow.KerasApi;
using System.Linq;
using System.Threading;

namespace Tensorflow
{
    // Inspired by the TensorFlow sample of the Actor Critic method in the CartPole environment.
    internal class A2C
    {
        int seed = 42;
        float gamma = .99f;                 // discount factor for past rewards
        int max_steps_per_episode = 10000;
        int num_inputs = 4;
        int num_actions = 2;
        int num_hidden = 128;
        GymEnvironment env;
        // NOTE: Python's np.finfo(np.float32).eps.item() == 1.1920928955078125e-07.
        // .NET's float.Epsilon (1.401298E-45F) is the smallest denormal, not machine epsilon,
        // so the Python value is hard-coded here.
        float eps = 1.1920928955078125e-07F;

        public A2C()
        {
            env = GymEnvironment.make("CartPole-v0");
            env.seed(seed);
        }

        // TODO: allow input of action space and input space.
        Tensors inputs;
        Tensors common;
        Tensors action;
        Tensors critic;
        Tensors outputs;
        Keras.Engine.Functional model;
        private OptimizerV2 optimizer;
        private ILossFunc huber_loss;
        private List<Tensor> action_probs_history;
        private List<Tensor> critic_value_history;
        private List<float> rewards_history;
        private float running_reward;
        private int episode_count;
        float[] state;
        private float episode_reward;
        internal void Run()
        {
            // Build the model.
            var layers = new LayersApi();
            inputs = keras.Input(num_inputs, dtype: TF_DataType.TF_FLOAT);
            common = layers.Dense(num_hidden, activation: "relu").Apply(inputs);
            action = layers.Dense(num_actions, activation: "softmax").Apply(common);
            critic = keras.layers.Dense(1).Apply(common);
            outputs = new Tensors(action, critic);
            model = keras.Model(inputs, outputs, name: "a2c");

            // optimizer = keras.optimizers.Adam(learning_rate=0.01)
            optimizer = keras.optimizers.Adam(learning_rate: 0.01f);
            // huber_loss = keras.losses.Huber()
            huber_loss = keras.losses.Huber();
            // action_probs_history = []
            action_probs_history = new List<Tensor>();
            // critic_value_history = []
            critic_value_history = new List<Tensor>();
            // rewards_history = []
            rewards_history = new List<float>();
            // running_reward = 0
            running_reward = 0f;
            // episode_count = 0
            episode_count = 0;

            train();
        }
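
        // Network layout built in Run():
        //   Input(4 observations) -> Dense(128, relu) -> Dense(2, softmax)  (actor head: action probabilities)
        //                                             -> Dense(1)           (critic head: estimated return)
        // The actor and critic share the hidden layer, which is the layout of the Keras
        // "Actor Critic Method" CartPole example this gist ports.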
        void train()
        {
            while (true) // run until solved
            {
                var tensorstate = env.reset();
                episode_reward = 0;
                using (var tape = tf.GradientTape())
                {
                    for (var timestep = 1; timestep < max_steps_per_episode; timestep++)
                    {
                        // env.render(); // TODO: rendering not implemented
                        var stateAsTensor2 = tf.expand_dims(tensorstate, axis: 0);

                        // Predict action probabilities and estimated future rewards
                        // from the environment state.
                        // Python original: action_probs, critic_value = model(state)
                        // NOTE: model.predict does not record onto the GradientTape the way a
                        // direct model call does, so gradients will generally not flow back
                        // through these outputs.
                        var result = model.predict(stateAsTensor2);
                        var action_probs = result[0];
                        var critic_value = result[1];
                        var output = critic_value.ToArray<float>();
                        critic_value_history.Add(critic_value[0, 0]);

                        // Sample an action from the predicted probability distribution.
                        // A custom random choice is used because np.random.choice
                        // throws a NotImplementedException:
                        //   np.random.choice(num_actions, probabilities: propabilities);
                        var propabilities = action_probs.ToArray<double>();
                        var action = RandomChoice.Choice(num_actions, propabilities);
                        var loginput = propabilities[action];
                        var probLog = tf.math.log(action_probs[0, action]);
                        action_probs_history.Add(probLog);

                        // Python original: state, reward, done, _ = env.step(action)
                        var stepResult = env.step(action);
                        tensorstate = stepResult.state;
                        state = stepResult.state.ToArray<float>();
                        var reward = stepResult.reward;
                        rewards_history.Add(reward);
                        episode_reward += reward;

                        if (stepResult.done)
                            break;
                    }

                    // Update the running reward to check the condition for solving.
                    running_reward = 0.05f * episode_reward + (1 - 0.05f) * running_reward;

                    // Calculate the expected value from rewards:
                    // - at each timestep, the total reward received after that timestep;
                    // - rewards in the past are discounted by multiplying them with gamma;
                    // - these are the labels for our critic.
                    var returns = new List<float>();
                    var discounted_sum = 0f;
                    // Walk the rewards backwards (as in the Python sample) so that
                    // returns[t] is the discounted sum of rewards from timestep t onward.
                    for (var i = rewards_history.Count - 1; i >= 0; i--)
                    {
                        var r = rewards_history[i];
                        discounted_sum = r + gamma * discounted_sum;
                        returns.Insert(0, discounted_sum);
                    }
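                    // Example: for rewards [1, 1, 1] and gamma = 0.99 the loop yields
                    // returns = [2.9701, 1.99, 1], i.e. each entry is
                    // G_t = r_t + gamma * G_{t+1}, the discounted return from that
                    // timestep to the end of the episode.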
                    // Normalize the returns.
                    var npReturns = np.array(returns.ToArray());
                    npReturns = (npReturns - np.mean(npReturns)) / (np.std(npReturns) + eps);
                    var returnsASList = npReturns.ToArray<float>();

                    // Python original: history = zip(action_probs_history, critic_value_history, returns)
                    var actor_losses = new List<Tensor>();
                    var critic_losses = new List<Tensor>();
                    for (var i = 0; i < action_probs_history.Count; i++)
                    {
                        var log_prob = action_probs_history[i];
                        var value = critic_value_history[i];
                        // Use the normalized returns computed above, as in the Python sample.
                        var ret = returnsASList[i];

                        // At this point in history, the critic estimated that we would get a
                        // total reward = `value` in the future. We took an action with log probability
                        // `log_prob` and ended up receiving a total reward = `ret`.
                        // The actor must be updated so that it predicts an action that leads to
                        // high rewards (compared to the critic's estimate) with high probability.
                        var diff = ret - value;
                        // Python original: actor_losses.append(-log_prob * diff)
                        actor_losses.Add(-log_prob * diff);

                        // The critic must be updated so that it predicts a better estimate of
                        // the future rewards.
                        var retTensor = tf.convert_to_tensor(ret);
                        var loss = huber_loss.Call(tf.expand_dims(value, 0), tf.expand_dims(retTensor, 0));
                        critic_losses.Add(loss);
                    }

                    // var loss_value = sum(actor_losses) + sum(critic_losses); // broken in TF.NET
                    // NOTE: summing through float arrays detaches the losses from the tape, so the
                    // gradients computed below will not reflect these operations; keeping the sum as
                    // tensor operations would be required for training to actually update the model.
                    var actor_losses_sum = actor_losses.SelectMany(x => x.ToArray<float>()).Sum();
                    var critic_losses_sum = critic_losses.SelectMany(x => x.ToArray<float>()).Sum();

                    // Backpropagation.
                    float loss_value = actor_losses_sum + critic_losses_sum;
                    Tensor loss_value_tensor = tf.convert_to_tensor(loss_value);
                    var grads = tape.gradient(loss_value_tensor, model.trainable_variables);
                    // Python original: optimizer.apply_gradients(zip(grads, model.trainable_variables))
                    var zipped = grads.Zip(model.trainable_variables.Cast<ResourceVariable>()).ToList();
                    optimizer.apply_gradients(zipped);
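
                    // A graph-connected alternative (sketch only; assumes tf.stack and
                    // tf.reduce_sum are exposed by this TensorFlow.NET build, and that the
                    // forward pass above was recorded on the tape):
                    //
                    //   var loss_tensor = tf.reduce_sum(tf.stack(actor_losses.ToArray()))
                    //                   + tf.reduce_sum(tf.stack(critic_losses.ToArray()));
                    //   var grads = tape.gradient(loss_tensor, model.trainable_variables);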
                    // Clear the loss and reward history.
                    action_probs_history.Clear();
                    critic_value_history.Clear();
                    rewards_history.Clear();
                }

                episode_count += 1;
                if (episode_count % 10 == 0)
                {
                    Console.WriteLine($"running reward: {running_reward.ToString("N2")} at {episode_count}");
                }
                if (running_reward > 195) // condition to consider the task solved
                {
                    print($"Solved at episode {episode_count}!");
                    break;
                }
            }
            // TODO: log details
        }
    }
    public static class RandomChoice
    {
        // From: https://stackoverflow.com/a/43345968/624988
        static readonly ThreadLocal<Random> _random = new ThreadLocal<Random>(() => new Random());

        static IEnumerable<T> Choice<T>(IList<T> sequence, int size, double[] distribution)
        {
            double sum = 0;
            // First change the shape of the probability distribution array:
            // it needs to be cumulative, that is,
            // if you have [0.1, 0.2, 0.3, 0.4]
            // we need     [0.1, 0.3, 0.6, 1.0] instead.
            var cumulative = distribution.Select(c =>
            {
                var result = c + sum;
                sum += c;
                return result;
            }).ToList();

            for (int i = 0; i < size; i++)
            {
                // Generate a random double; it is always in the range [0, 1).
                var r = _random.Value.NextDouble();
                // Find the first index in the cumulative array that is greater than or equal to the generated value.
                var idx = cumulative.BinarySearch(r);
                // If an exact match is not found, List.BinarySearch returns the index of the first item
                // greater than the passed value, encoded as a negative number (its bitwise complement),
                // so apply ~ to recover the real index.
                if (idx < 0)
                    idx = ~idx;
                if (idx > cumulative.Count - 1)
                    idx = cumulative.Count - 1; // rare case where the probabilities do not sum to exactly 1 because of double precision
                // Return the item at the chosen index.
                yield return sequence[idx];
            }
        }

        public static T Choice<T>(IList<T> sequence, double[] distribution)
        {
            return Choice(sequence, 1, distribution).First();
        }

        public static int Choice(int upTo, double[] distribution)
        {
            return Choice(Enumerable.Range(0, upTo).ToArray(), distribution);
        }
    }
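
    // Example usage (illustrative only): sample an action index from a two-action
    // distribution where index 1 is drawn with probability 0.7:
    //   int action = RandomChoice.Choice(2, new[] { 0.3, 0.7 });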
    class GymEnvironments
    {
        public const string CartPolev0 = "CartPole-v0";
    }

    public interface ILogger
    {
        void warn(string message);
    }

    public class ConsoleLogger : ILogger
    {
        public void warn(string message)
        {
            Console.ForegroundColor = ConsoleColor.Yellow;
            Console.WriteLine(message);
            Console.ResetColor();
        }
    }

    public abstract class GymEnvironment
    {
        protected ILogger logger = new ConsoleLogger();

        internal static GymEnvironment make(string v)
        {
            switch (v)
            {
                case GymEnvironments.CartPolev0:
                    return new CartPolev0();
                default:
                    throw new NotImplementedException();
            }
        }

        public abstract void seed(int seed);
        public abstract Tensor reset();

        public virtual EnvorinmentStepResult step(int action)
        {
            throw new NotImplementedException();
        }
    }
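
    // To support another Gym-style environment, subclass GymEnvironment, implement
    // seed()/reset()/step(), add its id to GymEnvironments, and extend the switch in
    // make(). For example (hypothetical, not implemented here):
    //   case GymEnvironments.MountainCarv0:
    //       return new MountainCarv0();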
    public class CartPolev0 : GymEnvironment
    {
        private float gravity;
        private float masscart;
        private float masspole;
        private float total_mass;
        private float length;
        private float polemass_length;
        private float force_mag;
        private float tau;
        private string kinematics_integrator;
        private float theta_threshold_radians;
        private float x_threshold;
        private NDArray high;
        private spaces action_space;
        private spaces observation_space;
        private NDArray state;
        private int? steps_beyond_done;
        NumPyRandom rng;

        public CartPolev0()
        {
            this.gravity = 9.8f;
            this.masscart = 1.0f;
            this.masspole = 0.1f;
            this.total_mass = (this.masspole + this.masscart);
            this.length = 0.5f; // actually half the pole's length
            this.polemass_length = (this.masspole * this.length);
            this.force_mag = 10.0f;
            this.tau = 0.02f; // seconds between state updates
            this.kinematics_integrator = "euler";

            // Angle at which to fail the episode.
            this.theta_threshold_radians = (float)(12 * 2 * Math.PI / 360);
            this.x_threshold = 2.4f;

            // Angle limit set to 2 * theta_threshold_radians so a failing observation
            // is still within bounds.
            var highValues = new float[] {
                this.x_threshold * 2,
                float.MaxValue, // np.finfo(np.float32).max
                this.theta_threshold_radians * 2,
                float.MaxValue  // np.finfo(np.float32).max
            };
            this.high = np.array(highValues);
            var negHigh = high.negative();
            this.action_space = spaces.Discrete(2);
            this.observation_space = spaces.Box(negHigh, high, np.float32);
            // this.seed();
            // this.viewer = None;
            this.state = null;
            this.steps_beyond_done = null;
        }

        int _seed;
        public override void seed(int seed)
        {
            this._seed = seed;
            rng = np.random.RandomState(seed);
        }

        public override Tensor reset()
        {
            // random_ops.random_uniform(new int[] { }, minval: -0.05f, maxval: 0.05f);
            var result = rng.uniform(-0.05f, 0.05f, (4));
            var asFloat = result.astype(NPTypeCode.Float);
            this.state = asFloat;
            steps_beyond_done = null;
            return np.array(asFloat);
        }
        public override EnvorinmentStepResult step(int action)
        {
            // Python original:
            //   err_msg = "%r (%s) invalid" % (action, type(action))
            //   assert self.action_space.contains(action), err_msg
            var stateAsArray = this.state.ToArray<float>();
            float x = stateAsArray[0];
            float x_dot = stateAsArray[1];
            float theta = stateAsArray[2];
            float theta_dot = stateAsArray[3];

            float force = action == 1 ? this.force_mag : -this.force_mag;
            float costheta = (float)Math.Cos(theta);
            float sintheta = (float)Math.Sin(theta);

            // For the interested reader:
            // https://coneural.org/florian/papers/05_cart_pole.pdf
            float temp = (force + this.polemass_length * (float)Math.Pow(theta_dot, 2) * sintheta) / this.total_mass;
            float thetaacc = (this.gravity * sintheta - costheta * temp) / (this.length * (4.0f / 3.0f - this.masspole * (float)Math.Pow(costheta, 2) / this.total_mass));
            float xacc = temp - this.polemass_length * thetaacc * costheta / this.total_mass;

            if (this.kinematics_integrator == "euler")
            {
                x = x + this.tau * x_dot;
                x_dot = x_dot + this.tau * xacc;
                theta = theta + this.tau * theta_dot;
                theta_dot = theta_dot + this.tau * thetaacc;
            }
            else // semi-implicit euler
            {
                x_dot = x_dot + this.tau * xacc;
                x = x + this.tau * x_dot;
                theta_dot = theta_dot + this.tau * thetaacc;
                theta = theta + this.tau * theta_dot;
            }
            this.state = new[] { x, x_dot, theta, theta_dot };

            var done = (
                x < -this.x_threshold
                || x > this.x_threshold
                || theta < -this.theta_threshold_radians
                || theta > this.theta_threshold_radians
            );

            var reward = 0f;
            if (!done)
            {
                reward = 1.0f;
            }
            else if (this.steps_beyond_done is null)
            {
                // Pole just fell!
                this.steps_beyond_done = 0;
                reward = 1.0f;
            }
            else
            {
                if (this.steps_beyond_done == 0)
                    logger.warn(
                        "You are calling 'step()' even though this " +
                        "environment has already returned done = True. You " +
                        "should always call 'reset()' once you receive 'done = " +
                        "True' -- any further steps are undefined behavior."
                    );
                this.steps_beyond_done += 1;
            }

            // Python original: return np.array(self.state), reward, done, {}
            var result = new EnvorinmentStepResult
            {
                state = np.array(this.state),
                reward = reward,
                done = done,
                data = new object[] { }
            };
            return result;
        }
    }
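
    // Minimal smoke-test sketch (not part of the original gist): drives CartPolev0 with
    // random actions for one capped episode to sanity-check reset()/step(). The class
    // name and the 200-step cap are illustrative choices, not anything defined above.
    internal static class CartPoleSmokeTest
    {
        public static void Run()
        {
            var env = GymEnvironment.make(GymEnvironments.CartPolev0);
            env.seed(0);
            env.reset();
            var rnd = new Random(0);
            var total = 0f;
            for (var t = 0; t < 200; t++)
            {
                var result = env.step(rnd.Next(0, 2)); // random action: 0 or 1
                total += result.reward;
                if (result.done)
                    break;
            }
            Console.WriteLine($"Random-policy episode reward: {total}");
        }
    }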
    public class EnvorinmentStepResult
    {
        public Tensor state;
        public float reward;
        public bool done;
        public object data;
    }

    public class spaces
    {
        public static Discrete Discrete(int value) => new Discrete(value);

        internal static Box Box(NDArray x, NDArray y, Type dtype)
            => new Box(x, y, dtype);
    }

    public class Discrete : spaces
    {
        public Discrete(int size)
        {
            this.Size = size;
        }

        public int Size { get; }
    }

    public class Box : spaces
    {
        public Box(NDArray x, NDArray y, Type dtype)
        {
            X = x;
            Y = y;
            Dtype = dtype;
        }

        public NDArray X { get; }
        public NDArray Y { get; }
        public Type Dtype { get; }
    }
}