@pranz24
Last active November 18, 2022 02:45
Solving LunarLander-v2 using CMA-ES in Pybrain
# A small neural network is trained using the Covariance Matrix Adaptation Evolution Strategy (CMA-ES).
# The idea is from this paper: https://arxiv.org/abs/1604.00772
# This gist uses pybrain (http://pybrain.org/)
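# In short, CMA-ES maintains a multivariate Gaussian over the network's weight vector;
# each generation it samples a population of candidate weight vectors, scores them with
# the fitness function below, and shifts the mean and covariance toward the better samples.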
import logging
import os
import numpy as np
import gym
from gym import wrappers
from pybrain.tools.shortcuts import buildNetwork
from pybrain.structure.modules import TanhLayer
#from pybrain.structure.modules import LinearLayer
#from pybrain.structure.modules import ReluLayer
from pybrain.optimization import CMAES
# Building a simple neural network with two hidden layers (16 and 12 units)
def buildNet(n_input, n_output):
    # The algorithm also converges using linear and relu layers at the hidden or output unit.
    return buildNetwork(n_input, 16, 12, n_output, hiddenclass=TanhLayer,
                        outclass=TanhLayer, outputbias=False, recurrent=False)
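# The network maps LunarLander's 8-dimensional observation to one output per discrete
# action (4 for LunarLander-v2); the action executed is the argmax over those outputs.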
def trainNetwork(env):
    # The training function uses the average per-episode cumulative reward plus the
    # average per-episode maximum of ob[0] as the fitness.
    def objF(params):
        nn = buildNet(len(env.observation_space.high), env.action_space.n)
        nn._setParameters(np.array(params))
        cum_reward = 0
        highest_count = []
        episode_count = 15
        max_steps = 1200
        for i in range(episode_count):
            highest = -1
            ob = env.reset()
            for j in range(max_steps):
                result = nn.activate(ob)
                action = np.argmax(result)
                ob, reward, done, _ = env.step(action)
                cum_reward += reward
                if highest < ob[0]:
                    highest = ob[0]
                if done:
                    break
            nn.reset()
            highest_count.append(highest)
        return (sum(highest_count) + cum_reward) / len(highest_count)
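    # CMAES treats objF as a black box: each candidate weight vector it samples is
    # scored by rolling out 15 full episodes with those weights.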
    # Build net for initial random params
    n = buildNet(len(env.observation_space.high), env.action_space.n)
    x0 = n.params
    l = CMAES(objF, x0, verbose=True)
    # Some arbitrary desired fitness value
    l.desiredEvaluation = -3600
    l.maxLearningSteps = 200
    l.mustMaximize = True
    l.minimize = False
    learned = l.learn()
    return learned
if __name__ == '__main__':
    # Initialization
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    env = gym.make('LunarLander-v2')
    print(env.observation_space)
    print(env.action_space)
    outdir = '/tmp/CMAES-LunarLander-results'
    env = gym.wrappers.Monitor(env, outdir, force=True)
    learned = trainNetwork(env)
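    # learn() returns a (best parameters, best fitness) pair, so learned[0]
    # holds the evolved weight vector.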
    nn = buildNet(len(env.observation_space.high), env.action_space.n)
    nn._setParameters(np.array(learned[0]))
    env.close()
    # Test the trained network
    episode_count = 300
    max_steps = 1200
    for i in range(episode_count):
        ob = env.reset()
        for j in range(max_steps):
            result = nn.activate(ob)
            action = np.argmax(result)
            ob, reward, done, _ = env.step(action)
            if done:
                nn.reset()
                break