Created
January 17, 2017 04:02
-
-
Save rajshah4/677698166b860bc11d34507ee18b2d41 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""classic Acrobot task""" | |
##Updated by Rajiv Shah to get the pendulum to balance for an extended time | |
from gym import core, spaces | |
from gym.utils import seeding | |
import numpy as np | |
from numpy import sin, cos, pi | |
import time | |
__copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy" | |
__credits__ = ["Alborz Geramifard", "Robert H. Klein", "Christoph Dann", | |
"William Dabney", "Jonathan P. How"] | |
__license__ = "BSD 3-Clause" | |
__author__ = "Christoph Dann <[email protected]>" | |
# SOURCE: | |
# https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py | |
class AcrobotEnv(core.Env):
    """
    Acrobot is a 2-link pendulum with only the second joint actuated.
    Initially, both links point (approximately) downwards. In the classic
    task the goal is to swing the end-effector to a height at least the
    length of one link above the base. Both links can swing freely and can
    pass by each other, i.e., they don't collide when they have the same
    angle.

    This variant shapes the reward so that the agent is encouraged to *keep*
    the end-effector high for an extended time: the reward grows with the
    tip height ``-cos(theta1) - cos(theta1 + theta2)`` and with the number
    of consecutive high steps (``steps_beyond_done``). The episode ends once
    that counter exceeds 60. Gravity defaults to 0 (see ``GRAVITY``).

    **STATE:**
    The state consists of the two rotational joint angles and their velocities
    [theta1 theta2 thetaDot1 thetaDot2]. An angle of 0 corresponds to the
    respective link pointing downwards (angles are in world coordinates).

    **ACTIONS:**
    The action is either applying +1, 0 or -1 torque on the joint between
    the two pendulum links.

    .. note::
        The dynamics equations were missing some terms in the NIPS paper which
        are present in the book. R. Sutton confirmed in personal correspondence
        that the experimental results shown in the paper and the book were
        generated with the equations shown in the book.
        However, there is the option to run the domain with the paper equations
        by setting book_or_nips = 'nips'

    **REFERENCE:**

    .. seealso::
        R. Sutton: Generalization in Reinforcement Learning:
        Successful Examples Using Sparse Coarse Coding (NIPS 1996)

    .. seealso::
        R. Sutton and A. G. Barto:
        Reinforcement learning: An introduction.
        Cambridge: MIT press, 1998.

    .. warning::
        This version of the domain uses the Runge-Kutta method for integrating
        the system dynamics and is more realistic, but also considerably harder
        than the original version which employs Euler integration,
        see the AcrobotLegacy class.
    """

    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 15
    }

    dt = 0.2

    LINK_LENGTH_1 = 1.  # [m]
    LINK_LENGTH_2 = 1.  # [m]
    LINK_MASS_1 = 1.  #: [kg] mass of link 1
    LINK_MASS_2 = 1.  #: [kg] mass of link 2
    LINK_COM_POS_1 = 0.5  #: [m] position of the center of mass of link 1
    LINK_COM_POS_2 = 0.5  #: [m] position of the center of mass of link 2
    LINK_MOI = 1.  #: moments of inertia for both links

    # Float literals keep these limits correct under Python 2 as well, where
    # ``4/9`` would be integer division and silently yield a limit of 0.
    MAX_VEL_1 = 4. / 9. * np.pi  # was 4 * pi
    MAX_VEL_2 = 9. / 9. * np.pi  # was 9 * pi

    AVAIL_TORQUE = [-1., 0., +1]

    torque_noise_max = 0.

    #: gravitational acceleration used by the dynamics (was 9.8)
    GRAVITY = 0.

    #: use dynamics equations from the nips paper or the book
    book_or_nips = "book"
    action_arrow = None
    domain_fig = None
    actions_num = 3

    def __init__(self):
        self.viewer = None
        # Defined up-front so _render() before the first _reset() does not
        # fail with AttributeError.
        self.state = None
        high = np.array([1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2])
        low = -high
        self.observation_space = spaces.Box(low, high)
        self.action_space = spaces.Discrete(3)
        self._seed()
        # Number of consecutive steps the tip has stayed high.
        self.steps_beyond_done = 0
        # Tip height computed during the most recent step.
        self.rewardx = -1

    def _seed(self, seed=None):
        """Create the environment's random generator; return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _reset(self):
        """Start a new episode near the hanging position; return the observation."""
        self.state = self.np_random.uniform(low=-0.1, high=0.1, size=(4,))
        # Clear per-episode bookkeeping so a balance streak from the previous
        # episode cannot leak into this one.
        self.steps_beyond_done = 0
        self.rewardx = -1
        return self._get_ob()

    def _step(self, a):
        """
        Apply action ``a`` (index into ``AVAIL_TORQUE``) for one time step.

        Returns the tuple ``(observation, reward, terminal, info)``.
        """
        s = self.state
        torque = self.AVAIL_TORQUE[a]

        if self.steps_beyond_done is None:
            self.steps_beyond_done = 0

        # Add noise to the force action
        if self.torque_noise_max > 0:
            torque += self.np_random.uniform(-self.torque_noise_max,
                                             self.torque_noise_max)

        # Now, augment the state with our force action so it can be passed to
        # _dsdt
        s_augmented = np.append(s, torque)

        ns = rk4(self._dsdt, s_augmented, [0, self.dt])
        # only care about final timestep of integration returned by integrator
        ns = ns[-1]
        ns = ns[:4]  # omit action

        ns[0] = wrap(ns[0], -pi, pi)
        ns[1] = wrap(ns[1], -pi, pi)
        ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1)
        ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2)
        self.state = ns

        # Swung height of the tip.
        # NOTE(review): this uses ``s``, the state *before* the step, not the
        # freshly integrated ``ns`` -- kept as in the original; confirm intent.
        height = -np.cos(s[0]) - np.cos(s[1] + s[0])
        self.rewardx = height

        # Height tier. Starting from the lowest tier guarantees ``reward`` is
        # always bound, including for the exact boundary values 0.5 / 0.8 and
        # for NaN, which the original strict comparisons left unassigned.
        reward = -1.
        if height > .8:
            reward = -0.6
        elif height > .5:
            reward = -0.8

        # Dropping below 0.9 breaks the balance streak ...
        if height < .9:
            self.steps_beyond_done = 0
        # ... and only steps above 1 extend it.
        if height > 1:
            reward = -0.4
            self.steps_beyond_done += 1

        # Streak tier: the longer the tip has stayed up, the better the
        # reward. These override the height tier.
        streak = self.steps_beyond_done
        if streak > 60:
            reward = 0.
            print(self.steps_beyond_done)
        elif streak > 20:
            reward = -0.05
        elif streak > 8:
            reward = -0.1
        elif streak > 4:
            reward = -0.2

        terminal = self._terminal()
        return (self._get_ob(), reward, terminal, {})

    def _get_ob(self):
        """Return ``[cos t1, sin t1, cos t2, sin t2, dt1, dt2]`` for the current state."""
        s = self.state
        return np.array([cos(s[0]), sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]])

    def _terminal(self):
        """Return True once the balance streak exceeds 60 steps."""
        return bool(self.steps_beyond_done > 60)

    def _dsdt(self, s_augmented, t):
        """
        Time derivative of the augmented state ``[t1, t2, dt1, dt2, torque]``.

        ``t`` is unused; it is part of the signature expected by ``rk4``.
        The torque entry has derivative 0 so it stays constant over the step.
        """
        m1 = self.LINK_MASS_1
        m2 = self.LINK_MASS_2
        l1 = self.LINK_LENGTH_1
        lc1 = self.LINK_COM_POS_1
        lc2 = self.LINK_COM_POS_2
        I1 = self.LINK_MOI
        I2 = self.LINK_MOI
        g = self.GRAVITY
        a = s_augmented[-1]
        s = s_augmented[:-1]
        theta1 = s[0]
        theta2 = s[1]
        dtheta1 = s[2]
        dtheta2 = s[3]
        d1 = m1 * lc1 ** 2 + m2 * \
            (l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * np.cos(theta2)) + I1 + I2
        d2 = m2 * (lc2 ** 2 + l1 * lc2 * np.cos(theta2)) + I2
        phi2 = m2 * lc2 * g * np.cos(theta1 + theta2 - np.pi / 2.)
        phi1 = - m2 * l1 * lc2 * dtheta2 ** 2 * np.sin(theta2) \
            - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * np.sin(theta2) \
            + (m1 * lc1 + m2 * l1) * g * np.cos(theta1 - np.pi / 2) + phi2
        if self.book_or_nips == "nips":
            # the following line is consistent with the description in the
            # paper
            ddtheta2 = (a + d2 / d1 * phi1 - phi2) / \
                (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1)
        else:
            # the following line is consistent with the java implementation and the
            # book
            ddtheta2 = (a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1 ** 2 * np.sin(theta2) - phi2) \
                / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1)
        ddtheta1 = -(d2 * ddtheta2 + phi1) / d1
        return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.)

    def _render(self, mode='human', close=False):
        """Draw the two links; with ``close=True`` tear down the viewer instead."""
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        # Imported lazily so the environment can be used without a display.
        from gym.envs.classic_control import rendering

        s = self.state
        if s is None:
            # Nothing to draw before the first reset.
            return None

        if self.viewer is None:
            self.viewer = rendering.Viewer(500, 500)
            self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2)

        p1 = [-self.LINK_LENGTH_1 * np.cos(s[0]),
              self.LINK_LENGTH_1 * np.sin(s[0])]

        p2 = [p1[0] - self.LINK_LENGTH_2 * np.cos(s[0] + s[1]),
              p1[1] + self.LINK_LENGTH_2 * np.sin(s[0] + s[1])]

        # Columns are swapped so each row reads (x, y) for the viewer.
        xys = np.array([[0, 0], p1, p2])[:, ::-1]
        thetas = [s[0] - np.pi / 2, s[0] + s[1] - np.pi / 2]

        self.viewer.draw_line((-2.2, 0), (2.2, 0))
        # zip() stops at the shorter sequence, so only the base and the elbow
        # get a link drawn from them.
        for ((x, y), th) in zip(xys, thetas):
            l, r, t, b = 0, 1, .1, -.1
            jtransform = rendering.Transform(rotation=th, translation=(x, y))
            link = self.viewer.draw_polygon([(l, b), (l, t), (r, t), (r, b)])
            link.add_attr(jtransform)
            link.set_color(0, .8, .8)
            circ = self.viewer.draw_circle(.1)
            circ.set_color(.8, .8, 0)
            circ.add_attr(jtransform)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')
def wrap(x, m, M):
    """
    Wrap ``x`` into the interval ``[m, M]``.

    :param x: a scalar
    :param m: minimum possible value in range
    :param M: maximum possible value in range
    :returns: ``x`` shifted by a whole number of periods ``M - m`` so that
        m <= x <= M. Values already inside the interval (including the end
        points) are returned unchanged, as is NaN.
    :raises ValueError: if ``x`` lies outside the interval and can never be
        brought into it, i.e. the interval has no positive width or ``x``
        is infinite.

    Unlike ``bound()``, which truncates, ``wrap()`` wraps x around the
    coordinate system defined by m, M.
    For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.
    """
    diff = M - m
    if x > M or x < m:
        # The loops below only terminate when every shift makes progress
        # towards the interval; fail loudly instead of spinning forever.
        if diff <= 0 or np.isinf(x):
            raise ValueError(
                "cannot wrap %r into the interval [%r, %r]" % (x, m, M))
    while x > M:
        x = x - diff
    while x < m:
        x = x + diff
    return x
def bound(x, m, M=None):
    """
    Clamp the scalar ``x`` to a closed interval.

    :param x: scalar
    :param m: lower limit, or -- when ``M`` is omitted -- a sequence whose
        first two entries are the lower and upper limits
    :param M: upper limit; leave as ``None`` to read both limits from ``m``
    :returns: ``x`` truncated so that lower <= x <= upper

    Call either as ``bound(x, lo, hi)`` or as ``bound(x, (lo, hi))``.
    """
    # Resolve the two calling conventions into one (lower, upper) pair.
    lower, upper = (m[0], m[1]) if M is None else (m, M)
    raised = max(x, lower)
    return min(raised, upper)
def rk4(derivs, y0, t, *args, **kwargs):
    """
    Integrate 1D or ND system of ODEs using 4-th order Runge-Kutta.
    This is a toy implementation which may be useful if you find
    yourself stranded on a system w/o scipy. Otherwise use
    :func:`scipy.integrate`.

    *y0*
        initial state vector (or a scalar for a 1D system)
    *t*
        sample times
    *derivs*
        returns the derivative of the system and has the
        signature ``dy = derivs(yi, ti)``
    *args*
        additional arguments passed to the derivative function
    *kwargs*
        additional keyword arguments passed to the derivative function

    Returns a float64 array with one row per entry of *t*; row 0 is *y0*.

    Example 1 ::
        ## 2D system
        def derivs6(x,t):
            d1 = x[0] + 2*x[1]
            d2 = -3*x[0] + 4*x[1]
            return (d1, d2)
        dt = 0.0005
        t = arange(0.0, 2.0, dt)
        y0 = (1,2)
        yout = rk4(derivs6, y0, t)

    Example 2::
        ## 1D system
        alpha = 2
        def derivs(x,t):
            return -alpha*x + exp(-t)
        y0 = 1
        yout = rk4(derivs, y0, t)

    If you have access to scipy, you should probably be using the
    scipy.integrate tools rather than this function.
    """
    # ``np.float64`` instead of the ``np.float_`` alias, which NumPy 2.0
    # removed.
    try:
        Ny = len(y0)
    except TypeError:
        # Scalar initial condition -> 1D output.
        yout = np.zeros((len(t),), np.float64)
    else:
        yout = np.zeros((len(t), Ny), np.float64)

    yout[0] = y0

    for i in range(len(t) - 1):
        thist = t[i]
        dt = t[i + 1] - thist
        dt2 = dt / 2.0
        y0 = yout[i]

        # Classic RK4 stages: slope at the start, two midpoint estimates and
        # one at the end of the interval.
        k1 = np.asarray(derivs(y0, thist, *args, **kwargs))
        k2 = np.asarray(derivs(y0 + dt2 * k1, thist + dt2, *args, **kwargs))
        k3 = np.asarray(derivs(y0 + dt2 * k2, thist + dt2, *args, **kwargs))
        k4 = np.asarray(derivs(y0 + dt * k3, thist + dt, *args, **kwargs))
        yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)
    return yout
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment