@jadijadi
Created March 6, 2025 06:26
A reinforcement learning sample using Q-learning and Gymnasium (the maintained fork of OpenAI Gym)
import gymnasium as gym
import numpy as np
# Discretize the continuous state space
def discretize_state(state):
    # Define bins for each state component
    pos_x_bins = np.linspace(-1.5, 1.5, 10)
    pos_y_bins = np.linspace(-1.5, 1.5, 10)
    vel_x_bins = np.linspace(-2, 2, 10)
    vel_y_bins = np.linspace(-2, 2, 10)
    angle_bins = np.linspace(-np.pi, np.pi, 10)
    angular_vel_bins = np.linspace(-5, 5, 10)
    # Discretize each component; the two leg-contact flags are already
    # binary, so they need no bins
    discrete_state = (
        np.digitize(state[0], pos_x_bins),
        np.digitize(state[1], pos_y_bins),
        np.digitize(state[2], vel_x_bins),
        np.digitize(state[3], vel_y_bins),
        np.digitize(state[4], angle_bins),
        np.digitize(state[5], angular_vel_bins),
        int(state[6]),  # Left leg contact
        int(state[7]),  # Right leg contact
    )
    return discrete_state
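# Added illustration (not in the original gist): the tuple returned above is
# hashable, so it can be used directly as a dictionary key. For a lander at
# rest above the pad, the mapping looks like:
#   example_obs = np.array([0.0, 1.4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
#   discretize_state(example_obs)  # -> (5, 9, 5, 5, 5, 5, 0, 0)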
# Initialize Q-learning parameters
learning_rate = 0.1
discount_factor = 0.99
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01
# Initialize Q-table
q_table = {}
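# Note (added): q_table maps each discrete state tuple to a NumPy array of
# per-action values, filled in lazily the first time a state is visited.
# With epsilon_decay = 0.995 and epsilon_min = 0.01, epsilon reaches its
# floor after about ln(0.01) / ln(0.995) ≈ 919 episodes, since it is decayed
# once per completed episode below.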
# Initialize the environment
env = gym.make("LunarLander-v3", render_mode="human")
# Reset the environment to generate the first observation
observation, info = env.reset(seed=42)
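# Note (added): render_mode="human" opens a viewer window and renders every
# frame, which slows training considerably. seed=42 seeds only the
# environment; the exploration randomness below (np.random and
# env.action_space.sample()) is left unseeded.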
total_reward = 0
episode = 0
for _ in range(10000):
    # Discretize the state
    state = discretize_state(observation)
    # If state is not in Q-table, add it
    if state not in q_table:
        q_table[state] = np.zeros(env.action_space.n)
    # Epsilon-greedy action selection
    if np.random.random() < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(q_table[state])
    # Step through the environment
    next_observation, reward, terminated, truncated, info = env.step(action)
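    # Note (added): `terminated` means the episode ended in a terminal state
    # (the lander crashed or came to rest), while `truncated` means Gymnasium
    # cut the episode off at its time limit; both cases are handled below.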
    total_reward += reward
    # Discretize next state
    next_state = discretize_state(next_observation)
    # If next_state is not in Q-table, add it
    if next_state not in q_table:
        q_table[next_state] = np.zeros(env.action_space.n)
    # Q-learning update; a terminal state has no future value to bootstrap
    old_value = q_table[state][action]
    next_max = 0.0 if terminated else np.max(q_table[next_state])
    new_value = (1 - learning_rate) * old_value + learning_rate * (reward + discount_factor * next_max)
    q_table[state][action] = new_value
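    # Added note: the lines above are the standard tabular Q-learning update,
    #   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
    # with alpha = learning_rate and gamma = discount_factor.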
    # Update observation
    observation = next_observation
    # If the episode has ended
    if terminated or truncated:
        episode += 1
        print(f"Episode {episode} finished with total reward: {total_reward}")
        total_reward = 0
        observation, info = env.reset()
        # Decay epsilon
        epsilon = max(epsilon_min, epsilon * epsilon_decay)
env.close()
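# Added sketch (not in the original gist; "q_table.pkl" is a hypothetical
# filename): the learned Q-table is a plain dict of NumPy arrays, so it can
# be persisted for later reuse with pickle.
import pickle
with open("q_table.pkl", "wb") as f:
    pickle.dump(q_table, f)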
@SultanARIA1387

nice

@alicheraghi

👍🙏
