A reinforcement learning sample using Q-learning and Gymnasium (the maintained successor to OpenAI Gym) on the LunarLander-v3 environment.
import gymnasium as gym
import numpy as np

# Discretize the continuous state space
def discretize_state(state):
    # Define bins for each continuous state component
    pos_x_bins = np.linspace(-1.5, 1.5, 10)
    pos_y_bins = np.linspace(-1.5, 1.5, 10)
    vel_x_bins = np.linspace(-2, 2, 10)
    vel_y_bins = np.linspace(-2, 2, 10)
    angle_bins = np.linspace(-3.14, 3.14, 10)
    angular_vel_bins = np.linspace(-5, 5, 10)

    # Discretize each component; the two leg-contact flags are already binary
    discrete_state = (
        np.digitize(state[0], pos_x_bins),
        np.digitize(state[1], pos_y_bins),
        np.digitize(state[2], vel_x_bins),
        np.digitize(state[3], vel_y_bins),
        np.digitize(state[4], angle_bins),
        np.digitize(state[5], angular_vel_bins),
        int(state[6]),  # Left leg contact
        int(state[7])   # Right leg contact
    )
    return discrete_state

# Initialize Q-learning parameters
learning_rate = 0.1
discount_factor = 0.99
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01

# Initialize Q-table
q_table = {}

# Initialize the environment
env = gym.make("LunarLander-v3", render_mode="human")

# Reset the environment to generate the first observation
observation, info = env.reset(seed=42)
total_reward = 0
episode = 0

for _ in range(10000):
    # Discretize the state
    state = discretize_state(observation)

    # If the state is not in the Q-table yet, add it
    if state not in q_table:
        q_table[state] = np.zeros(env.action_space.n)

    # Epsilon-greedy action selection
    if np.random.random() < epsilon:
        action = env.action_space.sample()
    else:
        action = int(np.argmax(q_table[state]))

    # Step through the environment
    next_observation, reward, terminated, truncated, info = env.step(action)
    total_reward += reward

    # Discretize the next state
    next_state = discretize_state(next_observation)

    # If the next state is not in the Q-table yet, add it
    if next_state not in q_table:
        q_table[next_state] = np.zeros(env.action_space.n)

    # Q-learning update (do not bootstrap past a terminal state)
    old_value = q_table[state][action]
    next_max = 0.0 if terminated else np.max(q_table[next_state])
    new_value = (1 - learning_rate) * old_value + learning_rate * (reward + discount_factor * next_max)
    q_table[state][action] = new_value

    # Update observation
    observation = next_observation

    # If the episode has ended, log it, reset, and decay epsilon once per episode
    if terminated or truncated:
        episode += 1
        print(f"Episode {episode} finished with total reward: {total_reward}")
        total_reward = 0
        observation, info = env.reset()
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

env.close()
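
After training, you may want to watch the policy act greedily with exploration turned off. The snippet below is a minimal sketch that is not part of the original gist: it assumes the q_table and discretize_state defined above are still in scope, always picks the argmax action, and falls back to a random action for states that were never visited during training.

# Greedy evaluation sketch (assumes q_table and discretize_state from above)
eval_env = gym.make("LunarLander-v3", render_mode="human")
observation, info = eval_env.reset(seed=123)
total_reward = 0

while True:
    state = discretize_state(observation)
    if state in q_table:
        action = int(np.argmax(q_table[state]))   # greedy action from the learned table
    else:
        action = eval_env.action_space.sample()   # unseen state: fall back to random
    observation, reward, terminated, truncated, info = eval_env.step(action)
    total_reward += reward
    if terminated or truncated:
        break

print(f"Greedy evaluation reward: {total_reward}")
eval_env.close()

Note that 10,000 training steps cover only a handful of episodes, so the greedy run will likely still crash; increasing the training budget or tuning the bin counts and epsilon schedule would be the first things to try.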