A reinforcement learning sample using Q-learning and Gymnasium (the maintained successor to OpenAI Gym) on the LunarLander-v3 environment.
import gymnasium as gym
import numpy as np

# Discretize the continuous state space
def discretize_state(state):
    # Define bins for each continuous state component
    pos_x_bins = np.linspace(-1.5, 1.5, 10)
    pos_y_bins = np.linspace(-1.5, 1.5, 10)
    vel_x_bins = np.linspace(-2, 2, 10)
    vel_y_bins = np.linspace(-2, 2, 10)
    angle_bins = np.linspace(-3.14, 3.14, 10)
    angular_vel_bins = np.linspace(-5, 5, 10)

    # Discretize each component; the two leg-contact flags are already binary
    discrete_state = (
        np.digitize(state[0], pos_x_bins),
        np.digitize(state[1], pos_y_bins),
        np.digitize(state[2], vel_x_bins),
        np.digitize(state[3], vel_y_bins),
        np.digitize(state[4], angle_bins),
        np.digitize(state[5], angular_vel_bins),
        int(state[6]),  # Left leg contact
        int(state[7])   # Right leg contact
    )
    return discrete_state

# Initialize Q-learning parameters
learning_rate = 0.1
discount_factor = 0.99
epsilon = 1.0
epsilon_decay = 0.995
epsilon_min = 0.01

# Initialize Q-table
q_table = {}

# Initialize the environment
env = gym.make("LunarLander-v3", render_mode="human")

# Reset the environment to generate the first observation
observation, info = env.reset(seed=42)
total_reward = 0
episode = 0

for _ in range(10000):
    # Discretize the state
    state = discretize_state(observation)

    # If the state is not in the Q-table yet, add it
    if state not in q_table:
        q_table[state] = np.zeros(env.action_space.n)

    # Epsilon-greedy action selection
    if np.random.random() < epsilon:
        action = env.action_space.sample()
    else:
        action = int(np.argmax(q_table[state]))

    # Step through the environment
    next_observation, reward, terminated, truncated, info = env.step(action)
    total_reward += reward

    # Discretize the next state
    next_state = discretize_state(next_observation)

    # If the next state is not in the Q-table yet, add it
    if next_state not in q_table:
        q_table[next_state] = np.zeros(env.action_space.n)

    # Q-learning update (do not bootstrap past a terminal state)
    old_value = q_table[state][action]
    next_max = 0.0 if terminated else np.max(q_table[next_state])
    new_value = (1 - learning_rate) * old_value + learning_rate * (reward + discount_factor * next_max)
    q_table[state][action] = new_value

    # Update observation
    observation = next_observation

    # If the episode has ended, log it, reset, and decay epsilon once per episode
    if terminated or truncated:
        episode += 1
        print(f"Episode {episode} finished with total reward: {total_reward}")
        total_reward = 0
        observation, info = env.reset()
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

env.close()
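
After training, you may want to watch the policy act greedily with exploration turned off. The snippet below is a minimal sketch that is not part of the original gist: it assumes the q_table and discretize_state defined above are still in scope, always picks the argmax action, and falls back to a random action for states that were never visited during training.

# Greedy evaluation sketch (assumes q_table and discretize_state from above)
eval_env = gym.make("LunarLander-v3", render_mode="human")
observation, info = eval_env.reset(seed=123)
total_reward = 0

while True:
    state = discretize_state(observation)
    if state in q_table:
        action = int(np.argmax(q_table[state]))   # greedy action from the learned table
    else:
        action = eval_env.action_space.sample()   # unseen state: fall back to random
    observation, reward, terminated, truncated, info = eval_env.step(action)
    total_reward += reward
    if terminated or truncated:
        break

print(f"Greedy evaluation reward: {total_reward}")
eval_env.close()

Note that 10,000 training steps cover only a handful of episodes, so the greedy run will likely still crash; increasing the training budget or tuning the bin counts and epsilon schedule would be the first things to try.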