@SalahEddineGhamri
Created December 7, 2025 01:00
A simple Q-learning implementation in Python
"""
Simple Q learning algorithm with epsilon-greedy action choice
in q-learning algorithm we maintain a table of stats-actions q-values
after attributing the rewards to the goal states
the table is update based on bellman learning formula
to propagate the q values on the state-action back to the start
learning is done through episodes
in the end the result path is calculated from the learned policy
Author: Salah Eddine Ghamri
"""
import numpy as np
import random
# Define the grid world environment.
# Obstacles are marked with 1; free cells with 0.
grid = [
    [0, 0, 0, 0, 0],
    [0, 1, 1, 1, 0],
    [0, 0, 0, 1, 0],
    [0, 1, 0, 0, 0],
    [0, 0, 0, 0, 0],
]
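# Cells are addressed as (row, column); for example, grid[1][1] is an obstacle
# and grid[0][0] is the free cell the agent starts from.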
# Start and goal cells
start = (0, 0)
goal = (3, 2)
# Define the possible actions as (row, column) deltas.
actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right
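# Taking an action adds its delta to the current cell: for example, applying
# (0, 1) in state (2, 0) moves the agent one column right, to (2, 1).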
# Define the Q-table: one entry per (state, action) pair, initialized to 0.
Q = {}
for i in range(5):
    for j in range(5):
        for a in actions:
            Q[(i, j), a] = 0
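
# Helper used below (factored out of the repeated bounds checks): a move is
# valid when it stays on the 5x5 grid and does not step onto an obstacle cell.
def is_valid(state, action):
    row, col = state[0] + action[0], state[1] + action[1]
    return 0 <= row < 5 and 0 <= col < 5 and grid[row][col] != 1
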
# Define the hyperparameters
alpha = 0.1  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.1  # exploration rate for the epsilon-greedy policy
num_episodes = 1000  # number of training episodes
# Q-learning training loop
for episode in range(num_episodes):
    state = start
    done = False
    while not done:
        # Choose an action with the epsilon-greedy policy.
        if np.random.random() < epsilon:
            # Explore: pick a uniformly random action.
            action = random.choice(actions)
        else:
            # Exploit: pick the highest-valued action among the valid moves.
            valid = [a for a in actions if is_valid(state, a)]
            q_max = max(Q[state, a] for a in valid)
            action = next(a for a in valid if Q[state, a] == q_max)
        # A random exploration step may leave the grid or hit an obstacle;
        # in that case, resample the action.
        if not is_valid(state, action):
            continue
        # Take the action and observe the reward and the new state.
        next_state = (state[0] + action[0], state[1] + action[1])
        # Every step costs -1, so shorter paths accumulate more reward.
        reward = -1
        if next_state == goal:
            # Reaching the goal yields the highest reward and ends the episode.
            reward = 100
            done = True
        # Apply the Bellman update stated above.
        Q[state, action] = (1 - alpha) * Q[state, action] + alpha * (
            reward + gamma * max(Q[next_state, a] for a in actions)
        )
        # Move to the next state.
        state = next_state
# Extract and print the greedy policy: the best action for each state.
policy = {}
for i in range(5):
    for j in range(5):
        state = (i, j)
        q_max = max(Q[state, a] for a in actions)
        policy[state] = next(a for a in actions if Q[state, a] == q_max)
print("learned policy: ", policy)
# Build the path from the learned policy.
path = [start]
current_state = start
while current_state != goal:
    current_state = (
        current_state[0] + policy[current_state][0],
        current_state[1] + policy[current_state][1],
    )
    path.append(current_state)
print("path to goal : ", path)