@SalahEddineGhamri
Created December 7, 2025 01:00
A simple Q-learning implementation in Python
"""
Simple Q learning algorithm with epsilon-greedy action choice
in q-learning algorithm we maintain a table of stats-actions q-values
after attributing the rewards to the goal states
the table is update based on bellman learning formula
to propagate the q values on the state-action back to the start
learning is done through episodes
in the end the result path is calculated from the learned policy
Author: Salah Eddine Ghamri
"""
import numpy as np
import random
# Define the grid world environment.
# Obstacles are marked with 1; free cells with 0.
grid = [
    [0, 0, 0, 0, 0],
    [0, 1, 1, 1, 0],
    [0, 0, 0, 1, 0],
    [0, 1, 0, 0, 0],
    [0, 0, 0, 0, 0],
]
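# Cells are addressed as (row, column); for example, grid[1][1] is an obstacle
# and grid[0][0] is the free cell the agent starts from.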
# Start and goal cells
start = (0, 0)
goal = (3, 2)
# Define the possible actions as (row, column) deltas.
actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right
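# Taking an action adds its delta to the current cell: for example, applying
# (0, 1) in state (2, 0) moves the agent one column right, to (2, 1).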
# Define the Q-table: one entry per (state, action) pair, initialized to 0.
Q = {}
for i in range(5):
    for j in range(5):
        for a in actions:
            Q[(i, j), a] = 0
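
# Helper used below (factored out of the repeated bounds checks): a move is
# valid when it stays on the 5x5 grid and does not step onto an obstacle cell.
def is_valid(state, action):
    row, col = state[0] + action[0], state[1] + action[1]
    return 0 <= row < 5 and 0 <= col < 5 and grid[row][col] != 1
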
# Define the hyperparameters
alpha = 0.1  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.1  # exploration rate for the epsilon-greedy policy
num_episodes = 1000  # number of training episodes
# Q-learning training loop
for episode in range(num_episodes):
    state = start
    done = False
    while not done:
        # Choose an action with the epsilon-greedy policy.
        if np.random.random() < epsilon:
            # Explore: pick a uniformly random action.
            action = random.choice(actions)
        else:
            # Exploit: pick the highest-valued action among the valid moves.
            valid = [a for a in actions if is_valid(state, a)]
            q_max = max(Q[state, a] for a in valid)
            action = next(a for a in valid if Q[state, a] == q_max)
        # A random exploration step may leave the grid or hit an obstacle;
        # in that case, resample the action.
        if not is_valid(state, action):
            continue
        # Take the action and observe the reward and the new state.
        next_state = (state[0] + action[0], state[1] + action[1])
        # Every step costs -1, so shorter paths accumulate more reward.
        reward = -1
        if next_state == goal:
            # Reaching the goal yields the highest reward and ends the episode.
            reward = 100
            done = True
        # Apply the Bellman update stated above.
        Q[state, action] = (1 - alpha) * Q[state, action] + alpha * (
            reward + gamma * max(Q[next_state, a] for a in actions)
        )
        # Move to the next state.
        state = next_state
# Extract and print the greedy policy: the best action for each state.
policy = {}
for i in range(5):
    for j in range(5):
        state = (i, j)
        q_max = max(Q[state, a] for a in actions)
        policy[state] = next(a for a in actions if Q[state, a] == q_max)
print("learned policy: ", policy)
# Build the path from the learned policy.
path = [start]
current_state = start
while current_state != goal:
    current_state = (
        current_state[0] + policy[current_state][0],
        current_state[1] + policy[current_state][1],
    )
    path.append(current_state)
print("path to goal : ", path)