A simple Q-learning implementation in Python
| """ | |
| Simple Q learning algorithm with epsilon-greedy action choice | |
| in q-learning algorithm we maintain a table of stats-actions q-values | |
| after attributing the rewards to the goal states | |
| the table is update based on bellman learning formula | |
| to propagate the q values on the state-action back to the start | |
| learning is done through episodes | |
| in the end the result path is calculated from the learned policy | |
| Author: Salah Eddine Ghamri | |
| """ | |
import numpy as np
import random

# Define the grid world environment
# Obstacles are designated using "1"
grid = [
    [0, 0, 0, 0, 0],
    [0, 1, 1, 1, 0],
    [0, 0, 0, 1, 0],
    [0, 1, 0, 0, 0],
    [0, 0, 0, 0, 0],
]

# start and goal points
start = (0, 0)
goal = (3, 2)

# Define the possible actions
actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right (row, col) offsets
# Define the Q-table
Q = {}
for i in range(5):
    for j in range(5):
        for a in actions:
            Q[(i, j), a] = 0

# Define the hyperparameters
alpha = 0.1  # learning rate
gamma = 0.9  # discount factor
epsilon = 0.1  # epsilon-greedy rate
num_episodes = 1000  # number of episodes

# Q-Learning algorithm
for episode in range(num_episodes):
    state = start
    done = False
    while not done:
        # Choose an action using an epsilon-greedy policy
        if np.random.random() < epsilon:
            # Explore: random action (may be invalid; rejected below)
            action = random.choice(actions)
        else:
            # Exploit: greedy action among moves that stay on the grid
            # and do not enter an obstacle cell
            valid_actions = [
                a
                for a in actions
                if 0 <= state[0] + a[0] < 5
                and 0 <= state[1] + a[1] < 5
                and grid[state[0] + a[0]][state[1] + a[1]] != 1
            ]
            q_max = max(Q[state, a] for a in valid_actions)
            action = next(a for a in valid_actions if Q[state, a] == q_max)
        # Check if the action leads to an invalid state (off the grid or
        # into an obstacle); if so, pick another action
        if (
            not (
                0 <= state[0] + action[0] < 5
                and 0 <= state[1] + action[1] < 5
            )
            or grid[state[0] + action[0]][state[1] + action[1]] == 1
        ):
            continue

        # Take the action and observe the reward and new state
        next_state = (state[0] + action[0], state[1] + action[1])

        # Step penalty (lowest reward)
        reward = -1
        if next_state == goal:
            # Highest reward at the goal
            reward = 100
            done = True

        # Update the Q-table
        Q[state, action] = (1 - alpha) * Q[state, action] + alpha * (
            reward + gamma * max(Q[next_state, a] for a in actions)
        )

        # Move to the next state
        state = next_state
# Extract and print the greedy policy from the learned Q-table
policy = {}
for i in range(5):
    for j in range(5):
        state = (i, j)
        q_max = max(Q[state, a] for a in actions)
        policy[state] = next(a for a in actions if Q[state, a] == q_max)

print("learned policy: ", policy)
# Build the path from the learned policy
path = [start]
current_state = start
while current_state != goal:
    current_state = (
        current_state[0] + policy[current_state][0],
        current_state[1] + policy[current_state][1],
    )
    path.append(current_state)

print("path to goal : ", path)