@fsndzomga
Created September 7, 2023 16:17
rl simple
import numpy as np
import pandas as pd
import time

np.random.seed(2)  # reproducible runs

ROWS, COLS = 2, 2  # dimensions of the 2D grid world
ACTIONS = ['left', 'right', 'up', 'down']  # available actions
EPSILON = 0.9      # probability of choosing the greedy (highest-Q) action
ALPHA = 0.1        # learning rate
GAMMA = 0.9        # discount factor
MAX_EPISODES = 100  # maximum number of training episodes
FRESH_TIME = 0.3   # pause (seconds) between moves when displaying the grid

def build_q_table(rows, cols, actions):
    state_size = rows * cols
    table = pd.DataFrame(
        np.zeros((state_size, len(actions))),
        columns=actions,
    )
    return table
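# For the defaults above (ROWS = COLS = 2 and four actions), build_q_table returns a
# 4 x 4 DataFrame of zeros: one row per state, numbered 0..3 in row-major order, and
# one column per action ('left', 'right', 'up', 'down').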

def choose_action(state, q_table):
    state_actions = q_table.iloc[state, :]
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):
        action_name = np.random.choice(ACTIONS)
    else:
        action_name = state_actions.idxmax()
    return action_name
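# Note on choose_action above: with probability 1 - EPSILON (0.1 here), or whenever the
# current state's Q-values are all still zero, a random action is taken; otherwise the
# action with the highest Q-value is chosen, so EPSILON is the probability of acting
# greedily (epsilon-greedy exploration).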

def get_env_feedback(S, A, grid):
    row, col = divmod(S, COLS)
    R = 0
    if A == 'up' and row > 0:
        R = -1 if grid[row][col] < grid[row-1][col] else 1
        grid[row][col], grid[row-1][col] = grid[row-1][col], grid[row][col]
        S_ = S - COLS
    elif A == 'down' and row < ROWS - 1:
        R = -1 if grid[row][col] < grid[row+1][col] else 1
        grid[row][col], grid[row+1][col] = grid[row+1][col], grid[row][col]
        S_ = S + COLS
    elif A == 'left' and col > 0:
        R = -1 if grid[row][col] < grid[row][col-1] else 1
        grid[row][col], grid[row][col-1] = grid[row][col-1], grid[row][col]
        S_ = S - 1
    elif A == 'right' and col < COLS - 1:
        R = -1 if grid[row][col] < grid[row][col+1] else 1
        grid[row][col], grid[row][col+1] = grid[row][col+1], grid[row][col]
        S_ = S + 1
    else:
        S_ = S
    return S_, R
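# Note on get_env_feedback above: states are flattened grid indices (S = row * COLS + col),
# so 'up'/'down' change S by -/+COLS and 'left'/'right' by -/+1. A valid move swaps the
# current cell with the chosen neighbour and is rewarded +1 if the current value is greater
# than or equal to the neighbour's, otherwise -1; a move off the grid leaves the state
# unchanged with reward 0.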

def update_env(grid, episode, step_counter):
    print(f'\rEpisode {episode+1}: total_steps = {step_counter}')
    for row in grid:
        print(' '.join(map(str, row)))
    time.sleep(FRESH_TIME)
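# update_env above only handles display: it prints the episode number, step count, and the
# current grid, then sleeps FRESH_TIME seconds so each move can be followed in the terminal.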

def rl():
    q_table = build_q_table(ROWS, COLS, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        # Start each episode from a freshly shuffled grid and a random agent position
        grid = np.random.choice(range(1, ROWS * COLS + 1), (ROWS, COLS), replace=False)
        S = np.random.randint(0, ROWS * COLS)
        update_env(grid, episode, step_counter)
        while True:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A, grid)
            # Tabular Q-learning update:
            #   Q(S, A) <- Q(S, A) + ALPHA * (R + GAMMA * max_a Q(S_, a) - Q(S, A))
            q_predict = q_table.loc[S, A]
            q_target = R + GAMMA * q_table.iloc[S_, :].max()
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)
            S = S_
            step_counter += 1
            update_env(grid, episode, step_counter)
            # Terminate the episode once the grid is sorted in ascending row-major order
            if np.all(grid == np.sort(grid, axis=None).reshape(ROWS, COLS)):
                break
    return q_table

if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)
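# Optional follow-up (not part of the original gist, a sketch only): after training, the
# greedy policy can be read directly from the learned Q-table by taking the best action
# per state, e.g. print(q_table.idxmax(axis=1)).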