@fsndzomga
Created September 7, 2023 16:17
rl simple
import numpy as np
import pandas as pd
import time

np.random.seed(2)  # reproducible runs

ROWS, COLS = 2, 2  # dimensions of the 2D grid world
ACTIONS = ['left', 'right', 'up', 'down']  # available actions
EPSILON = 0.9      # probability of choosing the greedy (highest-Q) action
ALPHA = 0.1        # learning rate
GAMMA = 0.9        # discount factor
MAX_EPISODES = 100  # maximum number of training episodes
FRESH_TIME = 0.3   # pause (seconds) between moves when displaying the grid

def build_q_table(rows, cols, actions):
    state_size = rows * cols
    table = pd.DataFrame(
        np.zeros((state_size, len(actions))),
        columns=actions,
    )
    return table
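# For the defaults above (ROWS = COLS = 2 and four actions), build_q_table returns a
# 4 x 4 DataFrame of zeros: one row per state, numbered 0..3 in row-major order, and
# one column per action ('left', 'right', 'up', 'down').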

def choose_action(state, q_table):
    state_actions = q_table.iloc[state, :]
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):
        action_name = np.random.choice(ACTIONS)
    else:
        action_name = state_actions.idxmax()
    return action_name
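# Note on choose_action above: with probability 1 - EPSILON (0.1 here), or whenever the
# current state's Q-values are all still zero, a random action is taken; otherwise the
# action with the highest Q-value is chosen, so EPSILON is the probability of acting
# greedily (epsilon-greedy exploration).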

def get_env_feedback(S, A, grid):
    row, col = divmod(S, COLS)
    R = 0
    if A == 'up' and row > 0:
        R = -1 if grid[row][col] < grid[row-1][col] else 1
        grid[row][col], grid[row-1][col] = grid[row-1][col], grid[row][col]
        S_ = S - COLS
    elif A == 'down' and row < ROWS - 1:
        R = -1 if grid[row][col] < grid[row+1][col] else 1
        grid[row][col], grid[row+1][col] = grid[row+1][col], grid[row][col]
        S_ = S + COLS
    elif A == 'left' and col > 0:
        R = -1 if grid[row][col] < grid[row][col-1] else 1
        grid[row][col], grid[row][col-1] = grid[row][col-1], grid[row][col]
        S_ = S - 1
    elif A == 'right' and col < COLS - 1:
        R = -1 if grid[row][col] < grid[row][col+1] else 1
        grid[row][col], grid[row][col+1] = grid[row][col+1], grid[row][col]
        S_ = S + 1
    else:
        S_ = S
    return S_, R
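# Note on get_env_feedback above: states are flattened grid indices (S = row * COLS + col),
# so 'up'/'down' change S by -/+COLS and 'left'/'right' by -/+1. A valid move swaps the
# current cell with the chosen neighbour and is rewarded +1 if the current value is greater
# than or equal to the neighbour's, otherwise -1; a move off the grid leaves the state
# unchanged with reward 0.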

def update_env(grid, episode, step_counter):
    print(f'\rEpisode {episode+1}: total_steps = {step_counter}')
    for row in grid:
        print(' '.join(map(str, row)))
    time.sleep(FRESH_TIME)
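# update_env above only handles display: it prints the episode number, step count, and the
# current grid, then sleeps FRESH_TIME seconds so each move can be followed in the terminal.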

def rl():
    q_table = build_q_table(ROWS, COLS, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        # Start each episode from a freshly shuffled grid and a random agent position
        grid = np.random.choice(range(1, ROWS * COLS + 1), (ROWS, COLS), replace=False)
        S = np.random.randint(0, ROWS * COLS)
        update_env(grid, episode, step_counter)
        while True:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A, grid)
            # Tabular Q-learning update:
            #   Q(S, A) <- Q(S, A) + ALPHA * (R + GAMMA * max_a Q(S_, a) - Q(S, A))
            q_predict = q_table.loc[S, A]
            q_target = R + GAMMA * q_table.iloc[S_, :].max()
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)
            S = S_
            step_counter += 1
            update_env(grid, episode, step_counter)
            # Terminate the episode once the grid is sorted in ascending row-major order
            if np.all(grid == np.sort(grid, axis=None).reshape(ROWS, COLS)):
                break
    return q_table

if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)
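# Optional follow-up (not part of the original gist, a sketch only): after training, the
# greedy policy can be read directly from the learned Q-table by taking the best action
# per state, e.g. print(q_table.idxmax(axis=1)).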