Q-Learning
# Import required libraries
import gym
import matplotlib.pyplot as plt
import random
import numpy as np
from IPython.display import clear_output
# Create an instance of the Taxi-v3 environment
env = gym.make("Taxi-v3").env
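# Taxi-v3 has a discrete state space of 500 states (25 taxi positions x
# 5 passenger locations x 4 destinations) and 6 actions (move south, north,
# east, west, pick up, drop off), so the Q-table below is a 500 x 6 array.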
# Create the Q-table: one row per state, one column per action
q_table = np.zeros([env.observation_space.n, env.action_space.n])
# Define hyperparameters
alpha = 0.1    # learning rate
gamma = 0.6    # discount factor
epsilon = 0.4  # exploration rate for the epsilon-greedy behaviour policy
# Q-learning uses the maximum Q' over all possible actions for the next state
all_rewards = []
episodes = 100000
frames = []

for i in range(1, episodes + 1):
    state = env.reset()
    epochs, total_reward = 0, 0
    done = False
    while not done:
        # Behavioural policy that both explores and exploits (epsilon-greedy)
        if random.uniform(0, 1) < epsilon:
            # Explore: take a random action from the action space
            action = env.action_space.sample()
        else:
            # Exploit: take the action with the highest learned Q-value
            action = np.argmax(q_table[state])
        next_state, reward, done, info = env.step(action)
        old_value = q_table[state, action]
        # Target policy: the maximum Q' over all possible actions in the next state
        next_max = np.max(q_table[next_state])
        # Q-learning update: maximize the state-action value function (Q)
        # over all possible actions in the next state.
        # Q(S, A) = Q(S, A) + α * (R + γ * max_a Q(S', a) - Q(S, A))
        new_value = old_value + alpha * (reward + gamma * next_max - old_value)
        q_table[state, action] = new_value
        state = next_state
        # Accumulate the total reward for this episode
        total_reward += reward
        frames.append(
            {
                'frame': env.render(mode='ansi'),
                'state': state,
                'action': action,
                'reward': reward,
                'episode': i
            }
        )
        epochs += 1
    # Record the episode's total reward
    all_rewards.append(total_reward)
    if i % 100 == 0:
        clear_output(wait=True)
        print("Episode: ", i, "Reward: ", total_reward, "Epochs: ", epochs)