Q-Learning
# Import required libraries
import gym
import matplotlib.pyplot as plt
import random
import numpy as np
from IPython.display import clear_output
# Create an instance of the Taxi-v3 environment
env = gym.make("Taxi-v3").env
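# Taxi-v3 has a discrete state space of 500 states (25 taxi positions x
# 5 passenger locations x 4 destinations) and 6 actions (move south, north,
# east, west, pick up, drop off), so the Q-table below is a 500 x 6 array.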
# Create the Q-table: one row per state, one column per action
q_table = np.zeros([env.observation_space.n, env.action_space.n])
# Define hyperparameters
alpha = 0.1    # learning rate
gamma = 0.6    # discount factor
epsilon = 0.4  # exploration rate for the epsilon-greedy behaviour policy
# Q-learning uses the maximum Q' over all possible actions for the next state
all_rewards = []
episodes = 100000
frames = []

for i in range(1, episodes + 1):
    state = env.reset()
    epochs, total_reward = 0, 0
    done = False
    while not done:
        # Behavioural policy that both explores and exploits (epsilon-greedy)
        if random.uniform(0, 1) < epsilon:
            # Explore: take a random action from the action space
            action = env.action_space.sample()
        else:
            # Exploit: take the action with the highest learned Q-value
            action = np.argmax(q_table[state])
        next_state, reward, done, info = env.step(action)
        old_value = q_table[state, action]
        # Target policy: the maximum Q' over all possible actions in the next state
        next_max = np.max(q_table[next_state])
        # Q-learning update: maximize the state-action value function (Q)
        # over all possible actions in the next state.
        # Q(S, A) = Q(S, A) + α * (R + γ * max_a Q(S', a) - Q(S, A))
        new_value = old_value + alpha * (reward + gamma * next_max - old_value)
        q_table[state, action] = new_value
        state = next_state
        # Accumulate the total reward for this episode
        total_reward += reward
        frames.append(
            {
                'frame': env.render(mode='ansi'),
                'state': state,
                'action': action,
                'reward': reward,
                'episode': i
            }
        )
        epochs += 1
    # Record the episode's total reward
    all_rewards.append(total_reward)
    if i % 100 == 0:
        clear_output(wait=True)
        print("Episode: ", i, "Reward: ", total_reward, "Epochs: ", epochs)