awjuliani/Q-Net Learning Clean.ipynb

Created August 25, 2016 20:30

Star (38) You must be signed in to star a gist
Fork (23) You must be signed in to fork a gist

Learn more about clone URLs
Clone this repository at <script src="https://gist.github.com/awjuliani/4d69edad4d0ed9a5884f3cdcf0ea0874.js"></script>
Save awjuliani/4d69edad4d0ed9a5884f3cdcf0ea0874 to your computer and use it in GitHub Desktop.

Download ZIP

Basic Q-Learning algorithm using Tensorflow

Raw

Q-Net Learning Clean.ipynb

Sorry, something went wrong. Reload?

Sorry, we cannot display this file.

Sorry, this file is invalid so it cannot be displayed.

bdytx5 commented Dec 3, 2020 •

edited

Loading

I made some additions to visualize the loss using tensorboard. Consider implementing these changes in the tutorial, as there is a good amount of confusion in the comments regarding the loss. Also would mention in the tutorial what version of python/tensorflow work for this tutorial (py 3.6.12 & tf 1.15). Enjoy!

import gym
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
# %matplotlib inline
env = gym.make('FrozenLake-v0')

tf.reset_default_graph()
#These lines establish the feed-forward part of the network used to choose actions
inputs1 = tf.placeholder(shape=[1,16],dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16,4],0,0.01))
Qout = tf.matmul(inputs1,W)
predict = tf.argmax(Qout,1) # best action (index)

#Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape=[1,4],dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout)) 
lossSm = tf.summary.scalar("loss", loss) # scalar summary for tensorboard

trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)
init = tf.initialize_all_variables()

# Set learning parameters
y = .99
e = 0.1
num_episodes = 2000
#create lists to contain total rewards and steps per episode
jList = []
rList = []

with tf.Session() as sess:
    writer = tf.summary.FileWriter("log", sess.graph)

    sess.run(init)
    
    for i in range(num_episodes):
        #Reset environment and get first new observation
        s = env.reset()
        rAll = 0
        d = False
        j = 0
        #The Q-Network
        while j < 99:
            j+=1
            #Choose an action by greedily (with e chance of random action) from the Q-network
            a,allQ = sess.run([predict,Qout],feed_dict={inputs1:np.identity(16)[s:s+1]}) 
            if np.random.rand(1) < e:
                a[0] = env.action_space.sample()

            #Get new state and reward from environment
            s1,r,d,_ = env.step(a[0])
            
            #Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(Qout,feed_dict={inputs1:np.identity(16)[s1:s1+1]}) 
            
            #Obtain maxQ' and set our target value for chosen action.
            maxQ1 = np.max(Q1) # NOT argmax - just the max q-val
            targetQ = allQ # allQ = sess.run(Qout,s:s+1) -- prev q-vals
            targetQ[0,a[0]] = r + y*maxQ1 # a = sess.run(predict,s:s+1)  -- Bellman eq.
            
            #Train our network using target and predicted Q values
            # The W1 is not used, but left for visualization purposes for those interested 
            # in what the value is. The sess.run line is indeed updating W. - awjuliani
            lo,_,W1 = sess.run([lossSm,updateModel,W],feed_dict={inputs1:np.identity(16)[s:s+1],nextQ:targetQ})
            rAll += r
            s = s1
            writer.add_summary(lo) # visualize loss fuction 
            if d == True:
                #Reduce chance of random action as we train the model.
                e = 1./((i/50) + 10)
                break
        jList.append(j)
        rList.append(rAll)
plt.plot(rList)

matlabninja commented Feb 23, 2021 •

edited

Loading

I've come across an oddity that I'm having trouble understanding. Running this code as written in Tensorflow works just fine for me, but trying to re-implement it in another framework (Pytorch, Keras) left me with a network that seemed unable to learn the game. It looks to me like the randomly initialized weights in the linear layer pass on bogus future reward estimates when the agent loses the game.
Explained another way, when the agent lost the game, instead of getting 0 reward for that action, it was getting 0 plus the max future reward for the "next step" of the game instead of just 0. I was able to get the agent to learn the game with this modification:

        if d == True:
            targetQ[0,a] = r
        else:
            targetQ[0,a] = r + y*maxQ1

Based on the explanation I've come up with, this modification makes perfect sense to me, but I'm left wondering why the example here does not have the same issue. Thoughts?
Reference code:

class FrozenLakeNet(nn.Module):
    def __init__(self):
        super(FrozenLakeNet,self).__init__()
        self.fc = nn.Linear(16,4,bias=False)
        self.fc.weight.data.uniform_(0,.01)
    def forward(self,xIn):
        x = self.fc(xIn)
        return(x)

# Create list of state vecotrs on device
states = []
device = torch.device('cuda:0')
for s in range(16):
    sv = torch.tensor(np.identity(16)[s:s+1].astype(np.float32))
    svG = sv.to(device)
    states.append(svG)

# Initialize network
net = FrozenLakeNet()
net.to(device)
# Setup loss
criterion = nn.MSELoss(reduction='sum')
# Setup optimizer
optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0)

# Set learning parameters
y = .99
num_episodes = 50000
#create lists to contain total rewards and steps per episode
jList = []
rList = []
e = .25
randSel = 0
tot = 0
for i in range(num_episodes):
    #Reset environment and get first new observation
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    sSeq = []
    # Set random epsilon for episode
    #eps = 1-(i/num_episodes)
    #The Q-Table learning algorithm
    while j < 99:
        j+=1
        # Zero gradients
        optimizer.zero_grad()
        #Choose an action by greedily (with e chance of random action) from the Q-network
        #a,allQ = sess.run([predict,Qout],feed_dict={inputs1:np.identity(16)[s:s+1]})
        allQ = net(states[s])
        # Convert the state to an action
        a = int(torch.argmax(allQ).cpu().detach())
        tot += 1
        if np.random.rand(1) < e:
            randSel+=1
            a = env.action_space.sample()
        #Get new state and reward from environment
        s1,r,d,_ = env.step(a)
        # Get predicted Q values from new state
        #Q1 = sess.run(Qout,feed_dict={inputs1:np.identity(16)[s1:s1+1]})
        Q1 = net(states[s1])
        # Get the value of the 'best' action from the network
        maxQ1 = torch.max(Q1)
        # Get the target Q from the initial state
        targetQ = allQ.clone()
        # Update the target Q with new information
        if d == True:
            targetQ[0,a] = r
        else:
            targetQ[0,a] = r + y*maxQ1 ### Using this reward regardless of "done" output results in not learning the game.
        #Train our network using target and predicted Q values
        #_,W1 = sess.run([updateModel,W],feed_dict={inputs1:np.identity(16)[s:s+1],nextQ:targetQ})
        # Compute the loss
        loss = criterion(targetQ,allQ)
        # Compute gradients
        loss.backward()
        # Apply learnings
        optimizer.step()
        rAll += r
        s = s1
        
        if d == True:
            e = 1./((i/50.) + 4)
            break
    jList.append(j)
    rList.append(rAll)

awjuliani/Q-Net Learning Clean.ipynb

bdytx5 commented Dec 3, 2020 •

edited

Loading

Uh oh!

matlabninja commented Feb 23, 2021 •

edited

Loading

Uh oh!

awjuliani/Q-Net Learning Clean.ipynb

bdytx5 commented Dec 3, 2020 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

matlabninja commented Feb 23, 2021 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

bdytx5 commented Dec 3, 2020 •

edited

Loading

matlabninja commented Feb 23, 2021 •

edited

Loading