

@simeneide
Created November 15, 2016 09:23
Blackjack models: first a simple threshold policy, then a Q-model that uses more of the observation.
# coding: utf-8
# In[3]:
import gym
import numpy as np
import tensorflow as tf
import keras
import random  # used by the epsilon-greedy Q_policy below
# In[94]:
def play_episode(policy):
    obs = env.reset()
    for _ in range(10):
        action = policy(obs)
        obs, reward, done, info = env.step(action)
        #print obs, reward, done, info
        if done:
            #print 'Our hand: ' + str(obs[0]) + '\t Dealer: ' + str(obs[1]) + '\t reward: ' + str(reward)
            break
    return obs, action, reward
# ## Simple Threshold Strategy
# In[34]:
env = gym.make('Blackjack-v0')
# Hit (action 1) while the player's hand total is at or below the global threshold `th`.
threshold_policy = lambda obs: int(obs[0] <= th)
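
# Optional sanity check (a sketch, not part of the original gist): run a single
# episode with a hypothetical fixed threshold to see the (obs, action, reward)
# tuple that play_episode returns.
th = 17  # hypothetical threshold, chosen only for this one-off check
print play_episode(threshold_policy)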
# In[722]:
threshold = xrange(21)
score = []
OBS = []
ACTION = []
for th in threshold:
    R = []
    for k in xrange(500):
        obs, action, reward = play_episode(threshold_policy)
        OBS.append(obs)
        ACTION.append(action)
        R.append(int(reward))
    score.append(np.mean(R))

get_ipython().magic(u'matplotlib inline')
import matplotlib.pyplot as plt
plt.plot(threshold, score)
print score
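
# A small follow-up sketch (not in the original gist): read off the threshold
# with the highest average reward from the sweep above. Assumes `threshold`
# and `score` from the previous cell.
best_idx = int(np.argmax(score))
print 'Best threshold: ' + str(threshold[best_idx]) + ' with average reward ' + str(score[best_idx])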
# ## Using Linear Regression
# In[712]:
get_ipython().magic(u'matplotlib inline')
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.regularizers import l2
from keras.optimizers import SGD
# In[ ]:
from keras.utils import np_utils

def proc_input(OBS, ACTION):
    # Build the feature matrix: columns are (player sum, dealer card, usable ace, action).
    X = np.zeros((OBS.shape[0], 4))
    X[:, :-1] = OBS
    X[:, 3] = ACTION
    return X

def Q_policy(obs):
    # Epsilon-greedy policy: explore with probability ep, otherwise act greedily on Q.
    if random.random() < ep:
        return random.randint(0, 1)  # explore: pick a random action
    else:
        # Evaluate both actions (0 = stick, 1 = hit) for the current observation.
        X = np.zeros([2, 4])
        X[:, 0:3] = obs
        X[:, 3] = [0, 1]
        Q = Qmodel.predict(X)
        return Q.argmax()
# In[749]:
# Define model:
Qmodel = Sequential()
Qmodel.add(Dense(output_dim=4, input_dim=4, init='uniform'))
Qmodel.add(keras.layers.normalization.BatchNormalization())
Qmodel.add(Activation("relu"))
Qmodel.add(Dense(output_dim=1, W_regularizer=l2(0.01)))
sgd = SGD(lr = 0.005)
Qmodel.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
ep = 0.1
env.monitor.start('/tmp/cartpole-experiment-1', video_callable=False)
score = []
for episode in xrange(200):
    OBS = []
    R = []
    ACTION = []
    for k in xrange(200):
        obs, action, reward = play_episode(Q_policy)
        OBS.append(obs)
        ACTION.append(action)
        R.append(int(reward))
    score.append(np.mean(R))
    if (episode > 50) & (np.max(score) > 0.92) & (ep > 0.01):
        ep = 0.01
        print 'Entering low-epsilon mode at episode ' + str(episode)
    if (episode > 50) & (np.max(score) > 0.97) & (ep > 0):
        ep = 0.0
        print 'Entering greedy mode at episode ' + str(episode)
    X = proc_input(np.array(OBS), ACTION)
    y = np.array(R)
    Qmodel.train_on_batch(X, y)
env.monitor.close()
plt.plot(score)
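
# Hedged evaluation sketch (not in the original gist): after training, run the
# learned Q-model greedily (ep = 0) over a fresh batch of episodes and report
# the average reward. The episode count of 1000 is an arbitrary choice.
ep = 0.0
eval_R = [play_episode(Q_policy)[2] for _ in xrange(1000)]
print 'Greedy Q-policy average reward over 1000 episodes: ' + str(np.mean(eval_R))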