Blackjack models - first a simple threshold policy, then a learned Q-model that uses more of the observation
# coding: utf-8

# In[3]:

import random

import gym
import numpy as np
import tensorflow as tf
import keras
# In[94]:

def play_episode(policy):
    # Play one blackjack episode with the given policy and return the final
    # observation, the last action taken and the final reward.
    obs = env.reset()
    for _ in range(10):
        action = policy(obs)
        obs, reward, done, info = env.step(action)
        #print obs, reward, done, info
        if done:
            #print 'Our hand: ' + str(obs[0]) + '\t Dealer: ' + str(obs[1]) + '\t reward: ' + str(reward)
            break
    return obs, action, reward
# ## Simple Threshold Strategy

# In[34]:

env = gym.make('Blackjack-v0')
# Hit (1) as long as our hand sum is at or below the global threshold th.
threshold_policy = lambda obs: int(obs[0] <= th)
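
# Quick sanity check (not part of the original gist; the fixed threshold here
# is just illustrative): play one episode before sweeping thresholds below.
th = 17
print play_episode(threshold_policy)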
# In[722]:

# Sweep hit/stand thresholds 0..20, estimating mean reward over 500 episodes each.
threshold = xrange(21)
score = []
OBS = []
ACTION = []
for th in threshold:
    R = []
    for k in xrange(500):
        obs, action, reward = play_episode(threshold_policy)
        OBS.append(obs)
        ACTION.append(action)
        R.append(int(reward))
    score.append(np.mean(R))

get_ipython().magic(u'matplotlib inline')
import matplotlib.pyplot as plt
plt.plot(threshold, score)
print score
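
# Follow-up (not in the original gist): read off the best threshold found by
# the sweep above. Since threshold is xrange(21), index equals threshold value.
best_th = int(np.argmax(score))
print 'best threshold:', best_th, 'mean reward:', score[best_th]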
# ## Using Linear Regression

# In[712]:

get_ipython().magic(u'matplotlib inline')
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.regularizers import l2
from keras.optimizers import SGD
# In[ ]:

def proc_input(OBS, ACTION):
    # Stack state-action features: (player sum, dealer card, usable ace, action).
    X = np.zeros((OBS.shape[0], 4))
    X[:, :-1] = OBS
    X[:, 3] = ACTION
    return X

def Q_policy(obs):
    # Epsilon-greedy: with probability ep pick a random action, otherwise
    # take the action with the higher predicted Q-value.
    if random.random() < ep:
        return random.randint(0, 1)
    else:
        X = np.zeros([2, 4])
        X[:, 0:3] = obs
        X[:, 3] = [0, 1]
        Q = Qmodel.predict(X)
        return Q.argmax()
# In[749]:

# Define model:
Qmodel = Sequential()
Qmodel.add(Dense(output_dim=4, input_dim=4, init='uniform'))
Qmodel.add(keras.layers.normalization.BatchNormalization())
Qmodel.add(Activation("relu"))
Qmodel.add(Dense(output_dim=1, W_regularizer=l2(0.01)))
sgd = SGD(lr=0.005)
Qmodel.compile(loss='mean_squared_error', optimizer=sgd, metrics=['mean_squared_error'])
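
# Illustration (not in the original gist; the example hand is arbitrary): the
# untrained model's Q-values for standing vs. hitting on one observation,
# which is the same lookup Q_policy performs.
demo_obs = (15, 10, False)  # player sum, dealer showing, usable ace
print Qmodel.predict(proc_input(np.array([demo_obs, demo_obs]), [0, 1]))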
ep = 0.1
env.monitor.start('/tmp/cartpole-experiment-1', video_callable=False)
score = []
for episode in xrange(200):
    OBS = []
    R = []
    ACTION = []
    # Collect a batch of episodes with the current epsilon-greedy policy.
    for k in xrange(200):
        obs, action, reward = play_episode(Q_policy)
        OBS.append(obs)
        ACTION.append(action)
        R.append(int(reward))
    score.append(np.mean(R))
    # Anneal epsilon once the score gets high enough.
    if (episode > 50) & (np.max(score) > 0.92) & (ep > 0.01):
        ep = 0.01
        print 'Entering low-epsilon mode at episode ' + str(episode)
    if (episode > 50) & (np.max(score) > 0.97) & (ep > 0):
        ep = 0.0
        print 'Entering greedy mode at episode ' + str(episode)
    # Regress predicted Q-values toward the observed episode rewards.
    X = proc_input(np.array(OBS), ACTION)
    y = np.array(R)
    Qmodel.train_on_batch(X, y)

env.monitor.close()
plt.plot(score)
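
# Follow-up sketch (not in the original gist): evaluate the learned policy
# greedily (epsilon = 0) on a batch of fresh episodes.
ep = 0.0
eval_R = [play_episode(Q_policy)[2] for _ in xrange(1000)]
print 'greedy mean reward:', np.mean(eval_R)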