RandomPolicyAgent for FrozenLake-v0
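
The script below implements pure random search: it repeatedly samples a deterministic policy, plays it for a batch of episodes, and keeps the policy with the best mean reward. A policy here is nothing more than an array mapping each of FrozenLake-v0's 16 states to one of its 4 actions. A minimal sketch of sampling and reading one such policy, assuming only numpy (not part of the gist itself):

import numpy as np

policy = np.random.randint(0, 4, size=16)  # an action index for every state
action = policy[0]                         # the action to take in state 0
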
#!/usr/bin/env python3
# frozen-lake.py
"""FrozenLake игрушечная задача для обучения с подкреплением.
"""
import logging

import click
import gym
import gym.wrappers.monitoring
import numpy as np
from tqdm import tqdm


class RandomPolicyAgent(object):

    def __init__(self, env=None, num_episodes=100, num_samples=100,
                 policy=None):
        self.env = env or gym.make('FrozenLake-v0')
        self.n_actions = self.env.action_space.n
        self.n_states = self.env.observation_space.n
        self.n_episodes = num_episodes  # step limit within a single episode
        self.episode = 0
        self.num_samples = num_samples  # episodes played per candidate policy
        self.best_policy = policy
        self.best_reward = float('-inf')
        self.last_policy = self.random_policy()
        self.last_reward = None

    def play(self, eval=False):
        # Time to sample a new candidate policy?
        sample = self.episode % self.num_samples
        if sample == 0 and not eval:
            self.last_policy = self.random_policy()  # new candidate policy
            self.last_reward = np.zeros(self.num_samples)
        elif eval:
            self.last_policy = self.best_policy

        # Play a single episode.
        policy = self.last_policy
        state = self.env.reset()
        total_reward = 0
        for _ in range(self.n_episodes):
            action = policy[state]
            state, reward, done, _ = self.env.step(action)
            total_reward += reward
            if done:
                break

        # Update the counters.
        self.last_reward[sample] = total_reward
        self.episode += 1  # so that a new policy gets tried eventually

        # Found a better policy?
        if sample + 1 == self.num_samples \
                and self.last_reward.mean() > self.best_reward:
            self.best_reward = self.last_reward.mean()
            self.best_policy = self.last_policy
            logging.info('best reward is %7.3f', self.best_reward)

        return total_reward

    def evaluate(self, num_samples=100):
        rewards = np.zeros(num_samples)
        backup_num_samples = self.num_samples
        self.num_samples = num_samples
        self.last_reward = np.zeros(self.num_samples)
        for i in range(num_samples):
            rewards[i] = self.play(eval=True)
        self.num_samples = backup_num_samples
        self.last_reward = np.zeros(self.num_samples)
        return rewards.mean()

    def random_policy(self, size=None):
        size = size or self.n_states
        return np.random.randint(0, self.n_actions, size=size)


def window_score(rewards, length):
    """Return the best mean reward over any window of the given length."""
    best_start = None
    best_score = float('-inf')
    for i in range(rewards.shape[0] - length + 1):
        score = rewards[i:i + length].mean()
        if score > best_score:
            best_score = score
            best_start = i
    return best_score


def evaluate(agent, train_steps=100000, eval_steps=100):
    logging.info('train agent on %d steps', train_steps)
    logging.info('evaluate agent on a window of %d steps', eval_steps)
    rewards = np.zeros(train_steps)
    for i in tqdm(range(train_steps)):
        rewards[i] = agent.play()
    reward = window_score(rewards, eval_steps)
    return reward, rewards


@click.command(help=__doc__)
@click.option('--episodes', default=100000)
@click.option('--monitor', default=None)
def main(episodes, monitor):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    if monitor:
        # Silence the monitor's own logging and record episodes to `monitor`.
        logger = logging.getLogger(gym.wrappers.monitoring.__name__)
        logger.setLevel(logging.WARNING)
        env = gym.make('FrozenLake-v0')
        env = gym.wrappers.Monitor(env, monitor, force=True)
    else:
        env = gym.make('FrozenLake-v0')

    agent = RandomPolicyAgent(env, num_samples=100)
    score, hist = evaluate(agent, episodes)
    logging.info('Final average: %7.3f', score)


if __name__ == '__main__':
    main()
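
Following the file-name comment at the top of the script, it can be run from the shell as python3 frozen-lake.py --episodes 100000 (the --monitor option takes a directory for gym's episode recordings). A hypothetical programmatic use, assuming the module is importable and a gym version that still ships FrozenLake-v0:

import gym

env = gym.make('FrozenLake-v0')
agent = RandomPolicyAgent(env, num_samples=100)
score, history = evaluate(agent, train_steps=10000)
print('best windowed average: %.3f' % score)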