Created
June 10, 2017 05:38
-
-
Save tsu-nera/42ed2cd86e9e0a0cf51fa865105795ac to your computer and use it in GitHub Desktop.
OpenAI Gym FrozenLake8x8 GA https://gym.openai.com/evaluations/eval_ttOXKJkCRJmS34oZCJNhrw
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- Environment setup (OpenAI Gym, old pre-0.26 API: step() returns a 4-tuple) ---
import gym
import numpy as np
from gym import wrappers

env = gym.make("FrozenLake8x8-v0")
# Monitor wrapper records episode statistics/videos under the given directory.
env = wrappers.Monitor(env, '/tmp/frozenlake-experiment-2')
env.reset()
# Sizes of the discrete observation/action spaces, used throughout the GA below.
n_states = env.observation_space.n
n_actions = env.action_space.n
def get_random_policy():
    """Draw a fresh deterministic policy.

    Returns an int array of length ``n_states`` (module global) whose entry
    for each state is an action index sampled uniformly from
    ``range(n_actions)``.
    """
    random_actions = np.random.choice(n_actions, size=n_states)
    return random_actions
def sample_reward(env, policy, t_max=100):
    """Run one episode following the deterministic ``policy`` and return its total reward.

    Args:
        env: environment exposing ``reset()``/``step()`` with the old gym
            4-tuple step API ``(obs, reward, done, info)``.
        policy: indexable mapping state index -> action index.
        t_max: hard cap on the number of steps in the rollout.

    Returns:
        Undiscounted sum of rewards collected during the episode.
    """
    # Bug fix: reset the env at the START of every rollout.  The original
    # assumed s == 0 without resetting, and only reset on is_done — so a
    # rollout truncated at t_max left the env mid-episode and the next call
    # silently continued from a stale state.
    s = env.reset()
    total_reward = 0
    for _ in range(t_max):
        s, reward, is_done, _ = env.step(policy[s])
        total_reward += reward
        if is_done:
            break
    return total_reward
def evaluate(policy, n_times=100):
    """Score ``policy`` by the mean episode reward over ``n_times`` rollouts.

    Uses the module-level ``env`` and ``sample_reward``; returns a float.
    """
    rollout_rewards = []
    for _ in range(n_times):
        rollout_rewards.append(sample_reward(env, policy))
    return float(np.mean(rollout_rewards))
def crossover(policy1, policy2, p=0.5):
    """Uniform crossover of two policies.

    Independently for each state, the child takes the action from
    ``policy2`` with probability ``p`` and from ``policy1`` otherwise.
    """
    take_second = np.random.rand(n_states) < p
    # np.where(mask, x, y) picks x where mask is True — identical to the
    # original np.choose(mask, [policy1, policy2]).
    return np.where(take_second, policy2, policy1)
def mutation(policy, p=0.1):
    """Mutate ``policy``: each state's action is replaced by a uniformly
    random action with probability ``p`` (and kept with probability 1-p).

    Bug fix: the original called ``crossover(get_random_policy(), policy, p)``,
    but crossover takes its SECOND argument with probability p — so p=0.1
    kept only 10% of the original policy and randomized the other 90%,
    inverting the mutation rate.  Passing the policy first restores the
    intended semantics.
    """
    return crossover(policy, get_random_policy(), p)
# --- Genetic-algorithm hyperparameters ---
n_epochs = 50      # how many evolution cycles to run
pool_size = 100    # how many policies to maintain in the population
n_crossovers = 50  # how many crossover children to breed each epoch
n_mutations = 50   # how many mutated copies to add each epoch

print("initializing...")
pool = [get_random_policy() for _ in range(pool_size)]
pool_scores = [evaluate(policy) for policy in pool]

for epoch in range(n_epochs):
    print("Epoch %s:"%epoch)
    # Breed children from pairs of parents drawn (with replacement) from the pool.
    crossovered = [crossover(pool[p1], pool[p2])
                   for p1, p2 in zip(np.random.choice(len(pool), size=n_crossovers),
                                     np.random.choice(len(pool), size=n_crossovers))]
    # Mutated copies of randomly chosen pool members.
    mutated = [mutation(pool[p]) for p in np.random.choice(len(pool), size=n_mutations)]
    # add new policies to the pool
    pool.extend(crossovered)
    pool.extend(mutated)
    # Re-score every candidate (parents included) via Monte-Carlo rollouts.
    pool_scores = [evaluate(p) for p in pool]
    # select the pool_size best policies (argsort is ascending, take the tail)
    selected_indices = np.argsort(pool_scores)[-pool_size:]
    pool = [pool[i] for i in selected_indices]
    pool_scores = [pool_scores[i] for i in selected_indices]
    # print the best policy's score so far (last in ascending score order)
    print("best score:", pool_scores[-1])

env.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment