Created
September 4, 2019 06:11
-
-
Save lynaghk/03c6e519cd051a2e252262946a0a80f6 to your computer and use it in GitHub Desktop.
Reinforcement learning sketch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns reinforcement-learning | |
(:require [clojure.set :refer [difference union]] | |
[clojure.string :as str] | |
[lonocloud.synthread :as ->])) | |
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;; Tic Tac Toe bits | |
;; based on https://github.com/paraseba/tictactoe/blob/master/src/tictactoe/core.clj | |
(def empty-board
  "Starting position: a map from player keyword to the set of cell indices
  that player has claimed. Neither player holds any cell yet."
  {:x #{}, :o #{}})
(def all-cells
  "Set of every cell index on the 3x3 board (0 through 8)."
  (into #{} (range 9)))
(def win-cells
  "Every winning line as a vector of sets of cell indices: the three rows,
  the three columns, then the two diagonals."
  [#{0 1 2} #{3 4 5} #{6 7 8}
   #{0 3 6} #{1 4 7} #{2 5 8}
   #{0 4 8} #{2 4 6}])
(defn empty-cells
  "Returns the set of cell indices that neither :x nor :o has claimed on
  `board`."
  [board]
  (let [{xs :x, os :o} board]
    (difference all-cells xs os)))
(defn won?
  "Returns true when the set `cells` fully contains at least one line from
  `win-cells`, otherwise nil."
  [cells]
  (some (fn [line] (every? cells line))
        win-cells))
(defn winner
  "Returns :x or :o when that player's cells contain a winning line on
  `board`, or nil when neither does. :x is checked first."
  [board]
  (some (fn [player]
          (when (won? (get board player))
            player))
        [:x :o]))
(defn draw?
  "Returns true when `board` is a drawn game: every cell is taken and neither
  player has completed a winning line.

  A full board on which the final move completed a line is a win, not a
  draw, so the winner check is part of the definition rather than being left
  to each caller."
  [board]
  (and (empty? (empty-cells board))
       (nil? (winner board))))
(defn mark
  "Returns `board` with `cell` claimed by whichever player is due to move.

  The player to move is inferred from the counts: :x moves when it holds no
  more cells than :o, otherwise :o moves. Asserts that `cell` is currently
  empty."
  [board cell]
  (assert (contains? (empty-cells board) cell))
  (let [x-count (count (:x board))
        o-count (count (:o board))
        turn    (if (<= x-count o-count) :x :o)]
    (update board turn conj cell)))
(defn print-board
  "Prints `board` to *out* as three lines of cells separated by \"|\"
  (\"X\", \"O\" or a space per cell), followed by one blank line."
  [board]
  (let [glyph (fn [idx]
                (cond
                  (contains? (:x board) idx) "X"
                  (contains? (:o board) idx) "O"
                  :else " "))]
    ;; Cells are numbered row-major, so each chunk of three indices is a row.
    (doseq [row-cells (partition 3 (range 9))]
      (println (str/join "|" (map glyph row-cells)))))
  (println))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
;; Reinforcement learning bits | |
(def initial-values
  "Starting value table: a map of board state -> value, from the perspective
  of player x. Empty, so no state has a stored value yet."
  {})
(def default-value
  "Value used for a state that has no entry in the value table."
  0.5)
(defn get-value
  "Returns the value of `state` from player x's perspective.

  Finished games have fixed values: 1 when :x has won, 0 when :o has won,
  and 0 for a draw. Any other state is looked up in `values`, falling back
  to `default-value` when it has no entry."
  [values state]
  (case (winner state)
    :x 1
    :o 0
    ;; No winner: either a drawn board or a game still in progress.
    (if (draw? state)
      0
      (get values state default-value))))
(def α
  "Step size: the fraction of the difference between the next state's value
  and the current state's value that `update-values` applies per update."
  0.2)
(defn update-values
  "Returns `values` with the entry for `state` moved a fraction α of the way
  toward the value of `next-state`."
  [values state next-state]
  (let [current (get-value values state)
        target  (get-value values next-state)
        delta   (- target current)]
    (assoc values state (+ current (* α delta)))))
(defn random-next-state
  "Returns `state` with one uniformly chosen empty cell marked, or `state`
  unchanged when no empty cell remains."
  [state]
  (if-let [cell (rand-nth (seq (empty-cells state)))]
    (mark state cell)
    state))
(defn best-next-state
  "Returns the successor of `state` with the highest value, choosing uniformly
  at random among ties. Returns nil when `state` has no empty cells.

  Successors are scored with `get-value` rather than by looking them up in
  `values` directly. The raw map has no entry for finished games or unseen
  states, so a direct lookup yields nil for an immediately winning move and
  ranks it below every stored value."
  [state values]
  (let [candidates (map (partial mark state) (empty-cells state))]
    (when (seq candidates)
      (let [score     (partial get-value values)
            top-value (apply max (map score candidates))]
        ;; `==` compares numerically, so the integer 1 of a won game ties
        ;; with a learned 1.0 instead of forming a separate group.
        (rand-nth (filterv #(== top-value (score %)) candidates))))))
(defn run
  "Runs the self-play training loop for `num-iterations` steps and returns the
  resulting map of board state -> value.

  A step is either one move or the reset that follows a finished game, so
  `num-iterations` counts loop steps, not games. Player :x picks the
  best-valued move when `(rand)` exceeds 0.2 and a random move otherwise;
  player :o always moves randomly. The value table is updated after every
  move by either player."
  [num-iterations]
  (loop [idx 0
         player :x
         values initial-values
         state empty-board]
    (cond
      ;; done; return values so we can inspect at repl.
      ;; `>=` instead of `=` so a negative, fractional or floating-point
      ;; count (e.g. 2e5, which is not `=` to 200000) still terminates.
      (>= idx num-iterations)
      values

      ;; game is over, begin again with :x to move on an empty board
      (or (draw? state) (winner state))
      (recur (inc idx) :x values empty-board)

      :else
      (let [next-state (case player
                         :x
                         (if (> (rand) 0.2)
                           ;; greedy move
                           (best-next-state state values)
                           ;; exploratory move
                           (random-next-state state))
                         :o
                         (random-next-state state))]
        (recur (inc idx)
               ;; alternate the player to move
               (case player :x :o :o :x)
               (update-values values state next-state)
               next-state)))))
;; REPL scratchpad. Forms inside `comment` are not evaluated when the file is
;; loaded; evaluate them one at a time from an editor/REPL.
(comment | |
;; Train for 200000 loop steps and keep the resulting value table.
(def values | |
(run 200000)) | |
;; Print the ten boards with the highest stored values, best first.
(->> values | |
(sort-by second) | |
reverse | |
(take 10) | |
(map first) | |
(map print-board) | |
(doall)) | |
;; For each of the nine opening moves, print the stored value and the board,
;; in ascending order of value. This looks the board up in `values` directly,
;; so an opening with no stored entry prints nil.
(doseq [[board v] (->> (range 9) | |
(map #(mark empty-board %)) | |
(map (juxt identity values)) | |
(sort-by second))] | |
(println v) | |
(print-board board) | |
(println "")) | |
;; Step through one sample game: `best-next-state` moves alternate with fixed
;; replies at cells 4, 2 and 7, printing the board after every step.
;; NOTE(review): presumably `->/aside` runs its body for side effects and
;; passes the threaded board through unchanged; confirm against synthread.
(-> empty-board | |
(best-next-state values) (->/aside state (print-board state)) | |
(mark 4) (->/aside state (print-board state)) | |
(best-next-state values) (->/aside state (print-board state)) | |
;; (next-states-by-value values) | |
;; (->> | |
;; (map (juxt identity (partial get-value values)) )) | |
(mark 2) (->/aside state (print-board state)) | |
(best-next-state values) (->/aside state (print-board state)) | |
(mark 7) (->/aside state (print-board state)) | |
(best-next-state values) (->/aside state (print-board state)) | |
) | |
;; Print one specific hand-written position.
(print-board {:x #{7 4 8}, :o #{0 3 2}}) | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment