Created
September 13, 2017 23:21
-
-
Save sir-deenicus/4f8d8bd1e3a5b0b5b30aa8e51961945f to your computer and use it in GitHub Desktop.
Simple Counter-factual regret minimization example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open Prelude.Common | |
open System | |
open Prelude.Math | |
open Prelude | |
open SimpleTrees | |
type Node = { | |
regretSum : double [] ; | |
strategySum : double []; | |
} | |
let getStrategy weight (strategySum:_[]) regretSum = | |
let strategy = Array.map (max 0.) regretSum | |
let nSum, numactions = Array.sum strategy, strategy.Length | |
let strategy' = | |
if nSum <= 0. then | |
Array.create numactions (1./float numactions) | |
else Array.map (fun x -> x/nSum) strategy | |
for a in 0..numactions - 1 do strategySum.[a] <- strategySum.[a] + weight * strategy'.[a] | |
strategy' | |
let getAvgStrategy strategySum = | |
let nSum, numactions = Array.sum strategySum, strategySum.Length | |
if nSum <= 0. then | |
Array.create numactions (1./float numactions) | |
else Array.map (fun x -> x/nSum) strategySum | |
let newNode n = { regretSum = Array.create n 0.; strategySum = Array.create n 0.} | |
let rec cfr d (nodeMap:Dict<_,_>) reward (actions:_[]) (history:string) p0 p1 = | |
let player = history.Length % 2 | |
let nummoves = actions.Length | |
let node = | |
match nodeMap.tryFind history with | |
| None -> let n = newNode nummoves in nodeMap.Add(history,n) ; n | |
| Some n -> n | |
match reward history with | |
| Some r -> r | |
| None -> | |
let strategy = getStrategy (if player = 0 then p0 else p1) node.strategySum node.regretSum | |
let util = Array.create nummoves 0. | |
let mutable nodeUtil = 0. | |
for a in 0..nummoves - 1 do | |
let h' = history + actions.[a] | |
util.[a] <- if player = 0 then | |
cfr (d+1) nodeMap reward actions h' (p0 * strategy.[a]) p1 | |
else cfr (d+1) nodeMap reward actions h' p0 (p1 * strategy.[a]) | |
nodeUtil <- nodeUtil + strategy.[a] * util.[a] | |
for a in 0..nummoves - 1 do | |
let regret = util.[a] - nodeUtil; | |
node.regretSum.[a] <- node.regretSum.[a] + (if player = 0 then p1 else p0) * regret | |
nodeUtil |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment