Skip to content

Instantly share code, notes, and snippets.

@hodzanassredin
Last active December 25, 2015 14:39
Show Gist options
  • Save hodzanassredin/6993020 to your computer and use it in GitHub Desktop.
Save hodzanassredin/6993020 to your computer and use it in GitHub Desktop.
incrDawgImmutable
namespace Utils
open System
open System.Collections.Generic
open System.IO
open System.Linq
module IncrementalDawg =
type StateKey = string
type State<'a when 'a : comparison> =
{ id : uint32; final : bool; edges : Map<'a, State<'a>>; key : StateKey }
let keygen final (edges:Map<'a, State<'a>>) : StateKey =
let res = new System.Text.StringBuilder()
if final then res.Append("1") |> ignore
else res.Append("0") |> ignore
for (kv) in (edges.OrderBy(fun x -> x.Key)) do
res.AppendFormat("_{0}_{1}", kv.Key.ToString(), kv.Value.id.ToString()) |> ignore
res.ToString()
let emptyState() = { id = 0u; final = false; edges = Map.empty; key = (keygen false Map.empty) }
type Wr<'a> = {r:WeakReference}
let box (state:State<'a>) = {r=new WeakReference(state)}
let unbox<'a when 'a : comparison> (wr:Wr<'a>) : option<State<'a>> = if wr.r.IsAlive then Some(wr.r.Target :?> State<'a>) else None
type Dawg<'a when 'a : comparison> =
{ register : Dictionary<StateKey, Wr<'a>>; root : State<'a>; lastId : uint32 }
let getLiveStates<'a when 'a : comparison> (register:Dictionary<StateKey, Wr<'a>>) =
register
|> Seq.map (fun kv-> (unbox kv.Value))
|> Seq.choose (fun v -> v)
let rec reversePrefix (node : State<'a>) (word : list<'a>) (acc : list<option<'a> * State<'a>>) =
match word with
| nextLetter :: suffix when node.edges.ContainsKey nextLetter ->
reversePrefix node.edges.[nextLetter] suffix ((Some(nextLetter), node) :: acc)
| nextLetter :: suffix -> ((Some(nextLetter), node) :: acc, word |> List.rev)
| [] -> ((None, node) :: acc, [])
let rec private getNode state word =
match word with
| letter :: suffix when state.edges.ContainsKey letter-> getNode state.edges.[letter] suffix
| letter :: suffix -> None
| _ -> Some(state)
let private getMinimized dawg state =
let k = state.key
if k |> dawg.register.ContainsKey then
match unbox dawg.register.[k] with
| Some(state) -> state
| None -> dawg.register.Remove(k) |> ignore
dawg.register.Add(k, box state)
state
else
dawg.register.Add(k, box state)
state
let private clone dawg id node childNletter =
let e, final = match childNletter with
|Some(letter, child) -> node.edges.Add(letter, child), node.final
|None -> node.edges, true
let newNode = { id = id + 1u; final = final; edges = e; key = keygen final e}
let min = getMinimized dawg newNode
min, if min.id = newNode.id then newNode.id else id
let empty() = { register = new Dictionary<_, _>(); root = emptyState(); lastId = 0u }
let private createSuffixStates dawg (reverseSuffix:list<'a>) lastId =
let rec createSuffixStatesin dawg (reverseSuffix:list<'a>) lastId (child:option<State<'a>>) =
match (reverseSuffix,child) with
| (letter :: prefix, _) ->
let child = match child with
|Some(c) -> (Some(letter,c))
|None -> None
let node, lastId = clone dawg lastId (emptyState()) child
createSuffixStatesin dawg prefix lastId (Some(node))
| ([], child) -> child, lastId
| _ -> failwith "unreachable code"
createSuffixStatesin dawg reverseSuffix lastId None
let rec clonenodes dawg revstates (child:option<State<'a>>) lastId =
match (revstates, child) with
| ((Some(letter),state) :: prefix, Some(c)) ->
let currentState, lastId = clone dawg lastId state (Some(letter, c))
clonenodes dawg prefix (Some(currentState)) lastId
| ((None, state) :: [], None) when state.final -> state, lastId
| ((None, state) :: [], None) -> clone dawg lastId state None
| ([], Some(c)) -> c, lastId
| _ -> failwith "unreachable code"
let add (word:list<'a>) (dawg:Dawg<'a>) : Dawg<'a> =
let revPrefixStates, revSuffix = reversePrefix dawg.root word []
if revSuffix.Length = 0 && (revPrefixStates.Head |> snd).final then dawg
else let new_state, lastId = createSuffixStates dawg revSuffix dawg.lastId
let prev_state, lastId = clonenodes dawg revPrefixStates new_state lastId
{dawg with root = prev_state; lastId = lastId}
let compact dawg =
GC.Collect()
let garbage = dawg.register.Keys |> Seq.where (fun key -> not dawg.register.[key].r.IsAlive) |> List.ofSeq
for key in garbage do
dawg.register.Remove key |> ignore
let compactIdMap dawg =
let lastId = ref 0u
let idmap = new Dictionary<_,_>()
let ids = dawg.register
|> getLiveStates
|> Seq.map (fun v -> lastId := !lastId + 1u
(v.id, !lastId))
for (id,compactId) in ids do
idmap.Add(id, compactId)
idmap
let nodeCount dawg = dawg.register |> Seq.length |> (+) 1
let edgeCount dawg =
dawg.register
|> getLiveStates
|> Seq.map (fun node -> node.edges |> Seq.length)
|> Seq.sum
|> (+) dawg.root.edges.Count
let getPaths dawg word =
let rec getPathsin node : seq<seq<'a>> =
node.edges
|> Seq.map (fun kv -> (kv.Key, getPathsin kv.Value))
|> Seq.collect (fun (head, tails) ->
seq {
if node.final then yield seq { yield head }
yield! tails |> Seq.map (fun path ->
seq {
yield head
yield! path
})
})
match getNode dawg.root word with
| Some(node) ->
seq {
if node.final then yield (word |> Seq.ofList)
yield! getPathsin node |> Seq.map (fun path ->
seq {
yield! word
yield! path
})
}
| None -> Seq.empty
let contains dawg word =
match getNode dawg.root word with
| Some(node) -> node.final
| None -> false
let save dawg filename =
let idMap = compactIdMap dawg
use wr = new StreamWriter(filename, false)
for node in getLiveStates dawg.register do
if node.edges.Count = 0 then sprintf "%d\t%b" idMap.[node.id] node.final |> wr.WriteLine
else
let nodes =
node.edges
|> Seq.map (fun kv -> sprintf "%s\t%d" (kv.Key.ToString()) idMap.[kv.Value.id])
|> String.concat "\t"
sprintf "%d\t%b\t%s" idMap.[node.id] node.final nodes |> wr.WriteLine
// let nodes =
// dawg.root.edges
// |> Seq.map (fun kv -> sprintf "%s\t%d" (kv.Key.ToString()) idMap.[kv.Value.id])
// |> String.concat "\t"
// sprintf "%d\t%b\t%s" dawg.root.id dawg.root.final nodes |> wr.WriteLine
let load (name : string) parse =
let nodes = new Dictionary<_, _>()
let idMap = new Dictionary<_, _>()
use r = new StreamReader(name)
let mutable lastId = 0u
let mutable lastNode = emptyState()
while not r.EndOfStream do
let line = r.ReadLine()
if line <> "" then
let chunks = line.Split('\t')
let id = UInt32.Parse(chunks.[0])
if id > lastId then lastId <- id
let final = Boolean.Parse(chunks.[1])
let edges = [2..2..(chunks.Length - 1)]
|> Seq.fold(fun (m:Map<'a, State<'a>>) i ->
let childId = UInt32.Parse(chunks.[i + 1])
let child = idMap.[childId]
m.Add(parse chunks.[i], child)) Map.empty
let node = {id = id; final = final; edges = edges; key = keygen final edges}
idMap.Add(id, node)
let key = keygen final edges
nodes.Add(node.key, box node)
lastNode <- node
{ Dawg.register = nodes; root = lastNode; lastId = lastId }
// Learn more about F# at http://fsharp.net
// See the 'F# Tutorial' project for more help.
//lastlines inserted 2000000
//edgeCount 111
//nodeCount 13
//max in edges count 1800001
//Elapsed secs 13.259920
open Utils.IncrementalDawg
open System.Diagnostics
open System.Numerics
open System
[<EntryPoint>]
let main argv =
let item (str:string) = str.ToCharArray() |> List.ofArray
let mutable WordCount = 0
let max = 100
let words = [0..max] |> List.map (fun x -> x.ToString() |> item)
let stopWatch = Stopwatch.StartNew()
let rec addWords words dawg =
match words with
| word :: words -> add word dawg |> addWords words
| _ -> dawg
let e : Dawg<Char> = empty()
let dawg : Dawg<Char> = e |> add List.empty |> addWords words
// let ec = edgeCount dawg
// let inec = int(inEdgeCount dawg)
// assert (ec = inec)
//if ( WordCount % 100 ) = 0 then printfn "inserted words %d" WordCount
stopWatch.Stop()
compact dawg
max |> printfn "lines inserted %d"
dawg |> edgeCount |> printfn "edgeCount %d"
dawg |> nodeCount |> printfn "nodeCount %d"
printfn "Elapsed secs %f" stopWatch.Elapsed.TotalSeconds
// for (_,word) in words do
// assert (word |> contains dawg)
// assert ("1" |> item |> contains dawg)
// assert ((max+1).ToString() |> item |> contains dawg |> not)
save dawg "test.dawg"
let dawg = load "test.dawg" Char.Parse
for word in words do
assert (word |> contains dawg)
assert ("1" |> item |> contains dawg)
assert ((max+1).ToString() |> item |> contains dawg |> not)
let t = (max /100).ToString()
for path in ( t |> item |> getPaths dawg) do
printfn "%A" path
//4.2
//8.17 405
0 // return an integer exit code
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment