Skip to content

Instantly share code, notes, and snippets.

@hodzanassredin
Last active December 24, 2015 09:49
Show Gist options
  • Save hodzanassredin/6779538 to your computer and use it in GitHub Desktop.
Save hodzanassredin/6779538 to your computer and use it in GitHub Desktop.
naive incremental(unsorted) dawg in fsharp
namespace Utils
open System
open System.Collections.Generic
open System.IO
open System.Linq
module IncrementalDawg =
type StateKey = string
type State<'a when 'a : equality> =
{ mutable id : uint32
mutable final : bool
edges : Dictionary<'a, State<'a>>
mutable inEdges : uint32
mutable key : StateKey }
let private (===) x y = x.id = y.id
let emptyState id =
{ id = id
final = false
edges = new Dictionary<_, _>()
inEdges = 0u
key = StateKey.Empty }
let rec decreaseInEdge node =
if node.inEdges > 0u then node.inEdges <- node.inEdges - 1u
if node.inEdges = 0u then
for child in node.edges.Values do
decreaseInEdge child
let setEdge parent symbol child =
//assert (child.inEdges < UInt32.MaxValue)
child.inEdges <- child.inEdges + 1u
if parent.edges.ContainsKey symbol then
let prev = parent.edges.[symbol]
//assert (prev.inEdges > 0u)
decreaseInEdge prev
parent.edges.[symbol] <- child
else parent.edges.Add(symbol, child)
parent.key <- StateKey.Empty
let key node : StateKey =
if node.key = StateKey.Empty then
let res = new System.Text.StringBuilder()
if node.final then res.Append("1") |> ignore
else res.Append("0") |> ignore
for (kv) in (node.edges.OrderBy(fun x -> x.Key)) do
res.AppendFormat("_{0}_{1}", kv.Key.ToString(), kv.Value.id.ToString()) |> ignore
node.key <- res.ToString()
node.key
type Dawg<'a when 'a : equality> =
{ register : Dictionary<StateKey, State<'a>>
root : State<'a>
mutable lastId : uint32 }
let rec reversePrefix (node : State<'a>) (word : list<'a>) (acc : list<option<'a> * State<'a>>) =
match word with
| nextLetter :: suffix when node.edges.ContainsKey nextLetter ->
reversePrefix node.edges.[nextLetter] suffix ((Some(nextLetter), node) :: acc)
| nextLetter :: _ -> ((Some(nextLetter), node) :: acc, word |> List.rev)
| _ -> ((None, node) :: acc, word |> List.rev)
let private isConfluence state = state.inEdges > 1u
let rec private splitByConfluence dawg states acc =
match states with
| (_, state) :: _ when isConfluence state -> acc, states
| head :: tail -> splitByConfluence dawg tail (head :: acc)
| _ -> acc, []
let rec private getNode state word =
match word with
| letter :: suffix ->
if state.edges.ContainsKey letter then getNode state.edges.[letter] suffix
else None
| _ -> Some(state)
let private clone id node =
for edge in node.edges do
edge.Value.inEdges <- edge.Value.inEdges + 1u
{ id = id
final = node.final
edges = new Dictionary<_, _>(node.edges)
inEdges = 0u
key = node.key }
let private getMinimized dawg state =
let k = key state
if k |> dawg.register.ContainsKey then dawg.register.[k]
else
dawg.register.Add(k, state)
state
let empty() =
{ register = new Dictionary<_, _>()
root = emptyState 0u
lastId = 0u }
let private createSuffixStates dawg reverseSuffix idgen =
let mutable child = None
let mutable childLetter = None
for letter in reverseSuffix do
let node =
{ id = idgen()
final = child.IsNone
edges = new Dictionary<_, _>()
inEdges = 0u
key = StateKey.Empty }
if child.IsSome then setEdge node childLetter.Value child.Value
child <- Some(getMinimized dawg node)
childLetter <- Some(letter)
child
let compactIds dawg =
dawg.lastId <- 0u
let states =
dawg.register
|> Seq.map (fun kv ->
dawg.lastId <- dawg.lastId + 1u
kv.Value.id <- dawg.lastId
kv.Value)
|> List.ofSeq
dawg.register.Clear()
for state in states do
dawg.register.Add(key state, state)
dawg.lastId <- dawg.lastId + 1u
if dawg.lastId = 0u then failwith "id overflow"
let add dawg word : unit =
let id() =
dawg.lastId <- dawg.lastId + 1u
//if dawg.lastId = 0u then compactIds dawg //overflow prevent
dawg.lastId
let revPrefixStates, revSuffix = reversePrefix dawg.root word []
if revSuffix.Length > 0 then
let noConfluence =
splitByConfluence dawg revPrefixStates []
|> snd
|> List.isEmpty
let new_state = (createSuffixStates dawg revSuffix id).Value
let revPrefixStates, new_state =
if noConfluence then revPrefixStates, new_state
else
let states = List.rev revPrefixStates
let revPrefixStates, path = splitByConfluence dawg states []
let path = path |> List.rev
let mutable prev_state = new_state
for (letter, state) in path do
let currentState = state |> clone (id())
setEdge currentState letter.Value prev_state
prev_state <- getMinimized dawg currentState
revPrefixStates, prev_state
let rec changeRightLanguage (revPrefixStates : list<option<'a> * State<'a>>) prev_state =
match revPrefixStates with
| [ (Some(letter), root) ] -> setEdge root letter prev_state
| (Some(letter), state) :: tail ->
if state.edges.ContainsKey letter && state.edges.[letter] === prev_state then ()
else
state
|> key
|> dawg.register.Remove
|> ignore
setEdge state letter prev_state
changeRightLanguage tail (getMinimized dawg state)
| _ -> failwith "unreachable code"
changeRightLanguage revPrefixStates new_state
let nodeCount dawg =
dawg.register
|> Seq.length
|> (+) 1
let edgeCount dawg =
dawg.register
|> Seq.map (fun node -> node.Value.edges |> Seq.length)
|> Seq.sum
|> (+) dawg.root.edges.Count
let inEdgeCount dawg =
dawg.register
|> Seq.map (fun node -> node.Value.inEdges)
|> Seq.sum
let getPaths dawg word =
let rec getPathsin node : seq<seq<'a>> =
node.edges
|> Seq.map (fun kv -> (kv.Key, getPathsin kv.Value))
|> Seq.collect (fun (head, tails) ->
seq {
if node.final then yield seq { yield head }
yield! tails |> Seq.map (fun path ->
seq {
yield head
yield! path
})
})
match getNode dawg.root word with
| Some(node) ->
seq {
if node.final then yield (word |> Seq.ofList)
yield! getPathsin node |> Seq.map (fun path ->
seq {
yield! word
yield! path
})
}
| None -> Seq.empty
let contains dawg word =
let node = getNode dawg.root word
match node with
| Some(node) -> node.final
| None -> false
let save dawg filename =
compactIds dawg
use wr = new StreamWriter(filename, false)
for fc in dawg.register do
let node = fc.Value
if fc.Value.edges.Count = 0 then sprintf "%d\t%b" node.id node.final |> wr.WriteLine
else
let nodes =
fc.Value.edges
|> Seq.map (fun kv -> sprintf "%s\t%d" (kv.Key.ToString()) kv.Value.id)
|> String.concat "\t"
sprintf "%d\t%b\t%s" node.id node.final nodes |> wr.WriteLine
let nodes =
dawg.root.edges
|> Seq.map (fun kv -> sprintf "%s\t%d" (kv.Key.ToString()) kv.Value.id)
|> String.concat "\t"
sprintf "%d\t%b\t%s" dawg.root.id dawg.root.final nodes |> wr.WriteLine
let load (name : string) parse =
let nodes = new Dictionary<_, _>()
let idMap = new Dictionary<_, _>()
use r = new StreamReader(name)
let mutable rootkey = StateKey.Empty
let mutable lastId = 0u
let getById id =
if idMap.ContainsKey id then idMap.[id]
else
let node =
{ id = id
final = false
edges = new Dictionary<_, _>()
inEdges = 0u
key = StateKey.Empty }
idMap.Add(id, node)
node
while not r.EndOfStream do
let line = r.ReadLine()
if line <> "" then
let chunks = line.Split('\t')
let id = UInt32.Parse(chunks.[0])
let node = getById id
node.final <- Boolean.Parse(chunks.[1])
if node.id > lastId then lastId <- node.id
for i in 2..2..(chunks.Length - 1) do
let childId = UInt32.Parse(chunks.[i + 1])
let child = getById childId
child.inEdges <- child.inEdges + 1u
node.edges.Add(parse chunks.[i], child)
let key = key node
nodes.Add(key, node)
if node.id = 0u then rootkey <- key
let root = nodes.[rootkey]
nodes.Remove rootkey |> ignore
{ Dawg.register = nodes
root = root
lastId = lastId }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment