Last active
December 24, 2015 09:49
-
-
Save hodzanassredin/6779538 to your computer and use it in GitHub Desktop.
naive incremental(unsorted) dawg in fsharp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| namespace Utils | |
| open System | |
| open System.Collections.Generic | |
| open System.IO | |
| open System.Linq | |
| module IncrementalDawg = | |
| type StateKey = string | |
| type State<'a when 'a : equality> = | |
| { mutable id : uint32 | |
| mutable final : bool | |
| edges : Dictionary<'a, State<'a>> | |
| mutable inEdges : uint32 | |
| mutable key : StateKey } | |
| let private (===) x y = x.id = y.id | |
| let emptyState id = | |
| { id = id | |
| final = false | |
| edges = new Dictionary<_, _>() | |
| inEdges = 0u | |
| key = StateKey.Empty } | |
| let rec decreaseInEdge node = | |
| if node.inEdges > 0u then node.inEdges <- node.inEdges - 1u | |
| if node.inEdges = 0u then | |
| for child in node.edges.Values do | |
| decreaseInEdge child | |
| let setEdge parent symbol child = | |
| //assert (child.inEdges < UInt32.MaxValue) | |
| child.inEdges <- child.inEdges + 1u | |
| if parent.edges.ContainsKey symbol then | |
| let prev = parent.edges.[symbol] | |
| //assert (prev.inEdges > 0u) | |
| decreaseInEdge prev | |
| parent.edges.[symbol] <- child | |
| else parent.edges.Add(symbol, child) | |
| parent.key <- StateKey.Empty | |
| let key node : StateKey = | |
| if node.key = StateKey.Empty then | |
| let res = new System.Text.StringBuilder() | |
| if node.final then res.Append("1") |> ignore | |
| else res.Append("0") |> ignore | |
| for (kv) in (node.edges.OrderBy(fun x -> x.Key)) do | |
| res.AppendFormat("_{0}_{1}", kv.Key.ToString(), kv.Value.id.ToString()) |> ignore | |
| node.key <- res.ToString() | |
| node.key | |
| type Dawg<'a when 'a : equality> = | |
| { register : Dictionary<StateKey, State<'a>> | |
| root : State<'a> | |
| mutable lastId : uint32 } | |
| let rec reversePrefix (node : State<'a>) (word : list<'a>) (acc : list<option<'a> * State<'a>>) = | |
| match word with | |
| | nextLetter :: suffix when node.edges.ContainsKey nextLetter -> | |
| reversePrefix node.edges.[nextLetter] suffix ((Some(nextLetter), node) :: acc) | |
| | nextLetter :: _ -> ((Some(nextLetter), node) :: acc, word |> List.rev) | |
| | _ -> ((None, node) :: acc, word |> List.rev) | |
| let private isConfluence state = state.inEdges > 1u | |
| let rec private splitByConfluence dawg states acc = | |
| match states with | |
| | (_, state) :: _ when isConfluence state -> acc, states | |
| | head :: tail -> splitByConfluence dawg tail (head :: acc) | |
| | _ -> acc, [] | |
| let rec private getNode state word = | |
| match word with | |
| | letter :: suffix -> | |
| if state.edges.ContainsKey letter then getNode state.edges.[letter] suffix | |
| else None | |
| | _ -> Some(state) | |
| let private clone id node = | |
| for edge in node.edges do | |
| edge.Value.inEdges <- edge.Value.inEdges + 1u | |
| { id = id | |
| final = node.final | |
| edges = new Dictionary<_, _>(node.edges) | |
| inEdges = 0u | |
| key = node.key } | |
| let private getMinimized dawg state = | |
| let k = key state | |
| if k |> dawg.register.ContainsKey then dawg.register.[k] | |
| else | |
| dawg.register.Add(k, state) | |
| state | |
| let empty() = | |
| { register = new Dictionary<_, _>() | |
| root = emptyState 0u | |
| lastId = 0u } | |
| let private createSuffixStates dawg reverseSuffix idgen = | |
| let mutable child = None | |
| let mutable childLetter = None | |
| for letter in reverseSuffix do | |
| let node = | |
| { id = idgen() | |
| final = child.IsNone | |
| edges = new Dictionary<_, _>() | |
| inEdges = 0u | |
| key = StateKey.Empty } | |
| if child.IsSome then setEdge node childLetter.Value child.Value | |
| child <- Some(getMinimized dawg node) | |
| childLetter <- Some(letter) | |
| child | |
| let compactIds dawg = | |
| dawg.lastId <- 0u | |
| let states = | |
| dawg.register | |
| |> Seq.map (fun kv -> | |
| dawg.lastId <- dawg.lastId + 1u | |
| kv.Value.id <- dawg.lastId | |
| kv.Value) | |
| |> List.ofSeq | |
| dawg.register.Clear() | |
| for state in states do | |
| dawg.register.Add(key state, state) | |
| dawg.lastId <- dawg.lastId + 1u | |
| if dawg.lastId = 0u then failwith "id overflow" | |
| let add dawg word : unit = | |
| let id() = | |
| dawg.lastId <- dawg.lastId + 1u | |
| //if dawg.lastId = 0u then compactIds dawg //overflow prevent | |
| dawg.lastId | |
| let revPrefixStates, revSuffix = reversePrefix dawg.root word [] | |
| if revSuffix.Length > 0 then | |
| let noConfluence = | |
| splitByConfluence dawg revPrefixStates [] | |
| |> snd | |
| |> List.isEmpty | |
| let new_state = (createSuffixStates dawg revSuffix id).Value | |
| let revPrefixStates, new_state = | |
| if noConfluence then revPrefixStates, new_state | |
| else | |
| let states = List.rev revPrefixStates | |
| let revPrefixStates, path = splitByConfluence dawg states [] | |
| let path = path |> List.rev | |
| let mutable prev_state = new_state | |
| for (letter, state) in path do | |
| let currentState = state |> clone (id()) | |
| setEdge currentState letter.Value prev_state | |
| prev_state <- getMinimized dawg currentState | |
| revPrefixStates, prev_state | |
| let rec changeRightLanguage (revPrefixStates : list<option<'a> * State<'a>>) prev_state = | |
| match revPrefixStates with | |
| | [ (Some(letter), root) ] -> setEdge root letter prev_state | |
| | (Some(letter), state) :: tail -> | |
| if state.edges.ContainsKey letter && state.edges.[letter] === prev_state then () | |
| else | |
| state | |
| |> key | |
| |> dawg.register.Remove | |
| |> ignore | |
| setEdge state letter prev_state | |
| changeRightLanguage tail (getMinimized dawg state) | |
| | _ -> failwith "unreachable code" | |
| changeRightLanguage revPrefixStates new_state | |
| let nodeCount dawg = | |
| dawg.register | |
| |> Seq.length | |
| |> (+) 1 | |
| let edgeCount dawg = | |
| dawg.register | |
| |> Seq.map (fun node -> node.Value.edges |> Seq.length) | |
| |> Seq.sum | |
| |> (+) dawg.root.edges.Count | |
| let inEdgeCount dawg = | |
| dawg.register | |
| |> Seq.map (fun node -> node.Value.inEdges) | |
| |> Seq.sum | |
| let getPaths dawg word = | |
| let rec getPathsin node : seq<seq<'a>> = | |
| node.edges | |
| |> Seq.map (fun kv -> (kv.Key, getPathsin kv.Value)) | |
| |> Seq.collect (fun (head, tails) -> | |
| seq { | |
| if node.final then yield seq { yield head } | |
| yield! tails |> Seq.map (fun path -> | |
| seq { | |
| yield head | |
| yield! path | |
| }) | |
| }) | |
| match getNode dawg.root word with | |
| | Some(node) -> | |
| seq { | |
| if node.final then yield (word |> Seq.ofList) | |
| yield! getPathsin node |> Seq.map (fun path -> | |
| seq { | |
| yield! word | |
| yield! path | |
| }) | |
| } | |
| | None -> Seq.empty | |
| let contains dawg word = | |
| let node = getNode dawg.root word | |
| match node with | |
| | Some(node) -> node.final | |
| | None -> false | |
| let save dawg filename = | |
| compactIds dawg | |
| use wr = new StreamWriter(filename, false) | |
| for fc in dawg.register do | |
| let node = fc.Value | |
| if fc.Value.edges.Count = 0 then sprintf "%d\t%b" node.id node.final |> wr.WriteLine | |
| else | |
| let nodes = | |
| fc.Value.edges | |
| |> Seq.map (fun kv -> sprintf "%s\t%d" (kv.Key.ToString()) kv.Value.id) | |
| |> String.concat "\t" | |
| sprintf "%d\t%b\t%s" node.id node.final nodes |> wr.WriteLine | |
| let nodes = | |
| dawg.root.edges | |
| |> Seq.map (fun kv -> sprintf "%s\t%d" (kv.Key.ToString()) kv.Value.id) | |
| |> String.concat "\t" | |
| sprintf "%d\t%b\t%s" dawg.root.id dawg.root.final nodes |> wr.WriteLine | |
| let load (name : string) parse = | |
| let nodes = new Dictionary<_, _>() | |
| let idMap = new Dictionary<_, _>() | |
| use r = new StreamReader(name) | |
| let mutable rootkey = StateKey.Empty | |
| let mutable lastId = 0u | |
| let getById id = | |
| if idMap.ContainsKey id then idMap.[id] | |
| else | |
| let node = | |
| { id = id | |
| final = false | |
| edges = new Dictionary<_, _>() | |
| inEdges = 0u | |
| key = StateKey.Empty } | |
| idMap.Add(id, node) | |
| node | |
| while not r.EndOfStream do | |
| let line = r.ReadLine() | |
| if line <> "" then | |
| let chunks = line.Split('\t') | |
| let id = UInt32.Parse(chunks.[0]) | |
| let node = getById id | |
| node.final <- Boolean.Parse(chunks.[1]) | |
| if node.id > lastId then lastId <- node.id | |
| for i in 2..2..(chunks.Length - 1) do | |
| let childId = UInt32.Parse(chunks.[i + 1]) | |
| let child = getById childId | |
| child.inEdges <- child.inEdges + 1u | |
| node.edges.Add(parse chunks.[i], child) | |
| let key = key node | |
| nodes.Add(key, node) | |
| if node.id = 0u then rootkey <- key | |
| let root = nodes.[rootkey] | |
| nodes.Remove rootkey |> ignore | |
| { Dawg.register = nodes | |
| root = root | |
| lastId = lastId } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment