Last active
December 25, 2015 14:39
-
-
Save hodzanassredin/6993020 to your computer and use it in GitHub Desktop.
incrDawgImmutable
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| namespace Utils | |
| open System | |
| open System.Collections.Generic | |
| open System.IO | |
| open System.Linq | |
| module IncrementalDawg = | |
| type StateKey = string | |
| type State<'a when 'a : comparison> = | |
| { id : uint32; final : bool; edges : Map<'a, State<'a>>; key : StateKey } | |
| let keygen final (edges:Map<'a, State<'a>>) : StateKey = | |
| let res = new System.Text.StringBuilder() | |
| if final then res.Append("1") |> ignore | |
| else res.Append("0") |> ignore | |
| for (kv) in (edges.OrderBy(fun x -> x.Key)) do | |
| res.AppendFormat("_{0}_{1}", kv.Key.ToString(), kv.Value.id.ToString()) |> ignore | |
| res.ToString() | |
| let emptyState() = { id = 0u; final = false; edges = Map.empty; key = (keygen false Map.empty) } | |
| type Wr<'a> = {r:WeakReference} | |
| let box (state:State<'a>) = {r=new WeakReference(state)} | |
| let unbox<'a when 'a : comparison> (wr:Wr<'a>) : option<State<'a>> = if wr.r.IsAlive then Some(wr.r.Target :?> State<'a>) else None | |
| type Dawg<'a when 'a : comparison> = | |
| { register : Dictionary<StateKey, Wr<'a>>; root : State<'a>; lastId : uint32 } | |
| let getLiveStates<'a when 'a : comparison> (register:Dictionary<StateKey, Wr<'a>>) = | |
| register | |
| |> Seq.map (fun kv-> (unbox kv.Value)) | |
| |> Seq.choose (fun v -> v) | |
| let rec reversePrefix (node : State<'a>) (word : list<'a>) (acc : list<option<'a> * State<'a>>) = | |
| match word with | |
| | nextLetter :: suffix when node.edges.ContainsKey nextLetter -> | |
| reversePrefix node.edges.[nextLetter] suffix ((Some(nextLetter), node) :: acc) | |
| | nextLetter :: suffix -> ((Some(nextLetter), node) :: acc, word |> List.rev) | |
| | [] -> ((None, node) :: acc, []) | |
| let rec private getNode state word = | |
| match word with | |
| | letter :: suffix when state.edges.ContainsKey letter-> getNode state.edges.[letter] suffix | |
| | letter :: suffix -> None | |
| | _ -> Some(state) | |
| let private getMinimized dawg state = | |
| let k = state.key | |
| if k |> dawg.register.ContainsKey then | |
| match unbox dawg.register.[k] with | |
| | Some(state) -> state | |
| | None -> dawg.register.Remove(k) |> ignore | |
| dawg.register.Add(k, box state) | |
| state | |
| else | |
| dawg.register.Add(k, box state) | |
| state | |
| let private clone dawg id node childNletter = | |
| let e, final = match childNletter with | |
| |Some(letter, child) -> node.edges.Add(letter, child), node.final | |
| |None -> node.edges, true | |
| let newNode = { id = id + 1u; final = final; edges = e; key = keygen final e} | |
| let min = getMinimized dawg newNode | |
| min, if min.id = newNode.id then newNode.id else id | |
| let empty() = { register = new Dictionary<_, _>(); root = emptyState(); lastId = 0u } | |
| let private createSuffixStates dawg (reverseSuffix:list<'a>) lastId = | |
| let rec createSuffixStatesin dawg (reverseSuffix:list<'a>) lastId (child:option<State<'a>>) = | |
| match (reverseSuffix,child) with | |
| | (letter :: prefix, _) -> | |
| let child = match child with | |
| |Some(c) -> (Some(letter,c)) | |
| |None -> None | |
| let node, lastId = clone dawg lastId (emptyState()) child | |
| createSuffixStatesin dawg prefix lastId (Some(node)) | |
| | ([], child) -> child, lastId | |
| | _ -> failwith "unreachable code" | |
| createSuffixStatesin dawg reverseSuffix lastId None | |
| let rec clonenodes dawg revstates (child:option<State<'a>>) lastId = | |
| match (revstates, child) with | |
| | ((Some(letter),state) :: prefix, Some(c)) -> | |
| let currentState, lastId = clone dawg lastId state (Some(letter, c)) | |
| clonenodes dawg prefix (Some(currentState)) lastId | |
| | ((None, state) :: [], None) when state.final -> state, lastId | |
| | ((None, state) :: [], None) -> clone dawg lastId state None | |
| | ([], Some(c)) -> c, lastId | |
| | _ -> failwith "unreachable code" | |
| let add (word:list<'a>) (dawg:Dawg<'a>) : Dawg<'a> = | |
| let revPrefixStates, revSuffix = reversePrefix dawg.root word [] | |
| if revSuffix.Length = 0 && (revPrefixStates.Head |> snd).final then dawg | |
| else let new_state, lastId = createSuffixStates dawg revSuffix dawg.lastId | |
| let prev_state, lastId = clonenodes dawg revPrefixStates new_state lastId | |
| {dawg with root = prev_state; lastId = lastId} | |
| let compact dawg = | |
| GC.Collect() | |
| let garbage = dawg.register.Keys |> Seq.where (fun key -> not dawg.register.[key].r.IsAlive) |> List.ofSeq | |
| for key in garbage do | |
| dawg.register.Remove key |> ignore | |
| let compactIdMap dawg = | |
| let lastId = ref 0u | |
| let idmap = new Dictionary<_,_>() | |
| let ids = dawg.register | |
| |> getLiveStates | |
| |> Seq.map (fun v -> lastId := !lastId + 1u | |
| (v.id, !lastId)) | |
| for (id,compactId) in ids do | |
| idmap.Add(id, compactId) | |
| idmap | |
| let nodeCount dawg = dawg.register |> Seq.length |> (+) 1 | |
| let edgeCount dawg = | |
| dawg.register | |
| |> getLiveStates | |
| |> Seq.map (fun node -> node.edges |> Seq.length) | |
| |> Seq.sum | |
| |> (+) dawg.root.edges.Count | |
| let getPaths dawg word = | |
| let rec getPathsin node : seq<seq<'a>> = | |
| node.edges | |
| |> Seq.map (fun kv -> (kv.Key, getPathsin kv.Value)) | |
| |> Seq.collect (fun (head, tails) -> | |
| seq { | |
| if node.final then yield seq { yield head } | |
| yield! tails |> Seq.map (fun path -> | |
| seq { | |
| yield head | |
| yield! path | |
| }) | |
| }) | |
| match getNode dawg.root word with | |
| | Some(node) -> | |
| seq { | |
| if node.final then yield (word |> Seq.ofList) | |
| yield! getPathsin node |> Seq.map (fun path -> | |
| seq { | |
| yield! word | |
| yield! path | |
| }) | |
| } | |
| | None -> Seq.empty | |
| let contains dawg word = | |
| match getNode dawg.root word with | |
| | Some(node) -> node.final | |
| | None -> false | |
| let save dawg filename = | |
| let idMap = compactIdMap dawg | |
| use wr = new StreamWriter(filename, false) | |
| for node in getLiveStates dawg.register do | |
| if node.edges.Count = 0 then sprintf "%d\t%b" idMap.[node.id] node.final |> wr.WriteLine | |
| else | |
| let nodes = | |
| node.edges | |
| |> Seq.map (fun kv -> sprintf "%s\t%d" (kv.Key.ToString()) idMap.[kv.Value.id]) | |
| |> String.concat "\t" | |
| sprintf "%d\t%b\t%s" idMap.[node.id] node.final nodes |> wr.WriteLine | |
| // let nodes = | |
| // dawg.root.edges | |
| // |> Seq.map (fun kv -> sprintf "%s\t%d" (kv.Key.ToString()) idMap.[kv.Value.id]) | |
| // |> String.concat "\t" | |
| // sprintf "%d\t%b\t%s" dawg.root.id dawg.root.final nodes |> wr.WriteLine | |
| let load (name : string) parse = | |
| let nodes = new Dictionary<_, _>() | |
| let idMap = new Dictionary<_, _>() | |
| use r = new StreamReader(name) | |
| let mutable lastId = 0u | |
| let mutable lastNode = emptyState() | |
| while not r.EndOfStream do | |
| let line = r.ReadLine() | |
| if line <> "" then | |
| let chunks = line.Split('\t') | |
| let id = UInt32.Parse(chunks.[0]) | |
| if id > lastId then lastId <- id | |
| let final = Boolean.Parse(chunks.[1]) | |
| let edges = [2..2..(chunks.Length - 1)] | |
| |> Seq.fold(fun (m:Map<'a, State<'a>>) i -> | |
| let childId = UInt32.Parse(chunks.[i + 1]) | |
| let child = idMap.[childId] | |
| m.Add(parse chunks.[i], child)) Map.empty | |
| let node = {id = id; final = final; edges = edges; key = keygen final edges} | |
| idMap.Add(id, node) | |
| let key = keygen final edges | |
| nodes.Add(node.key, box node) | |
| lastNode <- node | |
| { Dawg.register = nodes; root = lastNode; lastId = lastId } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Learn more about F# at http://fsharp.net | |
| // See the 'F# Tutorial' project for more help. | |
| //lastlines inserted 2000000 | |
| //edgeCount 111 | |
| //nodeCount 13 | |
| //max in edges count 1800001 | |
| //Elapsed secs 13.259920 | |
| open Utils.IncrementalDawg | |
| open System.Diagnostics | |
| open System.Numerics | |
| open System | |
| [<EntryPoint>] | |
| let main argv = | |
| let item (str:string) = str.ToCharArray() |> List.ofArray | |
| let mutable WordCount = 0 | |
| let max = 100 | |
| let words = [0..max] |> List.map (fun x -> x.ToString() |> item) | |
| let stopWatch = Stopwatch.StartNew() | |
| let rec addWords words dawg = | |
| match words with | |
| | word :: words -> add word dawg |> addWords words | |
| | _ -> dawg | |
| let e : Dawg<Char> = empty() | |
| let dawg : Dawg<Char> = e |> add List.empty |> addWords words | |
| // let ec = edgeCount dawg | |
| // let inec = int(inEdgeCount dawg) | |
| // assert (ec = inec) | |
| //if ( WordCount % 100 ) = 0 then printfn "inserted words %d" WordCount | |
| stopWatch.Stop() | |
| compact dawg | |
| max |> printfn "lines inserted %d" | |
| dawg |> edgeCount |> printfn "edgeCount %d" | |
| dawg |> nodeCount |> printfn "nodeCount %d" | |
| printfn "Elapsed secs %f" stopWatch.Elapsed.TotalSeconds | |
| // for (_,word) in words do | |
| // assert (word |> contains dawg) | |
| // assert ("1" |> item |> contains dawg) | |
| // assert ((max+1).ToString() |> item |> contains dawg |> not) | |
| save dawg "test.dawg" | |
| let dawg = load "test.dawg" Char.Parse | |
| for word in words do | |
| assert (word |> contains dawg) | |
| assert ("1" |> item |> contains dawg) | |
| assert ((max+1).ToString() |> item |> contains dawg |> not) | |
| let t = (max /100).ToString() | |
| for path in ( t |> item |> getPaths dawg) do | |
| printfn "%A" path | |
| //4.2 | |
| //8.17 405 | |
| 0 // return an integer exit code |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment