Created
December 3, 2020 13:21
-
-
Save Drup/05bc9d0d5c92593ba2f16fbe3d5fa782 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
type atom = Char.t | |
let compare_atom = Char.compare | |
module AMap = CCMap.Make(Char) | |
(** Regular expressions *) | |
module Re = struct | |
module rec Internal : sig | |
type t = | |
| Epsilon | |
| Atom of atom | |
| Concat of t list | |
| Alt of Set.t | |
| Inter of Set.t | |
| Rep of int * int option * t | |
include CCSet.OrderedType with type t := t | |
end = struct | |
type t = | |
| Epsilon | |
| Atom of atom | |
| Concat of t list | |
| Alt of Set.t | |
| Inter of Set.t | |
| Rep of int * int option * t | |
[@@deriving ord] | |
end | |
and Set : CCSet.S with type elt = Internal.t = CCSet.Make (Internal) | |
include Internal | |
module Map = Map.Make(Internal) | |
let equal x y = compare x y = 0 | |
let epsilon = Epsilon | |
let void = Alt Set.empty | |
let atom s = Atom s | |
let string s = Concat (List.map atom @@ CCString.to_list s) | |
let char c = atom c | |
let charset cs = Alt (Set.of_list @@ List.map char cs) | |
let enumerate c1 c2 = | |
if c1 > c2 then None | |
else | |
let rec aux i m = | |
if i > m then [] | |
else Char.chr i :: aux (i+1) m | |
in | |
Some (aux (Char.code c1) (Char.code c2)) | |
let any = | |
charset @@ CCOpt.get_exn @@ enumerate (Char.chr 0) (Char.chr 255) | |
let concat l = | |
let rec aux = function | |
| [] -> [] | |
| Concat l :: l' -> aux (l @ l') | |
| Epsilon :: l -> aux l | |
(* | Atom s :: Atom s' :: l -> | |
* aux (atom (s ^ s') :: l) *) | |
| Alt l :: _ when Set.is_empty l -> [void] | |
| x :: l -> x :: aux l | |
in | |
match aux l with | |
| [] -> Epsilon | |
| [x] -> x | |
| l -> Concat l | |
let alt l = | |
let rec aux acc = function | |
| [] -> acc | |
| Alt l :: rest -> aux (Set.union l acc) rest | |
| x :: l -> aux (Set.add x acc) l | |
in | |
let s = aux Set.empty l in | |
if Set.cardinal s = 1 then | |
Set.choose s | |
else | |
Alt s | |
let inter l = | |
let rec aux acc = function | |
| [] -> acc | |
| Alt l :: _ when Set.is_empty l -> Set.empty | |
| x :: l -> aux (Set.add x acc) l | |
in | |
let s = aux Set.empty l in | |
let size = Set.cardinal s in | |
if size = 0 then | |
void | |
else if size = 1 then | |
Set.choose s | |
else | |
Inter s | |
let rec rep i j x = match i, j, x with | |
| 0, Some 0, _ -> Epsilon | |
| 1, Some 1, x -> x | |
| _, _, Epsilon -> epsilon | |
| _, _, Alt l when Set.is_empty l -> epsilon | |
| _, _, Rep (i', None, x) | |
| _, None, Rep (i', Some _, x) -> rep (i * i') None x | |
| _, Some j, Rep (i', Some j', x) -> rep (i * i') (Some (j * j')) x | |
| i, j, x -> Rep (i, j, x) | |
let star x = rep 0 None x | |
let plus x = rep 1 None x | |
let opt x = rep 0 (Some 1) x | |
module Infix = struct | |
let ( ||| ) x y = alt [ x ; y ] | |
let ( &&& ) x y = inter [ x ; y ] | |
let ( -.- ) x y = concat [ x ; y ] | |
end | |
end | |
open Re.Infix | |
(** Posix parser, borrowed from Re *) | |
module Posix = struct | |
exception Parse_error | |
exception Not_supported | |
let parse s = | |
let i = ref 0 in | |
let l = String.length s in | |
let eos () = !i = l in | |
let test c = not (eos ()) && s.[!i] = c in | |
let accept c = let r = test c in if r then incr i; r in | |
let get () = let r = s.[!i] in incr i; r in | |
let unget () = decr i in | |
let rec regexp () = regexp' (branch ()) | |
and regexp' left = | |
if accept '|' then regexp' (left ||| (branch ())) | |
else if accept '&' then regexp' (left &&& branch ()) | |
else left | |
and branch () = branch' [] | |
and branch' left = | |
if eos () || test '|' || test '&' || test ')' then Re.concat (List.rev left) | |
else branch' (piece () :: left) | |
and piece () = | |
let r = atom () in | |
if accept '*' then Re.star r else | |
if accept '+' then Re.plus r else | |
if accept '?' then Re.opt r else | |
if accept '{' then | |
match integer () with | |
Some i -> | |
let j = if accept ',' then integer () else Some i in | |
if not (accept '}') then raise Parse_error; | |
begin match j with | |
Some j when j < i -> raise Parse_error | _ -> () | |
end; | |
Re.rep i j r | |
| None -> | |
unget (); r | |
else | |
r | |
and atom () = | |
if accept '.' then begin | |
Re.any | |
end else if accept '(' then begin | |
let r = regexp () in | |
if not (accept ')') then raise Parse_error; | |
r | |
end else | |
if accept '^' then begin | |
raise Not_supported | |
(* if newline then Re.bol else Re.bos *) | |
end else if accept '$' then begin | |
raise Not_supported | |
(* if newline then Re.eol else Re.eos *) | |
end else if accept '[' then begin | |
if accept '^' then | |
raise Not_supported | |
(* Re.diff (Re.compl (bracket [])) (Re.char '\n') *) | |
else | |
Re.charset (bracket []) | |
end else | |
if accept '\\' then begin | |
if eos () then raise Parse_error; | |
match get () with | |
'|' | '&' | '(' | ')' | '*' | '+' | '?' | |
| '[' | '.' | '^' | '$' | '{' | '\\' as c -> Re.char c | |
| _ -> raise Parse_error | |
end else begin | |
if eos () then raise Parse_error; | |
match get () with | |
'*' | '+' | '?' | '{' | '\\' -> raise Parse_error | |
| c -> Re.char c | |
end | |
and integer () = | |
if eos () then None else | |
match get () with | |
'0'..'9' as d -> integer' (Char.code d - Char.code '0') | |
| _ -> unget (); None | |
and integer' i = | |
if eos () then Some i else | |
match get () with | |
'0'..'9' as d -> | |
let i' = 10 * i + (Char.code d - Char.code '0') in | |
if i' < i then raise Parse_error; | |
integer' i' | |
| _ -> | |
unget (); Some i | |
and bracket s = | |
if s <> [] && accept ']' then s else begin | |
let c = char () in | |
if accept '-' then begin | |
if accept ']' then c :: '-' :: s else begin | |
let c' = char () in | |
match Re.enumerate c c' with | |
| None -> raise Parse_error | |
| Some l -> bracket (l @ s) | |
end | |
end else | |
bracket (c :: s) | |
end | |
and char () = | |
if eos () then raise Parse_error; | |
let c = get () in | |
if c = '[' then begin | |
if accept '=' then raise Not_supported | |
else if accept ':' then begin | |
raise Not_supported (*XXX*) | |
end else if accept '.' then begin | |
if eos () then raise Parse_error; | |
let c = get () in | |
if not (accept '.') then raise Not_supported; | |
if not (accept ']') then raise Parse_error; | |
c | |
end else | |
c | |
end else | |
c | |
in | |
let res = regexp () in | |
if not (eos ()) then raise Parse_error; | |
res | |
end | |
(** Derivatives *) | |
type deriv = Re.t AMap.t | |
let rec has_epsilon = function | |
| Re.Epsilon -> true | |
| Atom _ -> false | |
| Concat el -> | |
List.for_all has_epsilon el | |
| Alt el -> | |
Re.Set.exists has_epsilon el | |
| Rep (0, _, _) -> true | |
| Rep (_, _, _) -> false | |
| Inter el -> | |
Re.Set.for_all has_epsilon el | |
let suffix (l : deriv) re : deriv = | |
let f re_c = Re.concat [re_c; re] in | |
AMap.map f l | |
let union : deriv -> deriv -> deriv = | |
AMap.union | |
(fun _c re1 re2 -> Some (re1 ||| re2)) | |
let inter : deriv -> deriv -> deriv = | |
AMap.merge @@ fun _c re1 re2 -> match re1, re2 with | |
| Some re1, Some re2 -> Some (re1 &&& re2) | |
| _, _ -> None | |
let rec heads = function | |
| Re.Epsilon -> AMap.empty | |
| Atom a -> AMap.singleton a Re.epsilon | |
| Concat el -> | |
let rec aux = function | |
| [] -> AMap.empty | |
| e :: t -> | |
let h = suffix (heads e) (Re.concat t) in | |
if has_epsilon e | |
then union h (aux t) | |
else h | |
in | |
aux el | |
| Alt el -> | |
Re.Set.fold (fun e s -> union s (heads e)) el AMap.empty | |
| Rep (i, None, e) -> | |
suffix (heads e) (Re.rep (max 0 (i-1)) None e) | |
| Rep (i, Some j, e) -> | |
suffix (heads e) (Re.rep (max 0 (i-1)) (Some (max 0 (j-1))) e) | |
| Inter el -> | |
Re.Set.fold (fun e s -> inter s (heads e)) el AMap.empty | |
module State : sig | |
type t | |
val pp : t Fmt.t | |
val gen : unit -> t | |
val compare : t -> t -> int | |
val id : t -> int | |
end = struct | |
type t = int | |
let pp fmt x = Fmt.pf fmt "'%i" x | |
let gen = | |
let r = ref 0 in | |
fun () -> | |
incr r ; | |
!r | |
let compare = CCInt.compare | |
let id x = x | |
end | |
module StateMap = CCMap.Make(State) | |
module StateSet = CCSet.Make(State) | |
type automaton = { | |
states : State.t Re.Map.t ; | |
initial : State.t ; | |
transitions : transition StateMap.t ; | |
final : StateSet.t ; | |
} | |
and transition = State.t AMap.t | |
let add_state a re = | |
let v = State.gen () in | |
let a = {a with states = Re.Map.add re v a.states } in | |
a, v | |
let has_state a re = Re.Map.find_opt re a.states | |
let add_transition a st (c,st') = | |
let transitions = | |
StateMap.update st (fun x -> | |
let t = CCOpt.get_or x ~default:AMap.empty in | |
Some (AMap.add c st' t)) | |
a.transitions | |
in | |
{ a with transitions } | |
let rec goto st0 c re a = | |
match has_state a re with | |
| Some st -> | |
add_transition a st0 (c, st) | |
| None -> | |
let a, st = add_state a re in | |
let a = add_transition a st0 (c, st) in | |
explore a (st, re) | |
and explore a (st, re) = | |
let l = heads re in | |
let a = AMap.fold (goto st) l a in | |
a | |
let make re = | |
let st = State.gen () in | |
let a = | |
let initial = st in | |
let states = Re.Map.singleton re initial in | |
let transitions = StateMap.empty in | |
let final = StateSet.empty in | |
{ states ; initial ; transitions ; final } | |
in | |
let a = explore a (st, re) in | |
let final = | |
Re.Map.fold | |
(fun re st set -> if has_epsilon re then StateSet.add st set else set) | |
a.states | |
StateSet.empty | |
in | |
{ a with final } | |
let pp_automaton ppf { initial; states; transitions; final; _ } = | |
let pp_trans ppf code next = | |
Fmt.pf ppf "%c → %a@," code State.pp next | |
in | |
let pp_transtbl ppf st = | |
let transtbl = StateMap.get st transitions in | |
Fmt.pf ppf "@[<v2>%a:%s %s@ " | |
State.pp st | |
(if st = initial then " start" else "") | |
(if StateSet.mem st final then "end" else "") | |
; | |
CCOpt.iter (AMap.iter (pp_trans ppf)) transtbl; | |
Fmt.pf ppf "@]@," | |
in | |
let pp fmt a = | |
Re.Map.iter (fun _ i -> pp_transtbl fmt i) a | |
in | |
Fmt.pf ppf "@ @[<v>%a@]" pp states | |
[@@ocaml.toplevel_printer] | |
type transducer = state array | |
and state = action array | |
and action = { | |
next : int ; | |
emit : Bitv.t option; | |
} | |
(* Sequence of gray code of size n | |
Should be upstreamed to bitv eventually | |
*) | |
let gray l = | |
let first_set v n = | |
let rec lookup i = | |
if i = n then raise Not_found ; | |
if Bitv.unsafe_get v i then i else lookup (i + 1) | |
in | |
lookup 0 | |
in | |
let gray n = | |
let bv = Bitv.create n false in | |
let rec iter () = | |
Seq.Cons ( | |
Bitv.copy bv, | |
fun () -> | |
Bitv.unsafe_set bv 0 (not (Bitv.unsafe_get bv 0)); | |
Seq.Cons (Bitv.copy bv, fun () -> | |
let pos = succ (first_set bv n) in | |
if pos < n then begin | |
Bitv.unsafe_set bv pos (not (Bitv.unsafe_get bv pos)); | |
iter () | |
end | |
else | |
Nil | |
)) | |
in | |
if n > 0 then iter else OSeq.empty | |
in | |
gray l | |
let bitcodes nb = | |
let l = int_of_float @@ ceil (log (float nb) /. log 2.) in | |
gray l | |
let to_compress { states; initial; transitions; _ } : transducer = | |
let init_id = State.id initial in | |
let nb_state = Re.Map.cardinal states in | |
let to_id s = (State.id s - init_id + nb_state) mod nb_state in | |
let td = Array.make nb_state [||] in | |
let handle_state st0 = | |
let id0 = to_id st0 in | |
let a0 = Array.make 256 { next = 0 ; emit = None } in | |
begin | |
match StateMap.get st0 transitions with | |
| None -> () | |
| Some trans when AMap.is_empty trans -> () | |
| Some trans when AMap.cardinal trans = 1 -> | |
let c, st = AMap.choose trans in | |
let id = to_id st in | |
let charcode = Char.code c in | |
a0.(charcode) <- { next = id ; emit = None } | |
| Some trans -> | |
let n = AMap.cardinal trans in | |
let bitcs = bitcodes n in | |
let trans_seq = AMap.to_seq trans in | |
OSeq.iter2 (fun bitc (c, st) -> | |
let id = to_id st in | |
let charcode = Char.code c in | |
a0.(charcode) <- { next = id ; emit = Some bitc } | |
) bitcs trans_seq | |
end; | |
td.(id0) <- a0 | |
in | |
Re.Map.iter (fun _ st -> handle_state st) states; | |
td | |
let compile st = make st |> to_compress | |
let pp_transducer ppf (a : transducer) = | |
let pp_trans ppf code { next; emit } = | |
if next = 0 && emit = None then () | |
else | |
Fmt.pf ppf "%c → %i%a@," (Char.chr code) next | |
Fmt.(option (fun fmt -> pf fmt " / %a" Bitv.L.print)) emit | |
in | |
let pp_transtbl ppf i transtbl = | |
Fmt.pf ppf "@[<v2>%i:@ " i; | |
Array.iteri (pp_trans ppf) transtbl; | |
Fmt.pf ppf "@]@," | |
in | |
let pp fmt a = | |
Array.iteri (fun i x -> pp_transtbl fmt i x) a | |
in | |
Fmt.pf ppf "@[<v>%a@]" pp a | |
[@@ocaml.toplevel_printer] | |
let compress (td : transducer) = | |
let buf = ref (Bitv.create 0 false) in | |
let st = ref 0 in | |
let add_buf bitc = | |
buf := Bitv.append !buf bitc | |
in | |
let on_char c = | |
let code = Char.code c in | |
let { next ; emit } = td.(!st).(code) in | |
begin match emit with | |
| None -> () | |
| Some bitc -> add_buf bitc | |
end; | |
st := next; | |
in | |
(fun s -> Iter.iter on_char s; !buf) | |
let uri_safe_alphabet = | |
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" | |
let encode alphabet bitc = | |
let (%) b k = if b then 0 else Int.shift_left 1 k in | |
let buf = Buffer.create 4 in | |
let rec aux = function | |
| c1 :: c2 :: c3 :: c4 :: c5 :: c6 :: rest -> | |
let i = c1%5 + c2%4 + c3%3 + c4%2 + c5%1 + c6%0 in | |
Buffer.add_char buf alphabet.[i]; | |
aux rest | |
| [] -> () | |
| l -> | |
Buffer.add_char buf '='; | |
List.iter (fun b -> Buffer.add_char buf (if b then '1' else '0')) l | |
in | |
aux @@ Bitv.fold_right List.cons bitc []; | |
Buffer.contents buf | |
let () = | |
let re = Sys.argv.(1) in | |
let s = Sys.argv.(2) in | |
let re = Posix.parse re in | |
let a = make re in | |
Fmt.epr "Automaton:@.%a" pp_automaton a; | |
let td = to_compress a in | |
Fmt.epr "Transducer:@.%a@." pp_transducer td; | |
let b = compress td @@ CCString.to_iter s in | |
Fmt.pr "Result:@.%a@." Bitv.L.print b; | |
Fmt.pr "Encoded:@.%s@." (encode uri_safe_alphabet b) | |
(* | |
Copyright (C) 2020 Gabriel Radanne <[email protected]> | |
This library is free software; you can redistribute it and/or | |
modify it under the terms of the GNU Lesser General Public | |
License as published by the Free Software Foundation; either | |
version 2 of the License, or (at your option) any later version. | |
This library is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
Lesser General Public License for more details. | |
You should have received a copy of the GNU Lesser General Public | |
License along with this library; if not, write to the Free Software | |
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
*) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment