Skip to content

Instantly share code, notes, and snippets.

@Drup
Created December 3, 2020 13:21
Show Gist options
  • Save Drup/05bc9d0d5c92593ba2f16fbe3d5fa782 to your computer and use it in GitHub Desktop.
Save Drup/05bc9d0d5c92593ba2f16fbe3d5fa782 to your computer and use it in GitHub Desktop.
type atom = Char.t
let compare_atom = Char.compare
module AMap = CCMap.Make(Char)
(** Regular expressions *)
module Re = struct
module rec Internal : sig
type t =
| Epsilon
| Atom of atom
| Concat of t list
| Alt of Set.t
| Inter of Set.t
| Rep of int * int option * t
include CCSet.OrderedType with type t := t
end = struct
type t =
| Epsilon
| Atom of atom
| Concat of t list
| Alt of Set.t
| Inter of Set.t
| Rep of int * int option * t
[@@deriving ord]
end
and Set : CCSet.S with type elt = Internal.t = CCSet.Make (Internal)
include Internal
module Map = Map.Make(Internal)
let equal x y = compare x y = 0
let epsilon = Epsilon
let void = Alt Set.empty
let atom s = Atom s
let string s = Concat (List.map atom @@ CCString.to_list s)
let char c = atom c
let charset cs = Alt (Set.of_list @@ List.map char cs)
let enumerate c1 c2 =
if c1 > c2 then None
else
let rec aux i m =
if i > m then []
else Char.chr i :: aux (i+1) m
in
Some (aux (Char.code c1) (Char.code c2))
let any =
charset @@ CCOpt.get_exn @@ enumerate (Char.chr 0) (Char.chr 255)
let concat l =
let rec aux = function
| [] -> []
| Concat l :: l' -> aux (l @ l')
| Epsilon :: l -> aux l
(* | Atom s :: Atom s' :: l ->
* aux (atom (s ^ s') :: l) *)
| Alt l :: _ when Set.is_empty l -> [void]
| x :: l -> x :: aux l
in
match aux l with
| [] -> Epsilon
| [x] -> x
| l -> Concat l
let alt l =
let rec aux acc = function
| [] -> acc
| Alt l :: rest -> aux (Set.union l acc) rest
| x :: l -> aux (Set.add x acc) l
in
let s = aux Set.empty l in
if Set.cardinal s = 1 then
Set.choose s
else
Alt s
let inter l =
let rec aux acc = function
| [] -> acc
| Alt l :: _ when Set.is_empty l -> Set.empty
| x :: l -> aux (Set.add x acc) l
in
let s = aux Set.empty l in
let size = Set.cardinal s in
if size = 0 then
void
else if size = 1 then
Set.choose s
else
Inter s
let rec rep i j x = match i, j, x with
| 0, Some 0, _ -> Epsilon
| 1, Some 1, x -> x
| _, _, Epsilon -> epsilon
| _, _, Alt l when Set.is_empty l -> epsilon
| _, _, Rep (i', None, x)
| _, None, Rep (i', Some _, x) -> rep (i * i') None x
| _, Some j, Rep (i', Some j', x) -> rep (i * i') (Some (j * j')) x
| i, j, x -> Rep (i, j, x)
let star x = rep 0 None x
let plus x = rep 1 None x
let opt x = rep 0 (Some 1) x
module Infix = struct
let ( ||| ) x y = alt [ x ; y ]
let ( &&& ) x y = inter [ x ; y ]
let ( -.- ) x y = concat [ x ; y ]
end
end
open Re.Infix
(** Posix parser, borrowed from Re *)
module Posix = struct
exception Parse_error
exception Not_supported
let parse s =
let i = ref 0 in
let l = String.length s in
let eos () = !i = l in
let test c = not (eos ()) && s.[!i] = c in
let accept c = let r = test c in if r then incr i; r in
let get () = let r = s.[!i] in incr i; r in
let unget () = decr i in
let rec regexp () = regexp' (branch ())
and regexp' left =
if accept '|' then regexp' (left ||| (branch ()))
else if accept '&' then regexp' (left &&& branch ())
else left
and branch () = branch' []
and branch' left =
if eos () || test '|' || test '&' || test ')' then Re.concat (List.rev left)
else branch' (piece () :: left)
and piece () =
let r = atom () in
if accept '*' then Re.star r else
if accept '+' then Re.plus r else
if accept '?' then Re.opt r else
if accept '{' then
match integer () with
Some i ->
let j = if accept ',' then integer () else Some i in
if not (accept '}') then raise Parse_error;
begin match j with
Some j when j < i -> raise Parse_error | _ -> ()
end;
Re.rep i j r
| None ->
unget (); r
else
r
and atom () =
if accept '.' then begin
Re.any
end else if accept '(' then begin
let r = regexp () in
if not (accept ')') then raise Parse_error;
r
end else
if accept '^' then begin
raise Not_supported
(* if newline then Re.bol else Re.bos *)
end else if accept '$' then begin
raise Not_supported
(* if newline then Re.eol else Re.eos *)
end else if accept '[' then begin
if accept '^' then
raise Not_supported
(* Re.diff (Re.compl (bracket [])) (Re.char '\n') *)
else
Re.charset (bracket [])
end else
if accept '\\' then begin
if eos () then raise Parse_error;
match get () with
'|' | '&' | '(' | ')' | '*' | '+' | '?'
| '[' | '.' | '^' | '$' | '{' | '\\' as c -> Re.char c
| _ -> raise Parse_error
end else begin
if eos () then raise Parse_error;
match get () with
'*' | '+' | '?' | '{' | '\\' -> raise Parse_error
| c -> Re.char c
end
and integer () =
if eos () then None else
match get () with
'0'..'9' as d -> integer' (Char.code d - Char.code '0')
| _ -> unget (); None
and integer' i =
if eos () then Some i else
match get () with
'0'..'9' as d ->
let i' = 10 * i + (Char.code d - Char.code '0') in
if i' < i then raise Parse_error;
integer' i'
| _ ->
unget (); Some i
and bracket s =
if s <> [] && accept ']' then s else begin
let c = char () in
if accept '-' then begin
if accept ']' then c :: '-' :: s else begin
let c' = char () in
match Re.enumerate c c' with
| None -> raise Parse_error
| Some l -> bracket (l @ s)
end
end else
bracket (c :: s)
end
and char () =
if eos () then raise Parse_error;
let c = get () in
if c = '[' then begin
if accept '=' then raise Not_supported
else if accept ':' then begin
raise Not_supported (*XXX*)
end else if accept '.' then begin
if eos () then raise Parse_error;
let c = get () in
if not (accept '.') then raise Not_supported;
if not (accept ']') then raise Parse_error;
c
end else
c
end else
c
in
let res = regexp () in
if not (eos ()) then raise Parse_error;
res
end
(** Derivatives *)
type deriv = Re.t AMap.t
let rec has_epsilon = function
| Re.Epsilon -> true
| Atom _ -> false
| Concat el ->
List.for_all has_epsilon el
| Alt el ->
Re.Set.exists has_epsilon el
| Rep (0, _, _) -> true
| Rep (_, _, _) -> false
| Inter el ->
Re.Set.for_all has_epsilon el
let suffix (l : deriv) re : deriv =
let f re_c = Re.concat [re_c; re] in
AMap.map f l
let union : deriv -> deriv -> deriv =
AMap.union
(fun _c re1 re2 -> Some (re1 ||| re2))
let inter : deriv -> deriv -> deriv =
AMap.merge @@ fun _c re1 re2 -> match re1, re2 with
| Some re1, Some re2 -> Some (re1 &&& re2)
| _, _ -> None
let rec heads = function
| Re.Epsilon -> AMap.empty
| Atom a -> AMap.singleton a Re.epsilon
| Concat el ->
let rec aux = function
| [] -> AMap.empty
| e :: t ->
let h = suffix (heads e) (Re.concat t) in
if has_epsilon e
then union h (aux t)
else h
in
aux el
| Alt el ->
Re.Set.fold (fun e s -> union s (heads e)) el AMap.empty
| Rep (i, None, e) ->
suffix (heads e) (Re.rep (max 0 (i-1)) None e)
| Rep (i, Some j, e) ->
suffix (heads e) (Re.rep (max 0 (i-1)) (Some (max 0 (j-1))) e)
| Inter el ->
Re.Set.fold (fun e s -> inter s (heads e)) el AMap.empty
module State : sig
type t
val pp : t Fmt.t
val gen : unit -> t
val compare : t -> t -> int
val id : t -> int
end = struct
type t = int
let pp fmt x = Fmt.pf fmt "'%i" x
let gen =
let r = ref 0 in
fun () ->
incr r ;
!r
let compare = CCInt.compare
let id x = x
end
module StateMap = CCMap.Make(State)
module StateSet = CCSet.Make(State)
type automaton = {
states : State.t Re.Map.t ;
initial : State.t ;
transitions : transition StateMap.t ;
final : StateSet.t ;
}
and transition = State.t AMap.t
let add_state a re =
let v = State.gen () in
let a = {a with states = Re.Map.add re v a.states } in
a, v
let has_state a re = Re.Map.find_opt re a.states
let add_transition a st (c,st') =
let transitions =
StateMap.update st (fun x ->
let t = CCOpt.get_or x ~default:AMap.empty in
Some (AMap.add c st' t))
a.transitions
in
{ a with transitions }
let rec goto st0 c re a =
match has_state a re with
| Some st ->
add_transition a st0 (c, st)
| None ->
let a, st = add_state a re in
let a = add_transition a st0 (c, st) in
explore a (st, re)
and explore a (st, re) =
let l = heads re in
let a = AMap.fold (goto st) l a in
a
let make re =
let st = State.gen () in
let a =
let initial = st in
let states = Re.Map.singleton re initial in
let transitions = StateMap.empty in
let final = StateSet.empty in
{ states ; initial ; transitions ; final }
in
let a = explore a (st, re) in
let final =
Re.Map.fold
(fun re st set -> if has_epsilon re then StateSet.add st set else set)
a.states
StateSet.empty
in
{ a with final }
let pp_automaton ppf { initial; states; transitions; final; _ } =
let pp_trans ppf code next =
Fmt.pf ppf "%c → %a@," code State.pp next
in
let pp_transtbl ppf st =
let transtbl = StateMap.get st transitions in
Fmt.pf ppf "@[<v2>%a:%s %s@ "
State.pp st
(if st = initial then " start" else "")
(if StateSet.mem st final then "end" else "")
;
CCOpt.iter (AMap.iter (pp_trans ppf)) transtbl;
Fmt.pf ppf "@]@,"
in
let pp fmt a =
Re.Map.iter (fun _ i -> pp_transtbl fmt i) a
in
Fmt.pf ppf "@ @[<v>%a@]" pp states
[@@ocaml.toplevel_printer]
type transducer = state array
and state = action array
and action = {
next : int ;
emit : Bitv.t option;
}
(* Sequence of gray code of size n
Should be upstreamed to bitv eventually
*)
let gray l =
let first_set v n =
let rec lookup i =
if i = n then raise Not_found ;
if Bitv.unsafe_get v i then i else lookup (i + 1)
in
lookup 0
in
let gray n =
let bv = Bitv.create n false in
let rec iter () =
Seq.Cons (
Bitv.copy bv,
fun () ->
Bitv.unsafe_set bv 0 (not (Bitv.unsafe_get bv 0));
Seq.Cons (Bitv.copy bv, fun () ->
let pos = succ (first_set bv n) in
if pos < n then begin
Bitv.unsafe_set bv pos (not (Bitv.unsafe_get bv pos));
iter ()
end
else
Nil
))
in
if n > 0 then iter else OSeq.empty
in
gray l
let bitcodes nb =
let l = int_of_float @@ ceil (log (float nb) /. log 2.) in
gray l
let to_compress { states; initial; transitions; _ } : transducer =
let init_id = State.id initial in
let nb_state = Re.Map.cardinal states in
let to_id s = (State.id s - init_id + nb_state) mod nb_state in
let td = Array.make nb_state [||] in
let handle_state st0 =
let id0 = to_id st0 in
let a0 = Array.make 256 { next = 0 ; emit = None } in
begin
match StateMap.get st0 transitions with
| None -> ()
| Some trans when AMap.is_empty trans -> ()
| Some trans when AMap.cardinal trans = 1 ->
let c, st = AMap.choose trans in
let id = to_id st in
let charcode = Char.code c in
a0.(charcode) <- { next = id ; emit = None }
| Some trans ->
let n = AMap.cardinal trans in
let bitcs = bitcodes n in
let trans_seq = AMap.to_seq trans in
OSeq.iter2 (fun bitc (c, st) ->
let id = to_id st in
let charcode = Char.code c in
a0.(charcode) <- { next = id ; emit = Some bitc }
) bitcs trans_seq
end;
td.(id0) <- a0
in
Re.Map.iter (fun _ st -> handle_state st) states;
td
let compile st = make st |> to_compress
let pp_transducer ppf (a : transducer) =
let pp_trans ppf code { next; emit } =
if next = 0 && emit = None then ()
else
Fmt.pf ppf "%c → %i%a@," (Char.chr code) next
Fmt.(option (fun fmt -> pf fmt " / %a" Bitv.L.print)) emit
in
let pp_transtbl ppf i transtbl =
Fmt.pf ppf "@[<v2>%i:@ " i;
Array.iteri (pp_trans ppf) transtbl;
Fmt.pf ppf "@]@,"
in
let pp fmt a =
Array.iteri (fun i x -> pp_transtbl fmt i x) a
in
Fmt.pf ppf "@[<v>%a@]" pp a
[@@ocaml.toplevel_printer]
let compress (td : transducer) =
let buf = ref (Bitv.create 0 false) in
let st = ref 0 in
let add_buf bitc =
buf := Bitv.append !buf bitc
in
let on_char c =
let code = Char.code c in
let { next ; emit } = td.(!st).(code) in
begin match emit with
| None -> ()
| Some bitc -> add_buf bitc
end;
st := next;
in
(fun s -> Iter.iter on_char s; !buf)
let uri_safe_alphabet =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
let encode alphabet bitc =
let (%) b k = if b then 0 else Int.shift_left 1 k in
let buf = Buffer.create 4 in
let rec aux = function
| c1 :: c2 :: c3 :: c4 :: c5 :: c6 :: rest ->
let i = c1%5 + c2%4 + c3%3 + c4%2 + c5%1 + c6%0 in
Buffer.add_char buf alphabet.[i];
aux rest
| [] -> ()
| l ->
Buffer.add_char buf '=';
List.iter (fun b -> Buffer.add_char buf (if b then '1' else '0')) l
in
aux @@ Bitv.fold_right List.cons bitc [];
Buffer.contents buf
let () =
let re = Sys.argv.(1) in
let s = Sys.argv.(2) in
let re = Posix.parse re in
let a = make re in
Fmt.epr "Automaton:@.%a" pp_automaton a;
let td = to_compress a in
Fmt.epr "Transducer:@.%a@." pp_transducer td;
let b = compress td @@ CCString.to_iter s in
Fmt.pr "Result:@.%a@." Bitv.L.print b;
Fmt.pr "Encoded:@.%s@." (encode uri_safe_alphabet b)
(*
Copyright (C) 2020 Gabriel Radanne <[email protected]>
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment