Skip to content

Instantly share code, notes, and snippets.

@hkoba
Last active August 29, 2015 14:01
Show Gist options
  • Save hkoba/f4acfa2d73b192f5e441 to your computer and use it in GitHub Desktop.
Save hkoba/f4acfa2d73b192f5e441 to your computer and use it in GitHub Desktop.
Paragraph reading example in OCaml. (Not enough tested)
/_build
*.native
true:package(core),thread,annot,debugging
true:warn_A, warn_e
(* -*- coding: utf-8 -*- *)
open Core.Std
open Optutil
type t = {
ic: in_channel;
readbuf: string;
mutable pos: int;
mutable finish: int;
mutable eof: bool;
debug: int;
}
let create ?(readsize=256) ?(debug=0) ic =
{ic; debug;
readbuf = String.create readsize;
pos = 0; finish = 0; eof = false}
let is_debug t = t.debug > 0
let debuglog ~msg ?pos ?fin t =
if t.debug >= 1 then begin
let pos = pos // t.pos in
let fin = fin // t.finish in
(match t.debug with
| 1 -> Printf.printf "[%s] pos=%d fin=%d\n" msg pos fin
| _ -> Printf.printf "[%s] pos=%d fin=%d [%s]\n"
msg pos fin (String.sub t.readbuf ~pos
~len:(pos - fin)));
flush stdout
end
let eof t = t.eof
let rewind t =
debuglog ~msg:"rewinding" t;
t.pos <- 0;
t.finish <- 0
let pump t =
debuglog ~msg:"begin pump" t;
if t.finish - t.pos <= 0 && t.pos > 0 then
rewind t;
let len = String.length t.readbuf - t.finish in
let got = In_channel.input t.ic ~buf:t.readbuf ~pos:t.finish ~len in
t.finish <- got + t.finish;
if got = 0 then
t.eof <- true;
debuglog ~msg:"end pump" t;
got
let set_pos ~pos t =
assert(pos <= t.finish); (* pos can be equal to finish. *)
t.pos <- pos
let advance_if ~opt t =
match opt with
| None ->
debuglog ~msg:"not found" t;
false
| Some found ->
set_pos ~pos:found t;
debuglog ~msg:"found and advanced" t;
true
let test_pos ~test ~pos t =
pos < t.finish && test t.readbuf.[pos]
let test_pumped t =
if t.pos < t.finish then
true
else if t.eof then
false
else
(pump t > 0)
let ensure_pumped t =
ignore (test_pumped t)
let shift ?pos t =
debuglog ~msg:"begin shift" t;
let pos = pos // t.pos in
let len = t.finish - pos in
if len > 0 then
String.blit ~src:t.readbuf ~src_pos:pos ~dst:t.readbuf ~dst_pos:0 ~len;
t.pos <- 0;
t.finish <- len;
debuglog ~msg:"end shift" t
let flush ?(fin) ?(set_pos) ~outbuf t =
debuglog ~msg:"begin flush with fin" ?fin t;
let finish = fin // t.finish in
let set_pos = set_pos // finish in
Buffer.add_substring outbuf t.readbuf t.pos (finish - t.pos);
t.pos <- set_pos;
debuglog ~msg:"end flush" t
let to_range t =
(* This automatically pump if needed. *)
if not (test_pumped t) then
None
else
Some(t.readbuf, t.pos, t.finish)
let to_sub t =
match to_range t with
| None -> None
| Some(str, pos, fin) -> Some(str, pos, fin - pos)
let contents t =
match to_sub t with
| None -> None
| Some(str, pos, len) ->
Some(String.sub ~pos ~len str)
(* -*- coding: utf-8 -*- *)
(** Perl5's defined-or operator *)
let (//) opt def = match opt with Some x -> x | None -> def
let (|//) f g x =
match f x with
| Some v -> Some v
| None -> g x
let (&&>>) x f =
match x with
| Some v -> f v
| None -> None
let call_optionally optf =
match optf with
| Some f -> f ()
| None -> ()
let apply_if opt fn v =
match opt with
| None -> v
| Some bound -> fn bound v
let cap_if opt v =
apply_if opt min v
let assigned rvar opt =
match opt with
| None -> false
| Some bound -> (rvar := bound; true)
#!/usr/bin/env perl
# -*- coding: utf-8 -*-
use strict;
use warnings FATAL => qw/all/;
local $/ = "";
1 while <>;
print $., "\n";
#!/usr/bin/env perl
# -*- coding: utf-8 -*-
use strict;
use warnings FATAL => qw/all/;
my ($nPars, $nParChars, $nSepChars) = @ARGV;
print "A" x ($nParChars // 1), "\n" x ($nSepChars // 2) for 1 .. ($nPars // 10);
#!/usr/bin/env perl
# -*- coding: utf-8 -*-
use strict;
use warnings FATAL => qw/all/;
my ($nPars, $nLineChars, $nLines, $nSepChars) = @ARGV;
for (1 .. ($nPars // 10)) {
print "A" x ($nLineChars // 1), "\n" for 1 .. ($nLines // 8);
print "\n" x ($nSepChars // 2);
}
(* -*- coding: utf-8 -*- *)
open Core.Std
module M = ParagraphReader
let () =
if Array.length Sys.argv < 2 then
Printf.printf "Usage: %s FILE\n" Sys.argv.(0)
else
let ic = In_channel.create Sys.argv.(1) in
let reader = M.create ~debug:0 ~readsize:4096 ic in
let npars = ref 0 in
Gc.full_major ();
while Option.is_some (M.read reader) do
incr npars
done;
Printf.printf "%d\n" !npars
(* -*- coding: utf-8 -*- *)
open Core.Std
module ParaReader = struct
let rec iter_until ~f ic =
match In_channel.input_line ic with
| None -> None
| Some line ->
match f line with
| None -> iter_until ~f ic
| Some v -> Some v
let nonempty_line line =
if line = "" then None else Some line
let read ic =
match iter_until ~f:nonempty_line ic with
| None -> None
| Some line ->
let rec loop ic res =
match In_channel.input_line ic with
| None -> Some (String.concat ~sep:"\n" (List.rev res))
| Some line ->
if line = "" then
Some (String.concat ~sep:"\n" (List.rev res))
else
loop ic (line :: res)
in
loop ic [line]
end
module M = ParaReader
let () =
if Array.length Sys.argv < 2 then
Printf.printf "Usage: %s FILE\n" Sys.argv.(0)
else
let ic = In_channel.create Sys.argv.(1) in
let npars = ref 0 in
while Option.is_some (M.read ic) do
incr npars
done;
Printf.printf "%d\n" !npars
(* -*- coding: utf-8 -*- *)
open Core.Std
open Optutil
(*
Paragraph here is text section delimited by \n\n.
- [^\n]
- \n (?! [^\n])
- \n \n
Txt: [^\n] region known not have \n\n
Nl : \n known position of \n
Unk: unknown region. next search target
Gbg: garbage after finish.
Each reading loop tries to find '\n' from Unk region.
if '\n' found, it represents next Nl' region
- Unk starts from pos+1
- Gbg starts from finish
*)
type t = In_buffer.t * Buffer.t
let create ?(readsize=256) ?(debug) ic =
(In_buffer.create ~readsize ?debug ic, Buffer.create readsize)
let readbuf_contents (readbuf, _) =
In_buffer.contents readbuf
let outbuf_contents (_, outbuf) =
Buffer.contents outbuf
let outbuf_length (_, outbuf) =
Buffer.length outbuf
let emit_outbuf (_, outbuf) =
let out = Buffer.contents outbuf in
Buffer.clear outbuf;
out
let lfindi ?(pos=0) ?fin str ~f =
let fin = fin // (String.length str) in
let rec loop str pos fin =
if pos >= fin then
None
else if f str.[pos] then
Some pos
else
loop str (pos+1) fin
in
loop str pos fin
let find_non_newline (readbuf, _) =
let rec lfindi_non_newline pos fin str =
if pos >= fin then
None
else if str.[pos] <> '\n' then
Some pos
else
lfindi_non_newline (pos+1) fin str
in
let rec loop readbuf =
match In_buffer.to_range readbuf with
| None -> false
| Some (str, pos, fin) -> (
let nlpos = lfindi_non_newline pos fin str in
if In_buffer.advance_if ~opt:nlpos readbuf then
true
else if In_buffer.eof readbuf then
false
else
(In_buffer.set_pos ~pos:fin readbuf;
loop readbuf)
)
in
loop readbuf
let find_end_of_paragraph ?(pos=0) ~fin str =
let rec lfindi_newline pos fin str =
if pos >= fin then
None
else if str.[pos] = '\n' then
Some pos
else
lfindi_newline (pos+1) fin str
in
let rec loop pos fin str =
match lfindi_newline pos fin str with
| None -> None
| Some nlpos ->
(if nlpos+1 < fin && str.[nlpos+1] = '\n' then
Some nlpos
else
loop (nlpos+1) fin str
)
in
loop pos fin str
let read t =
let rec loop ((readbuf, outbuf) as t) =
(* Since find_non_newline is called before here,
we can assume we have Some(non newline) or None.
*)
match In_buffer.to_range readbuf with
| None ->
outbuf_length t > 0
| Some (str, pos, fin) ->
match find_end_of_paragraph ~pos ~fin str with
| None -> (
In_buffer.flush ~outbuf readbuf;
loop t
)
| Some eop -> (
In_buffer.flush ~outbuf ~fin:eop ~set_pos:(eop+2) readbuf;
true
)
in
if not (find_non_newline t) then
None
else if (loop t) then
Some (emit_outbuf t)
else
None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment