Last active
August 29, 2015 14:01
-
-
Save hkoba/f4acfa2d73b192f5e441 to your computer and use it in GitHub Desktop.
Paragraph reading example in OCaml. (Not enough tested)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/_build | |
*.native |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* -*- coding: utf-8 -*- *) | |
open Core.Std | |
open Optutil | |
type t = { | |
ic: in_channel; | |
readbuf: string; | |
mutable pos: int; | |
mutable finish: int; | |
mutable eof: bool; | |
debug: int; | |
} | |
let create ?(readsize=256) ?(debug=0) ic = | |
{ic; debug; | |
readbuf = String.create readsize; | |
pos = 0; finish = 0; eof = false} | |
let is_debug t = t.debug > 0 | |
let debuglog ~msg ?pos ?fin t = | |
if t.debug >= 1 then begin | |
let pos = pos // t.pos in | |
let fin = fin // t.finish in | |
(match t.debug with | |
| 1 -> Printf.printf "[%s] pos=%d fin=%d\n" msg pos fin | |
| _ -> Printf.printf "[%s] pos=%d fin=%d [%s]\n" | |
msg pos fin (String.sub t.readbuf ~pos | |
~len:(pos - fin))); | |
flush stdout | |
end | |
let eof t = t.eof | |
let rewind t = | |
debuglog ~msg:"rewinding" t; | |
t.pos <- 0; | |
t.finish <- 0 | |
let pump t = | |
debuglog ~msg:"begin pump" t; | |
if t.finish - t.pos <= 0 && t.pos > 0 then | |
rewind t; | |
let len = String.length t.readbuf - t.finish in | |
let got = In_channel.input t.ic ~buf:t.readbuf ~pos:t.finish ~len in | |
t.finish <- got + t.finish; | |
if got = 0 then | |
t.eof <- true; | |
debuglog ~msg:"end pump" t; | |
got | |
let set_pos ~pos t = | |
assert(pos <= t.finish); (* pos can be equal to finish. *) | |
t.pos <- pos | |
let advance_if ~opt t = | |
match opt with | |
| None -> | |
debuglog ~msg:"not found" t; | |
false | |
| Some found -> | |
set_pos ~pos:found t; | |
debuglog ~msg:"found and advanced" t; | |
true | |
let test_pos ~test ~pos t = | |
pos < t.finish && test t.readbuf.[pos] | |
let test_pumped t = | |
if t.pos < t.finish then | |
true | |
else if t.eof then | |
false | |
else | |
(pump t > 0) | |
let ensure_pumped t = | |
ignore (test_pumped t) | |
let shift ?pos t = | |
debuglog ~msg:"begin shift" t; | |
let pos = pos // t.pos in | |
let len = t.finish - pos in | |
if len > 0 then | |
String.blit ~src:t.readbuf ~src_pos:pos ~dst:t.readbuf ~dst_pos:0 ~len; | |
t.pos <- 0; | |
t.finish <- len; | |
debuglog ~msg:"end shift" t | |
let flush ?(fin) ?(set_pos) ~outbuf t = | |
debuglog ~msg:"begin flush with fin" ?fin t; | |
let finish = fin // t.finish in | |
let set_pos = set_pos // finish in | |
Buffer.add_substring outbuf t.readbuf t.pos (finish - t.pos); | |
t.pos <- set_pos; | |
debuglog ~msg:"end flush" t | |
let to_range t = | |
(* This automatically pump if needed. *) | |
if not (test_pumped t) then | |
None | |
else | |
Some(t.readbuf, t.pos, t.finish) | |
let to_sub t = | |
match to_range t with | |
| None -> None | |
| Some(str, pos, fin) -> Some(str, pos, fin - pos) | |
let contents t = | |
match to_sub t with | |
| None -> None | |
| Some(str, pos, len) -> | |
Some(String.sub ~pos ~len str) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* -*- coding: utf-8 -*- *) | |
(** Perl5's defined-or operator *) | |
let (//) opt def = match opt with Some x -> x | None -> def | |
let (|//) f g x = | |
match f x with | |
| Some v -> Some v | |
| None -> g x | |
let (&&>>) x f = | |
match x with | |
| Some v -> f v | |
| None -> None | |
let call_optionally optf = | |
match optf with | |
| Some f -> f () | |
| None -> () | |
let apply_if opt fn v = | |
match opt with | |
| None -> v | |
| Some bound -> fn bound v | |
let cap_if opt v = | |
apply_if opt min v | |
let assigned rvar opt = | |
match opt with | |
| None -> false | |
| Some bound -> (rvar := bound; true) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# -*- coding: utf-8 -*- | |
use strict; | |
use warnings FATAL => qw/all/; | |
local $/ = ""; | |
1 while <>; | |
print $., "\n"; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# -*- coding: utf-8 -*- | |
use strict; | |
use warnings FATAL => qw/all/; | |
my ($nPars, $nParChars, $nSepChars) = @ARGV; | |
print "A" x ($nParChars // 1), "\n" x ($nSepChars // 2) for 1 .. ($nPars // 10); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# -*- coding: utf-8 -*- | |
use strict; | |
use warnings FATAL => qw/all/; | |
my ($nPars, $nLineChars, $nLines, $nSepChars) = @ARGV; | |
for (1 .. ($nPars // 10)) { | |
print "A" x ($nLineChars // 1), "\n" for 1 .. ($nLines // 8); | |
print "\n" x ($nSepChars // 2); | |
} | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* -*- coding: utf-8 -*- *) | |
open Core.Std | |
module M = ParagraphReader | |
let () = | |
if Array.length Sys.argv < 2 then | |
Printf.printf "Usage: %s FILE\n" Sys.argv.(0) | |
else | |
let ic = In_channel.create Sys.argv.(1) in | |
let reader = M.create ~debug:0 ~readsize:4096 ic in | |
let npars = ref 0 in | |
Gc.full_major (); | |
while Option.is_some (M.read reader) do | |
incr npars | |
done; | |
Printf.printf "%d\n" !npars |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* -*- coding: utf-8 -*- *) | |
open Core.Std | |
module ParaReader = struct | |
let rec iter_until ~f ic = | |
match In_channel.input_line ic with | |
| None -> None | |
| Some line -> | |
match f line with | |
| None -> iter_until ~f ic | |
| Some v -> Some v | |
let nonempty_line line = | |
if line = "" then None else Some line | |
let read ic = | |
match iter_until ~f:nonempty_line ic with | |
| None -> None | |
| Some line -> | |
let rec loop ic res = | |
match In_channel.input_line ic with | |
| None -> Some (String.concat ~sep:"\n" (List.rev res)) | |
| Some line -> | |
if line = "" then | |
Some (String.concat ~sep:"\n" (List.rev res)) | |
else | |
loop ic (line :: res) | |
in | |
loop ic [line] | |
end | |
module M = ParaReader | |
let () = | |
if Array.length Sys.argv < 2 then | |
Printf.printf "Usage: %s FILE\n" Sys.argv.(0) | |
else | |
let ic = In_channel.create Sys.argv.(1) in | |
let npars = ref 0 in | |
while Option.is_some (M.read ic) do | |
incr npars | |
done; | |
Printf.printf "%d\n" !npars |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* -*- coding: utf-8 -*- *) | |
open Core.Std | |
open Optutil | |
(* | |
Paragraph here is text section delimited by \n\n. | |
- [^\n] | |
- \n (?! [^\n]) | |
- \n \n | |
Txt: [^\n] region known not have \n\n | |
Nl : \n known position of \n | |
Unk: unknown region. next search target | |
Gbg: garbage after finish. | |
Each reading loop tries to find '\n' from Unk region. | |
if '\n' found, it represents next Nl' region | |
- Unk starts from pos+1 | |
- Gbg starts from finish | |
*) | |
type t = In_buffer.t * Buffer.t | |
let create ?(readsize=256) ?(debug) ic = | |
(In_buffer.create ~readsize ?debug ic, Buffer.create readsize) | |
let readbuf_contents (readbuf, _) = | |
In_buffer.contents readbuf | |
let outbuf_contents (_, outbuf) = | |
Buffer.contents outbuf | |
let outbuf_length (_, outbuf) = | |
Buffer.length outbuf | |
let emit_outbuf (_, outbuf) = | |
let out = Buffer.contents outbuf in | |
Buffer.clear outbuf; | |
out | |
let lfindi ?(pos=0) ?fin str ~f = | |
let fin = fin // (String.length str) in | |
let rec loop str pos fin = | |
if pos >= fin then | |
None | |
else if f str.[pos] then | |
Some pos | |
else | |
loop str (pos+1) fin | |
in | |
loop str pos fin | |
let find_non_newline (readbuf, _) = | |
let rec lfindi_non_newline pos fin str = | |
if pos >= fin then | |
None | |
else if str.[pos] <> '\n' then | |
Some pos | |
else | |
lfindi_non_newline (pos+1) fin str | |
in | |
let rec loop readbuf = | |
match In_buffer.to_range readbuf with | |
| None -> false | |
| Some (str, pos, fin) -> ( | |
let nlpos = lfindi_non_newline pos fin str in | |
if In_buffer.advance_if ~opt:nlpos readbuf then | |
true | |
else if In_buffer.eof readbuf then | |
false | |
else | |
(In_buffer.set_pos ~pos:fin readbuf; | |
loop readbuf) | |
) | |
in | |
loop readbuf | |
let find_end_of_paragraph ?(pos=0) ~fin str = | |
let rec lfindi_newline pos fin str = | |
if pos >= fin then | |
None | |
else if str.[pos] = '\n' then | |
Some pos | |
else | |
lfindi_newline (pos+1) fin str | |
in | |
let rec loop pos fin str = | |
match lfindi_newline pos fin str with | |
| None -> None | |
| Some nlpos -> | |
(if nlpos+1 < fin && str.[nlpos+1] = '\n' then | |
Some nlpos | |
else | |
loop (nlpos+1) fin str | |
) | |
in | |
loop pos fin str | |
let read t = | |
let rec loop ((readbuf, outbuf) as t) = | |
(* Since find_non_newline is called before here, | |
we can assume we have Some(non newline) or None. | |
*) | |
match In_buffer.to_range readbuf with | |
| None -> | |
outbuf_length t > 0 | |
| Some (str, pos, fin) -> | |
match find_end_of_paragraph ~pos ~fin str with | |
| None -> ( | |
In_buffer.flush ~outbuf readbuf; | |
loop t | |
) | |
| Some eop -> ( | |
In_buffer.flush ~outbuf ~fin:eop ~set_pos:(eop+2) readbuf; | |
true | |
) | |
in | |
if not (find_non_newline t) then | |
None | |
else if (loop t) then | |
Some (emit_outbuf t) | |
else | |
None | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment