copy · May 20, 2018 18:23
diff --git a/regexredux.ocaml-4.ml b/regexredux.ocaml-4.ml
 (*
 * The Computer Language Benchmarks Game
 * https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
 *
 * regex-dna program contributed by Paolo Ribeca, August 2011
 *
 * The regexp machinery comes from a previous OCaml version by
 *  Christophe TROESTLER and Mauricio Fernandez
 *
 * converted from regex-dna program
 *)

 let workers = 16

 let variants, variants_to_string =
  let variants = [|
    "agggtaaa\\|tttaccct";
    "[cgt]gggtaaa\\|tttaccc[acg]";
    "a[act]ggtaaa\\|tttacc[agt]t";
    "ag[act]gtaaa\\|tttac[agt]ct";
    "agg[act]taaa\\|ttta[agt]cct";
    "aggg[acg]aaa\\|ttt[cgt]ccct";
    "agggt[cgt]aa\\|tt[acg]accct";
    "agggta[cgt]a\\|t[acg]taccct";
    "agggtaa[cgt]\\|[acg]ttaccct"
  |]
  (* Remove the "\\" which is mandatory in OCaml regex *)
  and re_bs = Str.regexp_string "\\" in
  Array.map Str.regexp_case_fold variants,
  Array.map (Str.global_replace re_bs "") variants

 let iupacs =
  Array.map (fun (re, s) -> Str.regexp re, s) [|
    "tHa[Nt]", "<4>";
    "aND\\|caN\\|Ha[DS]\\|WaS", "<3>";
    "a[NSt]\\|BY", "<2>";
    "<[^>]*>", "\\|";
    "\\|[^|][^|]*\\|", "-"
  |]

 (* Read all of a redirected FASTA format file from stdin *)
 let file_data, file_length =
  let b = Buffer.create 0xffffff and s = Bytes.create 0xfff and r = ref 1 in
  while !r > 0 do
    r := input stdin s 0 0xfff; Buffer.add_subbytes b s 0 !r
  done;
  Buffer.contents b, Buffer.length b

 (* Remove FASTA sequence descriptions and all linefeed characters *)
 let dna = Bytes.unsafe_of_string (Str.global_replace (Str.regexp "^>.*$\\|\n") "" file_data)
 let code_length = Bytes.length dna

 (* Count matches of [re] *)
 let count re s =
  let i = ref 0 and n = ref 0 in
  try
    while true do
      i := 1 + Str.search_forward re (Bytes.unsafe_to_string s) !i;
      incr n
    done;
    assert false
  with Not_found -> !n

 let () =
  let chunk_size = code_length / workers
  and rem = code_length mod workers in
  assert (chunk_size >= 7);
  let w = Array.make workers stdin
  and red_workers = workers - 1 in
  for i = 0 to red_workers do
    let delta = if i > 0 then 7 else 0 in
    let lo = i * chunk_size + min i rem - delta in
    let len = chunk_size + (if i < rem then 1 else 0) + delta
    and input, output = Unix.pipe () in
    match Unix.fork () with
    | 0 ->
      Unix.close input;
      let chunk = Bytes.sub dna lo len
      and output = Unix.out_channel_of_descr output in
      (* First all the regexps... *)
      Array.iter
        (fun re -> output_binary_int output (count re chunk))
        variants;
      (* ...and then all the IUPAC replacements *)
      if i > 0 then
        for j = 0 to 6 do
          chunk.[j] <- ' '
        done;
      let b = ref chunk in
      Array.iter
        (fun (re, s) -> b := Bytes.unsafe_of_string (Str.global_replace re s (Bytes.unsafe_to_string !b)))
        iupacs;
      output_binary_int output (Bytes.length !b - delta);
      exit 0
    | _ ->
      Unix.close output;
      w.(i) <- Unix.in_channel_of_descr input
  done;
  let counts_variants = Array.init (Array.length variants) (fun _ -> ref 0)
  and count_iupac = ref 0 in
  Array.iter
    (fun input ->
      Array.iter
        (fun count -> count := !count + input_binary_int input)
        counts_variants;
      count_iupac := !count_iupac + input_binary_int input)
    w;
  Array.iteri
    (fun i re -> Printf.printf "%s %i\n" re !(counts_variants.(i)))
    variants_to_string;
  Printf.printf "\n%i\n%i\n%i\n%!" file_length code_length !count_iupac
	(*
	* The Computer Language Benchmarks Game
	* https://salsa.debian.org/benchmarksgame-team/benchmarksgame/
	*
	* regex-dna program contributed by Paolo Ribeca, August 2011
	*
	* The regexp machinery comes from a previous OCaml version by
	* Christophe TROESTLER and Mauricio Fernandez
	*
	* converted from regex-dna program
	*)

	let workers = 16

	let variants, variants_to_string =
	let variants = [\|
	"agggtaaa\\\|tttaccct";
	"[cgt]gggtaaa\\\|tttaccc[acg]";
	"a[act]ggtaaa\\\|tttacc[agt]t";
	"ag[act]gtaaa\\\|tttac[agt]ct";
	"agg[act]taaa\\\|ttta[agt]cct";
	"aggg[acg]aaa\\\|ttt[cgt]ccct";
	"agggt[cgt]aa\\\|tt[acg]accct";
	"agggta[cgt]a\\\|t[acg]taccct";
	"agggtaa[cgt]\\\|[acg]ttaccct"
	\|]
	(* Remove the "\\" which is mandatory in OCaml regex *)
	and re_bs = Str.regexp_string "\\" in
	Array.map Str.regexp_case_fold variants,
	Array.map (Str.global_replace re_bs "") variants

	let iupacs =
	Array.map (fun (re, s) -> Str.regexp re, s) [\|
	"tHa[Nt]", "<4>";
	"aND\\\|caN\\\|Ha[DS]\\\|WaS", "<3>";
	"a[NSt]\\\|BY", "<2>";
	"<[^>]*>", "\\\|";
	"\\\|[^\|][^\|]*\\\|", "-"
	\|]

	(* Read all of a redirected FASTA format file from stdin *)
	let file_data, file_length =
	let b = Buffer.create 0xffffff and s = Bytes.create 0xfff and r = ref 1 in
	while !r > 0 do
	r := input stdin s 0 0xfff; Buffer.add_subbytes b s 0 !r
	done;
	Buffer.contents b, Buffer.length b

	(* Remove FASTA sequence descriptions and all linefeed characters *)
	let dna = Bytes.unsafe_of_string (Str.global_replace (Str.regexp "^>.*$\\\|\n") "" file_data)
	let code_length = Bytes.length dna

	(* Count matches of [re] *)
	let count re s =
	let i = ref 0 and n = ref 0 in
	try
	while true do
	i := 1 + Str.search_forward re (Bytes.unsafe_to_string s) !i;
	incr n
	done;
	assert false
	with Not_found -> !n

	let () =
	let chunk_size = code_length / workers
	and rem = code_length mod workers in
	assert (chunk_size >= 7);
	let w = Array.make workers stdin
	and red_workers = workers - 1 in
	for i = 0 to red_workers do
	let delta = if i > 0 then 7 else 0 in
	let lo = i * chunk_size + min i rem - delta in
	let len = chunk_size + (if i < rem then 1 else 0) + delta
	and input, output = Unix.pipe () in
	match Unix.fork () with
	\| 0 ->
	Unix.close input;
	let chunk = Bytes.sub dna lo len
	and output = Unix.out_channel_of_descr output in
	(* First all the regexps... *)
	Array.iter
	(fun re -> output_binary_int output (count re chunk))
	variants;
	(* ...and then all the IUPAC replacements *)
	if i > 0 then
	for j = 0 to 6 do
	chunk.[j] <- ' '
	done;
	let b = ref chunk in
	Array.iter
	(fun (re, s) -> b := Bytes.unsafe_of_string (Str.global_replace re s (Bytes.unsafe_to_string !b)))
	iupacs;
	output_binary_int output (Bytes.length !b - delta);
	exit 0
	\| _ ->
	Unix.close output;
	w.(i) <- Unix.in_channel_of_descr input
	done;
	let counts_variants = Array.init (Array.length variants) (fun _ -> ref 0)
	and count_iupac = ref 0 in
	Array.iter
	(fun input ->
	Array.iter
	(fun count -> count := !count + input_binary_int input)
	counts_variants;
	count_iupac := !count_iupac + input_binary_int input)
	w;
	Array.iteri
	(fun i re -> Printf.printf "%s %i\n" re !(counts_variants.(i)))
	variants_to_string;
	Printf.printf "\n%i\n%i\n%i\n%!" file_length code_length !count_iupac