Created
May 20, 2018 18:23
-
-
Save copy/45e413312966288884aee839b022caf5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* | |
* The Computer Language Benchmarks Game | |
* https://salsa.debian.org/benchmarksgame-team/benchmarksgame/ | |
* | |
* regex-dna program contributed by Paolo Ribeca, August 2011 | |
* | |
* The regexp machinery comes from a previous OCaml version by | |
* Christophe TROESTLER and Mauricio Fernandez | |
* | |
* converted from regex-dna program | |
*) | |
let workers = 16 | |
let variants, variants_to_string = | |
let variants = [| | |
"agggtaaa\\|tttaccct"; | |
"[cgt]gggtaaa\\|tttaccc[acg]"; | |
"a[act]ggtaaa\\|tttacc[agt]t"; | |
"ag[act]gtaaa\\|tttac[agt]ct"; | |
"agg[act]taaa\\|ttta[agt]cct"; | |
"aggg[acg]aaa\\|ttt[cgt]ccct"; | |
"agggt[cgt]aa\\|tt[acg]accct"; | |
"agggta[cgt]a\\|t[acg]taccct"; | |
"agggtaa[cgt]\\|[acg]ttaccct" | |
|] | |
(* Remove the "\\" which is mandatory in OCaml regex *) | |
and re_bs = Str.regexp_string "\\" in | |
Array.map Str.regexp_case_fold variants, | |
Array.map (Str.global_replace re_bs "") variants | |
let iupacs = | |
Array.map (fun (re, s) -> Str.regexp re, s) [| | |
"tHa[Nt]", "<4>"; | |
"aND\\|caN\\|Ha[DS]\\|WaS", "<3>"; | |
"a[NSt]\\|BY", "<2>"; | |
"<[^>]*>", "\\|"; | |
"\\|[^|][^|]*\\|", "-" | |
|] | |
(* Read all of a redirected FASTA format file from stdin *) | |
let file_data, file_length = | |
let b = Buffer.create 0xffffff and s = Bytes.create 0xfff and r = ref 1 in | |
while !r > 0 do | |
r := input stdin s 0 0xfff; Buffer.add_subbytes b s 0 !r | |
done; | |
Buffer.contents b, Buffer.length b | |
(* Remove FASTA sequence descriptions and all linefeed characters *) | |
let dna = Bytes.unsafe_of_string (Str.global_replace (Str.regexp "^>.*$\\|\n") "" file_data) | |
let code_length = Bytes.length dna | |
(* Count matches of [re] *) | |
let count re s = | |
let i = ref 0 and n = ref 0 in | |
try | |
while true do | |
i := 1 + Str.search_forward re (Bytes.unsafe_to_string s) !i; | |
incr n | |
done; | |
assert false | |
with Not_found -> !n | |
let () = | |
let chunk_size = code_length / workers | |
and rem = code_length mod workers in | |
assert (chunk_size >= 7); | |
let w = Array.make workers stdin | |
and red_workers = workers - 1 in | |
for i = 0 to red_workers do | |
let delta = if i > 0 then 7 else 0 in | |
let lo = i * chunk_size + min i rem - delta in | |
let len = chunk_size + (if i < rem then 1 else 0) + delta | |
and input, output = Unix.pipe () in | |
match Unix.fork () with | |
| 0 -> | |
Unix.close input; | |
let chunk = Bytes.sub dna lo len | |
and output = Unix.out_channel_of_descr output in | |
(* First all the regexps... *) | |
Array.iter | |
(fun re -> output_binary_int output (count re chunk)) | |
variants; | |
(* ...and then all the IUPAC replacements *) | |
if i > 0 then | |
for j = 0 to 6 do | |
chunk.[j] <- ' ' | |
done; | |
let b = ref chunk in | |
Array.iter | |
(fun (re, s) -> b := Bytes.unsafe_of_string (Str.global_replace re s (Bytes.unsafe_to_string !b))) | |
iupacs; | |
output_binary_int output (Bytes.length !b - delta); | |
exit 0 | |
| _ -> | |
Unix.close output; | |
w.(i) <- Unix.in_channel_of_descr input | |
done; | |
let counts_variants = Array.init (Array.length variants) (fun _ -> ref 0) | |
and count_iupac = ref 0 in | |
Array.iter | |
(fun input -> | |
Array.iter | |
(fun count -> count := !count + input_binary_int input) | |
counts_variants; | |
count_iupac := !count_iupac + input_binary_int input) | |
w; | |
Array.iteri | |
(fun i re -> Printf.printf "%s %i\n" re !(counts_variants.(i))) | |
variants_to_string; | |
Printf.printf "\n%i\n%i\n%i\n%!" file_length code_length !count_iupac |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment