-
-
Save lucidjargon/2063465 to your computer and use it in GitHub Desktop.
/// Returns the third component of a 3-tuple.
let third (_, _, c) = c
/// Returns the first component of a 3-tuple.
let fst3 (a, _, _) = a
/// Cuts `data` into consecutive, non-overlapping chunks of `c` elements,
/// merges each chunk into a single value with `op`, and collects the hashes
/// of the completed chunks.
///
/// op:   combines the running chunk value with the next element
///       (e.g. `(+)` for strings, `(^^^)` for bytes).
/// c:    number of elements per chunk.
/// data: input elements; must contain at least one element, because
///       `data.[0]` seeds the first chunk (an empty array raises).
///
/// Returns `(hashes, pending, pendingCount)`:
///   hashes       - set of `hash` values of every chunk that was closed,
///   pending      - combined value of the chunk still being built,
///   pendingCount - number of elements merged into `pending`.
///
/// A chunk is only closed when the element after it arrives, so the last
/// chunk is never hashed; it is handed back as `pending` instead.
let split op c (data: 'a []) =
    let step (bset, curTransform, i) curbit =
        if i = c then
            // Chunk is full: record its hash and start a new chunk at curbit.
            Set.add (hash curTransform) bset, curbit, 1
        else
            // Chunk still has room: fold curbit into it.
            bset, op curTransform curbit, i + 1
    Array.fold step (Set.empty, data.[0], 1) data.[1..data.Length - 1]
/// Returns the smallest element of `vset`.
/// `Set.minElement` raises on an empty set, so callers must check first.
let minhash vset = Set.minElement vset
/// Repeatedly takes the smallest element out of `s` and adds it to `l`,
/// returning the accumulated set.
///
/// ci: pick counter; the recursion stops once `ci >= k` (callers start at 0).
/// k:  upper bound for the pick counter, i.e. at most `k - ci` picks are made.
/// l:  accumulator the picked minima are added to.
/// s:  candidate set; every picked minimum is removed before the next round.
///
/// Stops early when `s` is exhausted, so fewer than `k - ci` elements may be
/// added.
let rec minhashes ci k l s =
    if ci >= k || Set.isEmpty s then
        l
    else
        // s is non-empty here, so Set.minElement cannot raise.
        let minim = Set.minElement s
        minhashes (ci + 1) k (Set.add minim l) (Set.remove minim s)
/// Splits a string into an array of one-character strings, so that the
/// pieces can later be concatenated again with `(+)`.
let charArr (s: string) = s.ToCharArray() |> Array.map string
// Sample sentences to compare against each other.
let arr = [ "The cat in the hat" ; "The fat cat wears many hats"; "race car" ; "fast car"; "race horse" ]

// One signature per sentence: cut into 2-character chunks, keep only the set
// of chunk hashes, then retain at most the 5 smallest hashes.
let sass = arr |> List.map (charArr >> split (+) 2 >> fst3 >> minhashes 0 5 Set.empty)

// NOTE(review): `rank` is not defined in the visible part of this file;
// presumably it pairs the signatures and scores each pair - confirm.
// The resulting triples are ordered by their third component.
sass |> rank (arr |> List.toArray) 0 [] |> List.sortBy third
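This works on any data type (text, images, audio, ...), as long as the elements can be combined with an operator and hashed. For example, on raw file bytes: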
// Signature per file: read the raw bytes, XOR them together in 4096-byte
// chunks, keep the set of chunk hashes, then retain at most the 10 smallest.
files
|> Array.Parallel.map (fun path ->
    path
    |> File.ReadAllBytes
    |> split (^^^) 4096
    |> fst3
    |> minhashes 0 10 Set.empty)
("C:\Users\sir.deenicus\Downloads\Ch13.5-ConditionalRandomFields.pdf",
"C:\Users\sir.deenicus\Downloads\ftml_book.pdf", 6);
("C:\Users\sir.deenicus\Downloads\AGI-book(7-Mar-2012).pdf",
"C:\Users\sir.deenicus\Downloads\ftml_book.pdf", 6);
("C:\Users\sir.deenicus\Downloads\10.1.1.64.3559.pdf",
"C:\Users\sir.deenicus\Downloads\ftml_book.pdf", 6);
("C:\Users\sir.deenicus\Downloads\10.1.1.116.4959.pdf",
"C:\Users\sir.deenicus\Downloads\ftml_book.pdf", 6);
("C:\Users\sir.deenicus\Downloads\ftml_book.pdf",
"C:\Users\sir.deenicus\Downloads\weighted majority.pdf", 5);
("C:\Users\sir.deenicus\Downloads\ftml_book.pdf",
"C:\Users\sir.deenicus\Downloads\networks-book.pdf", 5);
("C:\Users\sir.deenicus\Downloads\ftml.pdf",
"C:\Users\sir.deenicus\Downloads\networks-book.pdf", 5);
("C:\Users\sir.deenicus\Downloads\ftml.pdf",
"C:\Users\sir.deenicus\Downloads\ftml_book.pdf", 5);
val it : (string * string * int) list =
[("fast car", "race horse", 0);
("The fat cat wears many hats", "race horse", 0);
("The fat cat wears many hats", "race car", 0);
("The cat in the hat", "race horse", 0);
("The cat in the hat", "fast car", 0);
("The cat in the hat", "race car", 0); ("race car", "fast car", 1);
("The fat cat wears many hats", "fast car", 1);
("race car", "race horse", 2);
("The cat in the hat", "The fat cat wears many hats", 2)]