Skip to content

Instantly share code, notes, and snippets.

@lucidjargon
Created March 17, 2012 17:52
Show Gist options
  • Save lucidjargon/2063465 to your computer and use it in GitHub Desktop.
Save lucidjargon/2063465 to your computer and use it in GitHub Desktop.
Simple minhash in F#
/// Returns the last component of a three-element tuple.
let third (_, _, last) = last
/// Returns the first component of a three-element tuple.
let fst3 (head, _, _) = head
/// Walks `data` in runs of `c` consecutive elements, combining the elements of
/// each run with `op` and recording the `hash` of every run that gets closed.
///
/// Parameters:
///   op   - binary function used to merge the next element into the running value.
///   c    - run length; a run is closed when it already holds `c` elements and
///          another element arrives.
///   data - input array; must be non-empty, because its first element seeds the
///          running value (indexing an empty array raises).
///
/// Returns a triple `(hashes, pending, filled)`:
///   hashes  - set of hashes of the runs that were closed,
///   pending - combined value of the run still open when the input ended,
///   filled  - number of elements merged into `pending`.
/// The open trailing run is NOT hashed into `hashes`; callers that need it must
/// take it from `pending`.
let split op c (data: 'a []) =
    let mutable hashes = Set.empty
    let mutable pending = data.[0]
    let mutable filled = 1
    // Start at index 1: element 0 already seeded `pending`. Iterating by index
    // avoids copying the remainder of the array into a slice first.
    for idx in 1 .. data.Length - 1 do
        let item = data.[idx]
        if filled = c then
            // The current run is full: record it and start a new run with `item`.
            hashes <- Set.add (hash pending) hashes
            pending <- item
            filled <- 1
        else
            pending <- op pending item
            filled <- filled + 1
    hashes, pending, filled
/// Returns the smallest element of `vset` (raises, as `Set.minElement` does,
/// when the set is empty).
let minhash vset = Set.minElement vset
/// Moves the smallest elements of `s` into `l`, one per step, and returns the
/// resulting set.
///
/// Parameters:
///   ci - current step counter (callers in this file start it at 0).
///   k  - step limit; recursion stops once `ci >= k`.
///   l  - accumulator set that receives the selected elements.
///   s  - set of candidates still available.
///
/// Stops early when `s` runs out of elements, so at most `k - ci` elements are
/// added to `l`.
let rec minhashes ci k l s =
    // Set.isEmpty is the direct emptiness test; Set.count would count the
    // elements of the set on every step just to compare the total with 0.
    if ci >= k || Set.isEmpty s then
        l
    else
        // Same selection as the `minhash` helper: the smallest remaining element.
        let smallest = Set.minElement s
        minhashes (ci + 1) k (Set.add smallest l) (Set.remove smallest s)
/// Breaks a string into an array of one-character strings, in order.
let charArr (s:string) =
    Array.init s.Length (fun pos -> string s.[pos])
// Sample sentences that are compared against each other.
let arr =
    [ "The cat in the hat"
      "The fat cat wears many hats"
      "race car"
      "fast car"
      "race horse" ]

// One signature per sentence: split the text into single-character strings,
// concatenate them in runs of 2, hash the closed runs, and keep up to 5 of the
// smallest hashes.
let sass =
    arr
    |> List.map (fun sentence ->
        sentence |> charArr |> split (+) 2 |> fst3 |> minhashes 0 5 Set.empty)

// NOTE(review): `rank` is not defined in the visible part of this file; it
// presumably pairs up the sentences and scores each pair - confirm against its
// definition. The result is ordered by the third component of each tuple.
List.sortBy third (rank (List.toArray arr) 0 [] sass)
@lucidjargon
Copy link
Author

val it : (string * string * int) list =
[("fast car", "race horse", 0);
("The fat cat wears many hats", "race horse", 0);
("The fat cat wears many hats", "race car", 0);
("The cat in the hat", "race horse", 0);
("The cat in the hat", "fast car", 0);
("The cat in the hat", "race car", 0); ("race car", "fast car", 1);
("The fat cat wears many hats", "fast car", 1);
("race car", "race horse", 2);
("The cat in the hat", "The fat cat wears many hats", 2)]

@lucidjargon
Copy link
Author

This works on any data type (text, images, audio, ...), for example on raw file bytes:

// Builds one signature per file, processing the files in parallel: read the
// file's bytes, XOR them together in runs of 4096 with `split`, and keep up to
// 10 of the smallest run hashes. As with any use of `split` + `fst3`, the
// trailing run of each file is not hashed.
// NOTE(review): `files` is not defined here (presumably an array of file
// paths) and `File` presumably comes from System.IO - confirm both.
files |> Array.Parallel.map ( File.ReadAllBytes >> (split (^^^) 4096) >> fst3 >> (minhashes 0 10 Set.empty))

@lucidjargon
Copy link
Author

("C:\Users\sir.deenicus\Downloads\Ch13.5-ConditionalRandomFields.pdf",
"C:\Users\sir.deenicus\Downloads\ftml_book.pdf", 6);
("C:\Users\sir.deenicus\Downloads\AGI-book(7-Mar-2012).pdf",
"C:\Users\sir.deenicus\Downloads\ftml_book.pdf", 6);
("C:\Users\sir.deenicus\Downloads\10.1.1.64.3559.pdf",
"C:\Users\sir.deenicus\Downloads\ftml_book.pdf", 6);
("C:\Users\sir.deenicus\Downloads\10.1.1.116.4959.pdf",
"C:\Users\sir.deenicus\Downloads\ftml_book.pdf", 6);
("C:\Users\sir.deenicus\Downloads\ftml_book.pdf",
"C:\Users\sir.deenicus\Downloads\weighted majority.pdf", 5);
("C:\Users\sir.deenicus\Downloads\ftml_book.pdf",
"C:\Users\sir.deenicus\Downloads\networks-book.pdf", 5);
("C:\Users\sir.deenicus\Downloads\ftml.pdf",
"C:\Users\sir.deenicus\Downloads\networks-book.pdf", 5);
("C:\Users\sir.deenicus\Downloads\ftml.pdf",
"C:\Users\sir.deenicus\Downloads\ftml_book.pdf", 5);

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment