Last active
August 29, 2015 13:56
-
-
Save toburger/9225982 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Splits `s` into its overlapping three-character substrings (trigrams),
/// in order of appearance. Inputs with fewer than three characters produce
/// an empty array, because there is no complete window to emit.
let buildTrigrams (s : string) =
    [| for window in Seq.windowed 3 s -> System.String(window) |]
//buildTrigrams "ABCDEFGHIJK" | |
/// Returns the fraction (0.0 .. 1.0) of the trigrams in `tg` that occur as a
/// substring of `s`.
///
/// An empty `tg` yields 0.0: without the guard the result would be
/// 0.0 / 0.0 = NaN, which is not a meaningful ratio.
let hitPercent (s : string) (tg : string array) =
    if Array.isEmpty tg then
        0.0
    else
        // Count matches in a single pass instead of materialising a filtered array.
        let matchCount =
            tg |> Array.sumBy (fun trigram -> if s.Contains(trigram) then 1 else 0)
        float matchCount / float tg.Length
//buildTrigrams "this_is_my_data_file" |> hitPercent "this_is_my_data_f" | |
//buildTrigrams "this_is_my_data_file (copy1)" | |
//buildTrigrams "this_is_my_data_file (copy2)" | |
/// Recursively enumerates the entries below `dir` and pairs each one with the
/// trigrams of its name. Every element of the resulting array has the shape
/// ((fullPath, name), trigrams).
let trigramizeDirectoryRec dir =
    // NOTE(review): EnumerateDirectories yields sub-directories, not files, while
    // the rest of the script speaks of "files" - confirm which one is intended.
    let entries =
        System.IO.Directory.EnumerateDirectories(dir, "*", System.IO.SearchOption.AllDirectories)
    [| for fullPath in entries do
        // The last path segment is the name that gets compared between entries.
        let name = System.IO.Path.GetFileName fullPath
        yield ((fullPath, name), buildTrigrams name) |]
/// For every entry of `trigrams` (elements shaped ((path, name), trigramArray)),
/// collects the other entries whose hit ratio against this entry's name is
/// strictly greater than `minHitPercent`, ordered from best to worst match.
/// Entries that end up without any match are dropped.
///
/// Returns an array of (path, [| (similarPath, hitRatio) |]).
let findSimilarFiles minHitPercent trigrams =
    // All candidates similar enough to `name`, excluding the entry at `path` itself.
    let similarTo path name =
        trigrams
        |> Array.choose (fun ((otherPath, _), otherTrigrams) ->
            let ratio = hitPercent name otherTrigrams
            if ratio > minHitPercent && otherPath <> path then
                Some (otherPath, ratio)
            else
                None)
        // Ascending on (1.0 - ratio) puts the highest ratios first.
        |> Array.sortBy (fun (_, ratio) -> 1.0 - ratio)

    trigrams
    |> Array.Parallel.map (fun ((path, name), _) -> path, similarTo path name)
    |> Array.filter (fun (_, matches) -> not (Array.isEmpty matches))
/// Writes every (path, matches) pair to standard output: one header line per
/// path, followed by one tab-indented line per match showing the hit ratio as a
/// whole-number percentage.
///
/// The inner collection is declared with a flexible type (`#seq`) so that any
/// sequence-like container - including the arrays returned by findSimilarFiles -
/// is accepted, not only values typed exactly as `seq<string * float>`.
let print (sfs : seq<string * #seq<string * float>>) =
    for path, matches in sfs do
        printfn "'%s' is similar to" path
        for similarPath, ratio in matches do
            printfn "\t-> '%s' by %.0f%%" similarPath (ratio * 100.)
// Enumerating the tree can take a while when the target is a network drive or a
// root folder (the original note adds that results are cached after the second
// hit - presumably by the file system; not verified here).
findSimilarFiles 0.9 (trigramizeDirectoryRec @"D:\Temp")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment