Skip to content

Instantly share code, notes, and snippets.

@toburger
Last active August 29, 2015 13:56
Show Gist options
  • Save toburger/9225982 to your computer and use it in GitHub Desktop.
Save toburger/9225982 to your computer and use it in GitHub Desktop.
/// Splits `s` into its overlapping three-character substrings (trigrams),
/// in left-to-right order. A string with fewer than three characters
/// produces an empty array.
let buildTrigrams (s : string) =
    // Each window is a char[] of length 3; turn it back into a string.
    [| for window in Seq.windowed 3 s -> System.String window |]
//buildTrigrams "ABCDEFGHIJK"
/// Returns the fraction (0.0 .. 1.0) of the trigrams in `tg` that occur as a
/// substring of `s`.
///
/// An empty `tg` yields 0.0. Without this guard the division below would be
/// 0.0 / 0.0, i.e. NaN, which then leaks into every comparison made on the
/// result.
let hitPercent (s : string) (tg : string array) =
    if Array.isEmpty tg then
        0.0
    else
        let matchCount =
            tg
            |> Array.filter (fun trigram -> s.Contains(trigram))
            |> Array.length
        float matchCount / float tg.Length
//buildTrigrams "this_is_my_data_file" |> hitPercent "this_is_my_data_f"
//buildTrigrams "this_is_my_data_file (copy1)"
//buildTrigrams "this_is_my_data_file (copy2)"
/// Recursively enumerates every directory below `dir` and returns an array of
/// `((fullPath, name), trigrams)` entries, where `name` is the last path
/// segment of `fullPath` and `trigrams` are the trigrams of `name`.
///
/// NOTE(review): this calls `Directory.EnumerateDirectories`, so only
/// sub-directories are listed, while the surrounding names and comments talk
/// about files. Confirm whether `Directory.EnumerateFiles` was intended.
let trigramizeDirectoryRec dir =
    System.IO.Directory.EnumerateDirectories(dir, "*", System.IO.SearchOption.AllDirectories)
    |> Seq.map (fun fullPath ->
        // Extract the name once and reuse it for both the key and the trigrams.
        let name = System.IO.Path.GetFileName fullPath
        (fullPath, name), buildTrigrams name)
    |> Seq.toArray
/// For every `((path, name), trigrams)` entry in `trigrams`, scores `name`
/// against the trigram set of every entry and keeps the other entries whose
/// score is strictly greater than `minHitPercent`.
///
/// Returns `(path, matches)` pairs where `matches` is an array of
/// `(otherPath, score)` ordered from the highest score down. Entries without
/// any match are dropped. The outer pass runs via `Array.Parallel.map`.
let findSimilarFiles minHitPercent trigrams =
    // All entries (other than `path` itself) whose trigrams are found often
    // enough inside `name`.
    let similarTo (path, name) =
        trigrams
        |> Array.map (fun ((otherPath, _), otherTrigrams) ->
            otherPath, hitPercent name otherTrigrams)
        |> Array.filter (fun (otherPath, pct) ->
            minHitPercent < pct && otherPath <> path)
        // Sorting on (1.0 - pct) ascending puts the best match first.
        |> Array.sortBy (fun (_, pct) -> 1.0 - pct)
    trigrams
    |> Array.Parallel.map (fun (entry, _) -> fst entry, similarTo entry)
    |> Array.filter (fun (_, matches) -> not (Array.isEmpty matches))
/// Writes the similarity report to standard output: one header line per
/// `(path, matches)` entry of `sfs`, followed by one tab-indented line per
/// `(candidate, score)` match with the score shown as a percentage.
let print sfs =
    // `score` is a fraction; scale it to a percentage for display.
    let report candidate score =
        printfn "\t-> '%s' by %.0f%%" candidate (100. * score)
    for (path, matches) in sfs do
        printfn "'%s' is similar to" path
        for (candidate, score) in matches do
            report candidate score
// Enumerating the file system entries can take a while if the path is on a
// network drive or is a root folder (although the results are cached after the second hit).
// Script entry point: scan D:\Temp, keep pairs scoring above 0.9, and print
// the resulting report.
print (findSimilarFiles 0.9 (trigramizeDirectoryRec @"D:\Temp"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment