Skip to content

Instantly share code, notes, and snippets.

@object
Created August 25, 2015 07:55
Show Gist options
  • Save object/57b24e0a74ca3fdac7dd to your computer and use it in GitHub Desktop.
Save object/57b24e0a74ca3fdac7dd to your computer and use it in GitHub Desktop.
FindFileDuplicates (sumbission to Kodekampen)
open System
open System.IO
open System.Diagnostics
open System.Collections.Generic
// Generic method to run parallel computations applied to elements of a collection
let runParallel f xs =
Async.Parallel [ for x in xs -> async { return f x } ]
|> Async.RunSynchronously
// The main algorithm to find and list files with identical content
let findFileDuplicates rootPath minSize =
// Used to serialize printing program output from multiple threads
let globalLock = Object()
// Since detailed printing slows down the program significantly, set this flag to "false" to measure raw performance
let shouldPrintFileDetails = true
let fileHeaderLength = 10
// Performs byte by byte comparison of two files
let compareFilesBytes file1 file2 =
let rec compareBytes (fs1 : Stream) (fs2 : Stream) =
let b1 = fs1.ReadByte()
let b2 = fs2.ReadByte()
match b1 with
| -1 -> true
| b when b <> b2 -> false
| _ -> compareBytes fs1 fs2
use fs1 = File.OpenRead(file1)
use fs2 = File.OpenRead(file2)
compareBytes fs1 fs2
// Prints duplicate file group details
let printDuplicatesInFileGroup fileDuplicates =
if shouldPrintFileDetails then
lock globalLock (fun () ->
printfn "\nDuplicate files (size of %d bytes)" (FileInfo(Seq.head fileDuplicates).Length)
for f in fileDuplicates do printfn "\t%s" f)
else
printf "."
// Recursively find duplicates for the given file and returns unmatched files
let rec extractFileDuplicates file filesToCompare fileDuplicates unmatchedFiles =
match filesToCompare with
| [] when (List.length fileDuplicates) > 1 ->
printDuplicatesInFileGroup fileDuplicates
unmatchedFiles
| fileToCompare :: xs ->
let (unmatchedFiles, fileDuplicates) = match compareFilesBytes file fileToCompare with
| true -> (unmatchedFiles, fileToCompare :: fileDuplicates)
| false -> (fileToCompare :: unmatchedFiles, fileDuplicates)
extractFileDuplicates file xs fileDuplicates unmatchedFiles
| _ -> unmatchedFiles
// Recursively compares all file in the given file group
let rec compareFilesInGroup (files : string list) =
match files with
| [] -> ()
| x :: xs ->
let remainingFiles = extractFileDuplicates x xs [x] []
compareFilesInGroup remainingFiles
// Populates all files in the root folder in its subdirectories
let enumerateFiles rootPath =
DirectoryInfo(rootPath).EnumerateDirectories("*.*", SearchOption.TopDirectoryOnly)
|> runParallel (fun x -> DirectoryInfo(x.FullName).EnumerateFiles("*.*", SearchOption.AllDirectories) |> List.ofSeq)
|> Seq.ofArray
|> Seq.concat
// Adds a file to a group by certain criteria
let addFileToGroup filePath groupId (container : Dictionary<int64, List<string>>) =
lock container (fun () ->
match container.TryGetValue(groupId) with
| (true, bucket) -> bucket.Add(filePath)
| _ ->
let bucket = List<string>()
bucket.Add(filePath)
container.Add(groupId, bucket))
()
// Main processing starts
let sw = Stopwatch()
sw.Start()
printfn "Enumerating files..."
let files = enumerateFiles rootPath
|> Seq.filter (fun x -> x.Length >= minSize)
|> Seq.toList
printfn "Found %d files with size >= %d bytes" (List.length files) minSize
let filesBySize = Dictionary<int64, List<string>>()
files |> List.iter (fun x -> addFileToGroup x.FullName x.Length filesBySize)
printfn "Analyzing %d file groups having two or more files of same size" filesBySize.Keys.Count
let runComparison (filesOfSameSize : KeyValuePair<int64,List<string>>) =
match filesOfSameSize.Value.Count with
| 1 -> ()
| n -> compareFilesInGroup (filesOfSameSize.Value |> Seq.toList)
runParallel runComparison filesBySize |> ignore
sw.Stop()
printfn "\nExecution completed, elapsed time %d seconds" (int sw.ElapsedMilliseconds / 1000)
// Main processing ends
let printUsage () =
printfn "Usage: FindFileDuplicates <minSizeInBytes> <rootFolder>"
let printInvalidMinSize () =
printfn "Invalid <minSizeInBytes> argument, must be a positive integer."
let printInvalidRootPath () =
printfn "Invalid <rootFolder> argument, must be an existing directory."
[<EntryPoint>]
let main argv =
match argv with
| [|arg1; arg2|] ->
let arg1ok, minSize = Int64.TryParse(arg1)
let arg2ok, rootPath = Directory.Exists(arg2), arg2
match (arg1ok, arg2ok) with
| (false, _) -> printInvalidMinSize()
| (_, false) -> printInvalidRootPath()
| _ -> findFileDuplicates rootPath minSize
| _ -> printUsage()
0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment