Created
August 25, 2015 07:55
-
-
Save object/57b24e0a74ca3fdac7dd to your computer and use it in GitHub Desktop.
FindFileDuplicates (sumbission to Kodekampen)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open System | |
open System.IO | |
open System.Diagnostics | |
open System.Collections.Generic | |
// Generic method to run parallel computations applied to elements of a collection | |
let runParallel f xs = | |
Async.Parallel [ for x in xs -> async { return f x } ] | |
|> Async.RunSynchronously | |
// The main algorithm to find and list files with identical content | |
let findFileDuplicates rootPath minSize = | |
// Used to serialize printing program output from multiple threads | |
let globalLock = Object() | |
// Since detailed printing slows down the program significantly, set this flag to "false" to measure raw performance | |
let shouldPrintFileDetails = true | |
let fileHeaderLength = 10 | |
// Performs byte by byte comparison of two files | |
let compareFilesBytes file1 file2 = | |
let rec compareBytes (fs1 : Stream) (fs2 : Stream) = | |
let b1 = fs1.ReadByte() | |
let b2 = fs2.ReadByte() | |
match b1 with | |
| -1 -> true | |
| b when b <> b2 -> false | |
| _ -> compareBytes fs1 fs2 | |
use fs1 = File.OpenRead(file1) | |
use fs2 = File.OpenRead(file2) | |
compareBytes fs1 fs2 | |
// Prints duplicate file group details | |
let printDuplicatesInFileGroup fileDuplicates = | |
if shouldPrintFileDetails then | |
lock globalLock (fun () -> | |
printfn "\nDuplicate files (size of %d bytes)" (FileInfo(Seq.head fileDuplicates).Length) | |
for f in fileDuplicates do printfn "\t%s" f) | |
else | |
printf "." | |
// Recursively find duplicates for the given file and returns unmatched files | |
let rec extractFileDuplicates file filesToCompare fileDuplicates unmatchedFiles = | |
match filesToCompare with | |
| [] when (List.length fileDuplicates) > 1 -> | |
printDuplicatesInFileGroup fileDuplicates | |
unmatchedFiles | |
| fileToCompare :: xs -> | |
let (unmatchedFiles, fileDuplicates) = match compareFilesBytes file fileToCompare with | |
| true -> (unmatchedFiles, fileToCompare :: fileDuplicates) | |
| false -> (fileToCompare :: unmatchedFiles, fileDuplicates) | |
extractFileDuplicates file xs fileDuplicates unmatchedFiles | |
| _ -> unmatchedFiles | |
// Recursively compares all file in the given file group | |
let rec compareFilesInGroup (files : string list) = | |
match files with | |
| [] -> () | |
| x :: xs -> | |
let remainingFiles = extractFileDuplicates x xs [x] [] | |
compareFilesInGroup remainingFiles | |
// Populates all files in the root folder in its subdirectories | |
let enumerateFiles rootPath = | |
DirectoryInfo(rootPath).EnumerateDirectories("*.*", SearchOption.TopDirectoryOnly) | |
|> runParallel (fun x -> DirectoryInfo(x.FullName).EnumerateFiles("*.*", SearchOption.AllDirectories) |> List.ofSeq) | |
|> Seq.ofArray | |
|> Seq.concat | |
// Adds a file to a group by certain criteria | |
let addFileToGroup filePath groupId (container : Dictionary<int64, List<string>>) = | |
lock container (fun () -> | |
match container.TryGetValue(groupId) with | |
| (true, bucket) -> bucket.Add(filePath) | |
| _ -> | |
let bucket = List<string>() | |
bucket.Add(filePath) | |
container.Add(groupId, bucket)) | |
() | |
// Main processing starts | |
let sw = Stopwatch() | |
sw.Start() | |
printfn "Enumerating files..." | |
let files = enumerateFiles rootPath | |
|> Seq.filter (fun x -> x.Length >= minSize) | |
|> Seq.toList | |
printfn "Found %d files with size >= %d bytes" (List.length files) minSize | |
let filesBySize = Dictionary<int64, List<string>>() | |
files |> List.iter (fun x -> addFileToGroup x.FullName x.Length filesBySize) | |
printfn "Analyzing %d file groups having two or more files of same size" filesBySize.Keys.Count | |
let runComparison (filesOfSameSize : KeyValuePair<int64,List<string>>) = | |
match filesOfSameSize.Value.Count with | |
| 1 -> () | |
| n -> compareFilesInGroup (filesOfSameSize.Value |> Seq.toList) | |
runParallel runComparison filesBySize |> ignore | |
sw.Stop() | |
printfn "\nExecution completed, elapsed time %d seconds" (int sw.ElapsedMilliseconds / 1000) | |
// Main processing ends | |
let printUsage () = | |
printfn "Usage: FindFileDuplicates <minSizeInBytes> <rootFolder>" | |
let printInvalidMinSize () = | |
printfn "Invalid <minSizeInBytes> argument, must be a positive integer." | |
let printInvalidRootPath () = | |
printfn "Invalid <rootFolder> argument, must be an existing directory." | |
[<EntryPoint>] | |
let main argv = | |
match argv with | |
| [|arg1; arg2|] -> | |
let arg1ok, minSize = Int64.TryParse(arg1) | |
let arg2ok, rootPath = Directory.Exists(arg2), arg2 | |
match (arg1ok, arg2ok) with | |
| (false, _) -> printInvalidMinSize() | |
| (_, false) -> printInvalidRootPath() | |
| _ -> findFileDuplicates rootPath minSize | |
| _ -> printUsage() | |
0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment