Last active
August 29, 2015 14:27
-
-
Save mastoj/83b0e0188fcec69cd3fb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open System | |
[<AutoOpen>] | |
module FileHelper = | |
type FileCheck = | |
{ | |
FileName: string | |
Size: int64 | |
} | |
let directory dir = new System.IO.DirectoryInfo(dir) | |
let getFileBytes (file:System.IO.FileInfo) = | |
use fileStream = file.OpenRead() | |
let byteArr = Array.init (int file.Length) (fun _ -> new System.Byte()) | |
let readBytes = fileStream.Read(byteArr, 0, (int file.Length)) | |
byteArr | |
let createFileCheck (file:System.IO.FileInfo) = | |
try | |
Some { | |
FileName = file.FullName | |
Size = file.Length | |
} | |
with | |
| :? System.IO.PathTooLongException as e -> | |
printfn "File error: %s" e.Message | |
None | |
let rec getFiles filter (dir:System.IO.DirectoryInfo) = | |
seq { | |
yield! dir.GetFiles() | |
|> Seq.filter filter | |
|> Seq.map createFileCheck | |
|> Seq.filter Option.isSome | |
|> Seq.map (fun (Some f) -> f) | |
yield! dir.GetDirectories() | |
|> Seq.map (getFiles filter) | |
|> Seq.concat | |
} | |
let limitSize limit (file:System.IO.FileInfo) = file.Length < limit | |
[<AutoOpen>] | |
module Parsing = | |
type Result<'T> = | |
| Success of 'T | |
| Fail of string | |
let bind f = function | |
| Success x -> f x | |
| Fail s -> Fail s | |
let (>>=) r f = bind f r | |
let validateArgLenth (argv:string[]) = | |
if argv.Length <> 2 then Fail (sprintf "Expected 2 arguments got %i" argv.Length) | |
else Success argv | |
let parseArg1 (argv:string[]) = | |
let (parseOk, value) = System.Int64.TryParse(argv.[0]) | |
if parseOk then Success value | |
else Fail (sprintf "Invalid limit argument %s" argv.[0]) | |
let parseArg2 (argv:string[]) value = | |
let dir = argv.[1] |> directory | |
if dir.Exists then Success (value, dir) | |
else Fail (sprintf "Invalid directory %s" argv.[1]) | |
let parse argv = | |
argv |> validateArgLenth >>= parseArg1 >>= (parseArg2 argv) | |
[<AutoOpen>] | |
module IOExtensions = | |
// Stuff copied with pride from: http://www.fssnip.net/1k | |
open System.IO | |
type AsyncSeq<'T> = Async<AsyncSeqInner<'T>> | |
and AsyncSeqInner<'T> = | |
| Ended | |
| Item of 'T * AsyncSeq<'T> | |
let readInBlocks fn size = async { | |
let stream = File.OpenRead(fn) | |
let buffer = Array.zeroCreate size | |
let rec nextBlock() = async { | |
let! count = stream.AsyncRead(buffer, 0, size) | |
if count = 0 then return Ended | |
else | |
let res = | |
if count = size then buffer | |
else buffer |> Seq.take count |> Array.ofSeq | |
return Item(res, nextBlock()) } | |
return! nextBlock() } | |
let rec compareAsyncSeqs seq1 seq2 = async { | |
let! item1 = seq1 | |
let! item2 = seq2 | |
match item1, item2 with | |
| Item(b1, ns1), Item(b2, ns2) when b1 <> b2 -> return false | |
| Item(b1, ns1), Item(b2, ns2) -> return! compareAsyncSeqs ns1 ns2 | |
| Ended, Ended -> return true | |
| _ -> return failwith "Size doesn't match" } | |
[<AutoOpen>] | |
module Printing = | |
// Colored printfn | |
let cprintfn c fmt = | |
Printf.kprintf | |
(fun s -> | |
let old = System.Console.ForegroundColor | |
try | |
System.Console.ForegroundColor <- c; | |
System.Console.WriteLine s | |
finally | |
System.Console.ForegroundColor <- old) | |
fmt | |
let printFileCheck fileCheck = | |
cprintfn ConsoleColor.Yellow "\tFile: %s" fileCheck.FileName | |
let print group = | |
let head = group |> Seq.head | |
cprintfn ConsoleColor.Green "File %s has the same content as" head.FileName | |
group |> Seq.skip 1 |> Seq.iter printFileCheck | |
let printResult (startTime:DateTime) result = | |
let resultList = result |> Seq.map (fun x -> x |> Seq.toList) |> Seq.toList | |
let timeFormat (dateTime:DateTime) = dateTime.ToString("HH:mm:ss.fffff") | |
let stopTime = DateTime.Now | |
let duration = stopTime - startTime | |
printfn "============== RESULT ==============" | |
resultList |> Seq.iter print | |
printfn "============== END RESULT ==============" | |
printfn "============== SUMMARY ==============" | |
cprintfn ConsoleColor.Green "Number of groups with duplicates: %i" (resultList |> List.length) | |
cprintfn ConsoleColor.Green "Total number of duplicates: %i" (resultList |> List.sumBy List.length) | |
cprintfn ConsoleColor.Green "Start time: %s" (startTime |> timeFormat) | |
cprintfn ConsoleColor.Green "Stop time: %s" (stopTime |> timeFormat) | |
cprintfn ConsoleColor.Green "Duration: %f s" duration.TotalSeconds | |
printfn "============== END SUMMARY ==============" | |
[<AutoOpen>] | |
module DuplicateSearch = | |
let findDuplicatesInGroup group = | |
let compareFileChecks fc1 fc2 = | |
let s1 = readInBlocks fc1.FileName 1000 | |
let s2 = readInBlocks fc2.FileName 1000 | |
printfn "Comparing '%s' with '%s" fc1.FileName fc2.FileName | |
compareAsyncSeqs s1 s2 |> Async.RunSynchronously | |
let rec find remaining res = | |
match remaining with | |
| [] -> res | |
| _ -> | |
let candidate = remaining |> Seq.head | |
let tailSet = remaining |> Seq.skip 1 |> Set.ofSeq | |
let dups = tailSet |> Seq.filter (compareFileChecks candidate) |> Seq.toList | |
match dups with | |
| [] -> find (remaining |> List.tail) res | |
| xs -> | |
let remaining' = tailSet - (dups |> Set.ofList) |> Seq.toList | |
find remaining' ((candidate::dups)::res) | |
find (group |> Seq.toList) [] | |
let groupFilesBySize = Seq.groupBy (fun s -> s.Size) | |
let removeKeyFromGroups = Seq.map (fun (k, g) -> g) | |
let findDuplicatesInGroups = Seq.map findDuplicatesInGroup | |
let joinDuplicateGroups = Seq.concat | |
let findDuplicates sizeLimit dir = | |
dir | |
|> getFiles (limitSize sizeLimit) | |
|> groupFilesBySize | |
|> removeKeyFromGroups | |
|> findDuplicatesInGroups | |
|> joinDuplicateGroups | |
[<EntryPoint>] | |
let main argv = | |
let parseResult = parse argv | |
let startTime = System.DateTime.Now | |
match parseResult with | |
| Fail str -> printfn "%s" str | |
| Success (limit, dir) -> findDuplicates limit dir |> printResult startTime | |
0 // return an integer exit code |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment