Created
August 24, 2015 08:33
-
-
Save bjartwolf/d4d83eda3c2f18bf3ce7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open System | |
open System.IO | |
// A file found during the directory scan.
//   Path   - the path as returned by Directory.GetFiles.
//   Length - the file size as reported by FileInfo.Length (bytes).
type FileWithLength =
    { Path: string
      Length: int64 }
// A file that has been opened for content comparison.
//   FileWithLength - the scan record this stream belongs to.
//   Str            - the open stream the content is read from.
//   Buf            - the buffer that receives the most recently read chunk;
//                    it is overwritten on every read.
type FileWithLengthAndStreamBuffer =
    { FileWithLength: FileWithLength
      Str: System.IO.FileStream
      Buf: byte[] }
// Lists the files directly inside `path` together with their sizes.
//
// The scan is best-effort:
//   * if the directory cannot be listed, a message is printed and no files
//     are returned for it;
//   * paths of 259 characters or more are ignored, they are annoying on Windows;
//   * a file whose size cannot be read (for example because it disappeared
//     between listing and inspection) is reported and skipped instead of
//     aborting the whole scan.
let listfiles path =
    let files =
        try
            Directory.GetFiles(path)
        with _ ->
            printfn "Access denied to %A" path
            Array.empty
    files
    |> Array.filter (fun f -> f.Length < 259)
    |> Array.choose (fun f ->
        try
            Some { Path = f; Length = (new FileInfo(f)).Length }
        with ex ->
            printfn "Skipping %A: %s" f ex.Message
            None)
// Returns a new array holding only the files whose Length is at least `minSize`.
let filterBySize minSize (files: FileWithLength []) =
    let isBigEnough (file: FileWithLength) = file.Length >= minSize
    Array.filter isBigEnough files
// Lazily enumerates every file of at least `minSize` under `path`: first the
// files directly inside `path`, then, recursively, those of each sub-directory.
// Directories whose sub-directories cannot be listed are reported and treated
// as having none.
let rec getAllFiles path minSize =
    // Evaluated only once the files of `path` itself have been yielded.
    let readableSubdirs () =
        try
            Directory.GetDirectories(path)
        with _ ->
            printfn "Access denied to %A" path
            Array.empty
    seq {
        yield! filterBySize minSize (listfiles path)
        yield! readableSubdirs () |> Seq.collect (fun subdir -> getAllFiles subdir minSize)
    }
// Number of bytes compared per read. Kept small on purpose; the intent appears
// to be keeping the per-file buffers off the large object heap.
let bufferlength = 4096
// Reads the next chunk of every open file into that file's own buffer.
//
// `files` is a collection of groups of open files. For each file the buffer is
// filled completely, or up to the end of the stream. A single Stream.Read call
// is allowed to deliver fewer bytes than requested, so the read is repeated
// until the buffer is full or the stream reports end-of-file (0 bytes).
// Whatever part of the buffer could not be filled is zeroed, so that the
// buffer content depends only on the bytes read in this round.
let readToBuffer files =
    for s in files do
        for f in s do
            let capacity = f.Buf.Length
            let mutable filled = 0
            let mutable atEnd = false
            while not atEnd && filled < capacity do
                let received = f.Str.Read(f.Buf, filled, capacity - filled)
                if received = 0 then atEnd <- true
                else filled <- filled + received
            // Clear leftovers from the previous chunk past the end of the data.
            if filled < capacity then
                Array.fill f.Buf filled (capacity - filled) 0uy
// Refines groups of open files chunk by chunk.
//
// `files` is a list of groups; each round reads the next chunk of every file
// and splits every group by the content of the freshly filled buffers. Groups
// left with a single member are dropped. The process repeats on the surviving
// groups until none are left (result: empty list) or until the first file of
// the first surviving group has been read to its end, in which case the
// surviving groups are returned.
let rec splitOpenFilesOfEqualLengthByContentStreaming files =
    readToBuffer files
    // The buffers are overwritten by the next read, so the grouping is fully
    // materialised into lists before recursing.
    let stillMatching =
        files
        |> List.collect (fun candidates ->
            candidates
            |> Seq.groupBy (fun f -> f.Buf)
            |> Seq.map (snd >> Seq.toList)
            |> Seq.filter (fun sameChunk -> List.length sameChunk > 1)
            |> Seq.toList)
    match stillMatching with
    | [] -> []
    | firstGroup :: _ ->
        // Only one file is probed for end-of-stream; the groups are expected
        // to contain files of equal length.
        let probe = List.head firstGroup
        if probe.Str.Position <> probe.Str.Length then
            splitOpenFilesOfEqualLengthByContentStreaming stillMatching
        else
            stillMatching
// Splits a list of files into groups of files with identical content.
//
// Every file is opened for reading (sharing read/write access with other
// processes); files that cannot be opened, because they are locked or
// something, are ignored. All opened files are compared chunk by chunk and
// only groups with more than one member are returned.
//
// The streams are disposed in a `finally` block, so the file handles are
// released even when reading one of the files fails half-way through.
let splitFilesOfEqualLengthByContentStreaming (files:FileWithLength list):FileWithLength list list =
    let tryOpen (file: FileWithLength) =
        try
            Some { FileWithLength = file
                   Str = new FileStream(file.Path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)
                   Buf = Array.zeroCreate<byte> bufferlength }
        with _ -> None
    let opened = files |> List.choose tryOpen
    try
        // The comparison works on lists, so the result is fully computed
        // before the streams are closed below.
        splitOpenFilesOfEqualLengthByContentStreaming [ opened ]
        |> List.map (List.map (fun f -> f.FileWithLength))
        |> List.filter (fun group -> group.Length > 1)
    finally
        opened |> List.iter (fun f -> f.Str.Dispose())
// Groups all files of at least `minSize` under `path` by their length and
// keeps only the groups that contain more than one file.
let findFilesOfSameSize path minSize =
    let hasSeveralMembers group = Seq.length group > 1
    getAllFiles path minSize
    |> Seq.groupBy (fun file -> file.Length)
    |> Seq.map snd
    |> Seq.filter hasSeveralMembers
// Finds groups of files with identical content under `path`, looking only at
// files of at least `minSize`. Candidates are first grouped by size; each
// same-size group is then split by content, and the resulting groups are
// returned as one flat, lazily evaluated sequence.
let findEqualFiles path minSize =
    let sameSizeGroups = findFilesOfSameSize path minSize
    sameSizeGroups
    |> Seq.collect (fun group ->
        group |> Seq.toList |> splitFilesOfEqualLengthByContentStreaming)
// Prints every group of equal files: a blank separator, the length taken from
// the first file of the group, a ruler line, and then one path per line.
let prettyPrintStream (input: FileWithLength list seq) =
    let printGroup (equalFiles: FileWithLength list) =
        let sharedLength = (Seq.head equalFiles).Length
        printfn "\r\n"
        printfn "Fillength: %A" sharedLength
        printfn "******************"
        equalFiles |> List.iter (fun file -> printfn "%A" file.Path)
    input |> Seq.iter printGroup
// Program entry point.
//
// Expects exactly two arguments: the minimum file size in bytes and the
// directory to scan. Prints every group of identical files found below that
// directory, then "DONE".
//
// Returns 0 on success. Any failure (bad arguments, missing directory, an
// error during the scan) is printed and reported with exit code 1.
[<EntryPoint>]
let main (argv: string[]) =
    try
        if argv.Length <> 2 then failwith "First argument is min filesize in bytes and second is path"
        let couldParse, minBytes = Int64.TryParse(argv.[0])
        if not couldParse then failwith "First argument is min filesize to scan"
        let dir = new IO.DirectoryInfo(argv.[1])
        if not dir.Exists then failwith (sprintf "Directory %s does not exist" dir.FullName)
        prettyPrintStream (findEqualFiles dir.FullName minBytes)
        Console.WriteLine("DONE")
        // Console.ReadKey throws when standard input is redirected (piped or
        // scripted runs); only wait for a key press on an interactive console
        // so that a successful scan is not reported as a failure.
        if not Console.IsInputRedirected then
            Console.ReadKey() |> ignore
        0 // return an integer exit code
    with ex ->
        printfn "%A" ex
        1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment