Last active
August 29, 2015 14:16
-
-
Save mrange/d24e345ddd71f44e1db6 to your computer and use it in GitHub Desktop.
F# top 100 biggest binaries in a GIT repo and print the Commit they were introduced
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ---------------------------------------------------------------------------------------------- | |
// Copyright (c) Mårten Rånge. | |
// ---------------------------------------------------------------------------------------------- | |
// This source code is subject to terms and conditions of the Microsoft Public License. A | |
// copy of the license can be found in the License.html file at the root of this distribution. | |
// If you cannot locate the Microsoft Public License, please send an email to | |
// dlr@microsoft.com. By using this source code in any fashion, you are agreeing to be bound
// by the terms of the Microsoft Public License. | |
// ---------------------------------------------------------------------------------------------- | |
// You must not remove this notice, or any other, from this software. | |
// ---------------------------------------------------------------------------------------------- | |
open LibGit2Sharp | |
open System.Text | |
open System.Linq | |
module FsGit =

    /// Typed wrapper around the concrete LibGit2Sharp object kinds that can be
    /// stored in a repository's object database.
    type GitObject =
        | GitBlob   of Blob
        | GitTree   of Tree
        | GitCommit of Commit
        | GitTag    of TagAnnotation

        /// Short diagnostic description: the case name, the object's SHA and one
        /// kind-specific detail (size, entry count, short message or tag name).
        override x.ToString () =
            match x with
            | GitBlob   blob   -> sprintf "GitBlob (%A, %A)" blob.Sha blob.Size
            | GitTree   tree   -> sprintf "GitTree (%A, %A)" tree.Sha tree.Count
            | GitCommit commit -> sprintf "GitCommit (%A, %A)" commit.Sha commit.MessageShort
            | GitTag    tag    -> sprintf "GitTag (%A, %A)" tag.Sha tag.Name

    /// Lazily enumerates every object in `odb`, classified as a `GitObject`.
    /// Annotated tags are mapped to `GitTag` so that repositories containing
    /// them no longer abort the enumeration.
    /// Fails (via `failwithf`) only for an object type that is none of
    /// blob, tree, commit or annotated tag.
    let getObjects (odb : ObjectDatabase) : seq<GitObject> =
        seq {
            for gobj in odb do
                yield
                    match gobj with
                    | :? Blob          as blob   -> GitBlob blob
                    | :? Tree          as tree   -> GitTree tree
                    | :? Commit        as commit -> GitCommit commit
                    | :? TagAnnotation as tag    -> GitTag tag
                    | _ -> failwithf "Unrecognized obj: %A, %A" gobj.Sha <| gobj.GetType().Name
        }

    // Choosers intended for Seq.choose: keep one case, drop everything else.
    let chooseBlob   = function GitBlob   x -> Some x | _ -> None
    let chooseTree   = function GitTree   x -> Some x | _ -> None
    let chooseCommit = function GitCommit x -> Some x | _ -> None
    let chooseTag    = function GitTag    x -> Some x | _ -> None
// TODO: Rewrite to idiomatic F#
/// Assigns each commit a sequence number that approximates history order.
///
/// `commits` is an array of (parents, commit) pairs. Root commits (no parents)
/// are numbered first; after that a commit is numbered as soon as at least one
/// of its parents has been numbered. The number is the size of the result
/// dictionary at the moment the commit is inserted.
///
/// Returns a dictionary keyed by commit SHA holding (commit, sequence number).
/// Every input commit ends up in the dictionary, even when some of its parents
/// are not part of `commits`.
let enumerateCommits (commits : _ []) =
    let result = System.Collections.Generic.Dictionary<string, Commit*int> ()
    let mutable next = ResizeArray<_> (commits.Length)

    // First pass: root commits seed the numbering, everything else is deferred.
    for (parents : Commit [], commit : Commit) as v in commits do
        if parents.Length = 0 then
            result.Add (commit.Sha, (commit, result.Count))
        else
            next.Add v

    while next.Count > 0 do
        let current = next
        next <- ResizeArray<_> (current.Count)
        for (parents : Commit [], commit : Commit) as v in current do
            if parents |> Array.exists (fun parent -> result.ContainsKey parent.Sha) then
                result.Add (commit.Sha, (commit, result.Count))
            else
                next.Add v

        // A pass that numbered nothing would repeat forever: none of the pending
        // commits has a numbered parent (their parents are missing from the
        // input). Break the stall by treating pending commits whose parents are
        // all outside the pending set as additional roots.
        if next.Count = current.Count then
            let pending = System.Collections.Generic.HashSet<string> ()
            for (_, commit : Commit) in next do
                ignore (pending.Add commit.Sha)

            let stalled = next
            next <- ResizeArray<_> (stalled.Count)
            for (parents : Commit [], commit : Commit) as v in stalled do
                if parents |> Array.exists (fun parent -> pending.Contains parent.Sha) then
                    next.Add v
                else
                    result.Add (commit.Sha, (commit, result.Count))

            // Last-resort guard (should be unreachable for an acyclic history):
            // number whatever is left in its current order so the loop terminates.
            if next.Count = stalled.Count then
                for (_, commit : Commit) in next do
                    result.Add (commit.Sha, (commit, result.Count))
                next.Clear ()

    result
/// Finds the (at most) `n` largest binary blobs in the repository at `path` and,
/// for each of them, the commits and paths through which the blob is reachable.
///
/// Returns an array of (blob, references) where `references` is an array of
/// (sequence number, commit SHA, short commit message, path) sorted by sequence
/// number. A reference whose owning commit could not be determined is reported
/// as (-1, "", "", path) and therefore sorts first.
///
/// If the repository has fewer than `n` binary blobs, all of them are returned.
let findLargestBlobs path n =
    printfn "Opening repo: %A" path
    use repo = new Repository(path)

    let odb = repo.ObjectDatabase

    printfn "Reading object db..."
    let allObjects =
        odb
        |> FsGit.getObjects
        |> Seq.toArray

    printfn "Finding the %d largest blobs" n
    let biggestBlobs =
        allObjects
        |> Seq.choose FsGit.chooseBlob
        |> Seq.filter (fun blob -> blob.IsBinary)
        |> Seq.sortBy (fun x -> -x.Size)
        // truncate (not take): take throws when fewer than n blobs are available
        |> Seq.truncate n
        |> Seq.toArray

    printfn "Enumerate commits"
    let allCommits =
        allObjects
        |> Seq.choose FsGit.chooseCommit
        |> Seq.map (fun c -> (c.Parents |> Seq.toArray), c)
        |> Seq.toArray

    let commitSequenceDictionary = enumerateCommits allCommits

    printfn "Building tree reference map..."
    let toLookup (s : seq<'K*'V>) = s.ToLookup(fst, snd)

    // Target SHA -> every (containing tree, tree entry) that points at it
    let treeEntryReferenceLookup =
        allObjects
        |> Seq.choose FsGit.chooseTree
        |> Seq.map (fun t -> t |> Seq.map (fun te -> te.Target.Sha,(t, te)))
        |> Seq.concat
        |> toLookup

    printfn "Building commit reference map..."
    // Root tree SHA -> every commit whose tree it is
    let commitReferenceLookup =
        allObjects
        |> Seq.choose FsGit.chooseCommit
        |> Seq.map (fun c -> c.Tree.Sha, c)
        |> toLookup

    // lookupPath is a bit tricky because it needs to handle the fact
    // that a tree/blob may be referenced from multiple trees/commits
    let lookupPath sha =
        // Walks upwards from `sha`. Each result is the owning commit (if one was
        // found) and the path segments collected so far, leaf first.
        let rec findReferences sha : seq<Commit option*string list> =
            let treeEntries =
                treeEntryReferenceLookup.[sha]
                |> Seq.cache
            if treeEntries |> Seq.isEmpty then
                // Not contained in any tree: it can only be a commit's root tree
                commitReferenceLookup.[sha]
                |> Seq.map (fun commit -> Some commit, [])
                |> Seq.cache
            else
                let f (t : Tree, te : TreeEntry) =
                    let r = findReferences t.Sha
                    if r |> Seq.isEmpty then
                        // The containing tree is not reachable from any commit
                        Seq.singleton (None, [te.Name])
                    else
                        r
                        |> Seq.map (fun (commit, path) -> commit, te.Name::path)
                        |> Seq.cache
                treeEntries
                |> Seq.map f
                |> Seq.concat
                |> Seq.cache

        // Segments arrive leaf first, so recurse to the end of the list before
        // appending; this produces "root/dir/leaf".
        let toPath p : string =
            let sb = StringBuilder ()
            let rec tp p =
                match (p : string list) with
                | []    -> ()
                | [x]   -> ignore <| sb.Append x
                | x::xs ->
                    tp xs
                    ignore <| sb.Append ('/')
                    ignore <| sb.Append (x)
            tp p
            sb.ToString ()

        findReferences sha
        |> Seq.map (fun (commit, p) ->
            match commit with
            | Some c ->
                let _, sequence = commitSequenceDictionary.[c.Sha]
                sequence, c.Sha, c.MessageShort, toPath p
            | _ -> -1, "", "", toPath p
            )
        |> Seq.distinct
        |> Seq.sortBy (fun (sequence,_,_,_) -> sequence)
        |> Seq.toArray

    printfn "Producing final result"
    biggestBlobs
    |> Seq.map (fun blob -> blob, lookupPath blob.Sha)
    |> Seq.toArray
/// Entry point.
///
/// Usage: <program> [repository-path] [blob-count]
///   repository-path  defaults to C:\temp\GitHub\libgit2sharp
///   blob-count       defaults to 100; must be a positive integer
///
/// Prints every reported blob with its size and SHA and, when at least one
/// reference was found, the first (lowest sequence number) commit and path.
/// Returns 0 on success, 1 when blob-count is not a positive integer.
[<EntryPoint>]
let main (argv : string []) =
    let defaultPath  = @"C:\temp\GitHub\libgit2sharp"
    let defaultCount = 100

    let path = if argv.Length > 0 then argv.[0] else defaultPath

    let count =
        if argv.Length > 1 then
            match System.Int32.TryParse (argv.[1]) with
            | true, v when v > 0 -> Some v
            | _                  -> None
        else
            Some defaultCount

    match count with
    | None ->
        eprintfn "Invalid blob count: %s (expected a positive integer)" argv.[1]
        1
    | Some n ->
        let result = findLargestBlobs path n
        for blob, references in result do
            printfn "Blob: Size: %d - SHA: %s" blob.Size blob.Sha
            if references.Length > 0 then
                let sequence, commit, message, blobPath = references.[0]
                printfn "  Commit: %s (%d)\n  Message: %s\n  Path: %s" commit sequence message blobPath
        0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment