Skip to content

Instantly share code, notes, and snippets.

@mrange
Last active August 29, 2015 14:16
Show Gist options
  • Save mrange/d24e345ddd71f44e1db6 to your computer and use it in GitHub Desktop.
Save mrange/d24e345ddd71f44e1db6 to your computer and use it in GitHub Desktop.
F#: find the 100 biggest binary blobs in a Git repository and print the commit in which each was introduced
// ----------------------------------------------------------------------------------------------
// Copyright (c) Mårten Rånge.
// ----------------------------------------------------------------------------------------------
// This source code is subject to terms and conditions of the Microsoft Public License. A
// copy of the license can be found in the License.html file at the root of this distribution.
// If you cannot locate the Microsoft Public License, please send an email to
// [email protected]. By using this source code in any fashion, you are agreeing to be bound
// by the terms of the Microsoft Public License.
// ----------------------------------------------------------------------------------------------
// You must not remove this notice, or any other, from this software.
// ----------------------------------------------------------------------------------------------
open LibGit2Sharp
open System.Text
open System.Linq
module FsGit =

    // The kinds of object-database entries this tool works with, each case
    // wrapping the corresponding LibGit2Sharp object.
    type GitObject =
        | GitBlob   of Blob
        | GitTree   of Tree
        | GitCommit of Commit

        // Debug representation: case name, the object's SHA and one
        // distinguishing property (size, entry count or short message).
        override x.ToString () =
            match x with
            | GitBlob   blob    -> sprintf "GitBlob (%A, %A)" blob.Sha blob.Size
            | GitTree   tree    -> sprintf "GitTree (%A, %A)" tree.Sha tree.Count
            | GitCommit commit  -> sprintf "GitCommit (%A, %A)" commit.Sha commit.MessageShort

    // Lazily enumerates the object database, wrapping every blob, tree and
    // commit in the matching GitObject case.
    //
    // Annotated tag objects are skipped: they are ordinary entries of an object
    // database but carry nothing this tool needs, and failing on them would
    // abort the scan of any repository that has annotated tags.
    // Any other, genuinely unknown object type still raises (via failwithf) so
    // that it is not silently ignored.
    let getObjects (odb : ObjectDatabase) : seq<GitObject> =
        seq {
            for gobj in odb do
                match gobj with
                | :? Blob   as blob     -> yield GitBlob blob
                | :? Tree   as tree     -> yield GitTree tree
                | :? Commit as commit   -> yield GitCommit commit
                | :? TagAnnotation      -> ()
                | _ -> failwithf "Unrecognized obj: %A, %A" gobj.Sha <| gobj.GetType().Name
        }

    // Choosers for Seq.choose: each returns the wrapped object for its own case
    // and None for the other cases.
    let chooseBlob      = function GitBlob   x -> Some x | _ -> None
    let chooseTree      = function GitTree   x -> Some x | _ -> None
    let chooseCommit    = function GitCommit x -> Some x | _ -> None
// Assigns each commit a sequence number (0, 1, 2, ...) and returns a dictionary
// from commit SHA to (commit, sequence number).
//
// Input: an array of (parents, commit) pairs.
//
// Ordering: root commits (no parents) are numbered first, in input order. The
// remaining commits are processed in repeated passes; a commit is numbered only
// once ALL of its parents have a number, so a parent always has a lower number
// than its children. Numbering a merge as soon as just one parent is known
// would allow the merge to sort before commits on its other branch, which
// breaks "lowest number = commit that introduced the object".
//
// Robustness:
//  * If a pass numbers nothing (some parent is not part of the input), the
//    leftover commits are numbered in their current order instead of looping
//    forever.
//  * A SHA that occurs more than once keeps its first number instead of raising
//    on the duplicate dictionary key.
let enumerateCommits (commits : _ []) =
    let result = System.Collections.Generic.Dictionary<string, Commit*int> ()

    let isNumbered (commit : Commit) = result.ContainsKey commit.Sha

    // The next sequence number is simply the number of commits numbered so far.
    let number (commit : Commit) =
        if not (isNumbered commit) then
            result.Add (commit.Sha, (commit, result.Count))

    let mutable pending = ResizeArray<Commit [] * Commit> ()

    for (parents : Commit [], commit : Commit) as entry in commits do
        if parents.Length = 0 then
            number commit
        else
            pending.Add entry

    while pending.Count > 0 do
        let current = pending
        pending <- ResizeArray<_> (current.Count)

        // Commits numbered earlier in this same pass already count as numbered
        // parents for the commits that follow them.
        for (parents, commit) as entry in current do
            if parents |> Array.forall isNumbered then
                number commit
            else
                pending.Add entry

        // No progress in this pass: the remaining commits can never be
        // resolved, so number them as they are and stop.
        if pending.Count = current.Count then
            for (_, commit) in pending do
                number commit
            pending <- ResizeArray<_> ()

    result
// Opens the repository at `path`, finds (at most) the `n` largest binary blobs
// in its object database and, for each of them, every place it is referenced.
//
// Returns an array of (blob, references). `references` is an array of
// (sequence number, commit SHA, short commit message, path) tuples, de-duplicated
// and sorted ascending by sequence number (as assigned by enumerateCommits), so
// the first element is the earliest-numbered commit that contains the blob.
// A reference chain that does not end in a commit is reported with sequence
// number -1 and empty SHA/message.
//
// Progress messages are printed to stdout.
let findLargestBlobs path n =
    printfn "Opening repo: %A" path

    use repo = new Repository(path)

    let odb = repo.ObjectDatabase

    printfn "Reading object db..."

    // Materialized once because it is traversed several times below.
    let allObjects =
        odb
        |> FsGit.getObjects
        |> Seq.toArray

    printfn "Finding the %d largest blobs" n

    let biggestBlobs =
        allObjects
        |> Seq.choose FsGit.chooseBlob
        |> Seq.filter (fun blob -> blob.IsBinary)
        |> Seq.sortBy (fun x -> -x.Size)    // negated size => largest first
        // Seq.truncate rather than Seq.take: Seq.take raises when the
        // repository holds fewer than n binary blobs, Seq.truncate just
        // returns the ones that exist.
        |> Seq.truncate n
        |> Seq.toArray

    printfn "Enumerate commits"

    let allCommits =
        allObjects
        |> Seq.choose FsGit.chooseCommit
        |> Seq.map (fun c -> (c.Parents |> Seq.toArray), c)
        |> Seq.toArray

    let commitSequenceDictionary = enumerateCommits allCommits

    printfn "Building tree reference map..."

    let toLookup (s : seq<'K*'V>) = s.ToLookup(fst, snd)

    // SHA of a tree entry's target -> every (containing tree, entry) that points at it.
    let treeEntryReferenceLookup =
        allObjects
        |> Seq.choose FsGit.chooseTree
        |> Seq.collect (fun t -> t |> Seq.map (fun te -> te.Target.Sha,(t, te)))
        |> toLookup

    printfn "Building commit reference map..."

    // SHA of a root tree -> every commit whose tree it is.
    let commitReferenceLookup =
        allObjects
        |> Seq.choose FsGit.chooseCommit
        |> Seq.map (fun c -> c.Tree.Sha, c)
        |> toLookup

    // lookupPath is a bit tricky because it needs to handle
    //  that a tree/blob may be referenced from multiple trees/commits
    let lookupPath sha =
        // Walks upwards from `sha` through every tree that contains it until a
        // commit's root tree is reached. Each result is (commit, path segments)
        // with the segments ordered leaf first.
        // NOTE(review): commits are only looked up when no tree references the
        //  object, so a tree that is both a commit's root tree and a subtree of
        //  another tree is reported through its subtree references only.
        let rec findReferences sha : seq<Commit option*string list>=
            let treeEntries =
                treeEntryReferenceLookup.[sha]
                |> Seq.cache
            if treeEntries |> Seq.isEmpty then
                // Not contained in any tree: either a commit's root tree or unreferenced.
                commitReferenceLookup.[sha]
                |> Seq.map (fun commit -> Some commit, [])
                |> Seq.cache
            else
                let f (t : Tree, te : TreeEntry) =
                    let r = findReferences t.Sha
                    if r |> Seq.isEmpty then
                        // The containing tree is not reachable from any commit.
                        Seq.singleton (None, [te.Name])
                    else
                        r
                        |> Seq.map (fun (commit, path) -> commit, te.Name::path)
                        |> Seq.cache

                treeEntries
                |> Seq.collect f
                |> Seq.cache

        // Joins leaf-first segments into a root-first, '/'-separated path:
        //  ["file"; "dir"] becomes "dir/file".
        let toPath p : string =
            let sb = StringBuilder ()
            let rec tp p =
                match (p : string list) with
                | []    -> ()
                | [x]   -> ignore <| sb.Append x
                | x::xs ->
                    tp xs
                    ignore <| sb.Append ('/')
                    ignore <| sb.Append (x)
            tp p
            sb.ToString ()

        findReferences sha
        |> Seq.map (fun (commit, p) ->
            match commit with
            | Some c ->
                let _, seqNo = commitSequenceDictionary.[c.Sha]
                seqNo , c.Sha, c.MessageShort, toPath p
            | _ -> -1, "", "", toPath p
            )
        |> Seq.distinct
        |> Seq.sortBy (fun (seqNo,_,_,_) -> seqNo)
        |> Seq.toArray

    printfn "Producing final result"

    biggestBlobs
    |> Seq.map (fun blob -> blob, lookupPath blob.Sha)
    |> Seq.toArray
// Entry point: prints the 100 largest binary blobs of a repository together
// with the first (lowest sequence number) reference found for each.
//
// The repository path is taken from the first command-line argument when one is
// supplied; without arguments the original hard-coded default path is used.
// Always returns exit code 0.
[<EntryPoint>]
let main argv =
    let path =
        match argv with
        | [||]  -> @"C:\temp\GitHub\libgit2sharp"
        | _     -> argv.[0]

    let result = findLargestBlobs path 100

    for blob, references in result do
        printfn "Blob: Size: %d - SHA: %s" blob.Size blob.Sha
        // Blobs without any reference only get the size/SHA line.
        if references.Length > 0 then
            let seqNo, commit, message, path = references.[0]
            printfn " Commit: %s (%d)\n Message: %s\n Path: %s" commit seqNo message path

    0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment