Skip to content

Instantly share code, notes, and snippets.

@spraints
Last active September 27, 2024 13:18
Show Gist options
  • Save spraints/f8694f22ff3d40c73f45b9728714f508 to your computer and use it in GitHub Desktop.
Save spraints/f8694f22ff3d40c73f45b9728714f508 to your computer and use it in GitHub Desktop.
Find which parts of your Git repository are growing the fastest
#/ Usage: ruby recent-growth.rb [OPTIONS]
#/ --git-dir <git_dir>
#/ --since <date>
#/ --until <date>
#/ --max-count <count>
#/ --per tree|extension
#/ --list-all (in the summary, show all trees / extensions, not just the top 10)
#/ --all | --branches | --tags | --remotes | <commit>
#/
#/ Example:
#/ $ git clone https://gist.github.com/f8694f22ff3d40c73f45b9728714f508 repo-growth
#/ $ git clone --mirror https://github.com/me/my-repo
#/ $ cd my-repo.git
#/ $ ruby ../repo-growth/recent-growth.rb --since 1.week.ago --branches --tags
# True when the DEBUG environment variable is set to a non-empty value.
DEBUG = !ENV.fetch("DEBUG", "").empty?
# Feeds every object produced by a three-stage git pipeline into +mode+, then
# asks +mode+ to print its summary.
#
# The stages, each started with IO.popen:
#   1. `git rev-list --boundary <filters> <commits>`
#   2. `git rev-list --stdin --objects --object-names`
#   3. `git cat-file --batch-check=<objectname objecttype objectsize:disk>`
# This process sits between the stages: it reads one line at a time from
# whichever stage has output ready and forwards it to the next stage.
#
# mode     - collector; receives add(name:, oid:, type:, size:) per object and
#            finish(list_all:) once at the end.
# git_dir  - when non-nil, passed to every git command as `-C <git_dir>`.
# filters  - extra arguments spliced into the first rev-list command.
# commits  - revision arguments spliced into the first rev-list command.
# list_all - forwarded unchanged to mode.finish.
def main(mode:, git_dir:, filters:, commits:, list_all: false)
git_dir_args = git_dir ? ["-C", git_dir] : []
boundary_cmd = [
"git", *git_dir_args, "rev-list",
"--boundary",
*filters,
*commits,
]
rev_list_cmd = [
"git", *git_dir_args, "rev-list",
"--stdin",
"--objects", "--object-names",
]
cat_file_cmd = [
"git", *git_dir_args, "cat-file",
"--batch-check=%(objectname) %(objecttype) %(objectsize:disk)",
]
# With DEBUG set, show the pipeline as a shell-style command line.
if DEBUG
puts "$ #{boundary_cmd.join(" ")} |"
puts " #{rev_list_cmd.join(" ")} |"
puts " #{cat_file_cmd.join(" ")}"
end
IO.popen(boundary_cmd) do |boundary|
# Pipe whose read end becomes the second rev-list's stdin; we write to
# rev_list_in.
r, rev_list_in = IO.pipe
IO.popen(rev_list_cmd, in: r) do |rev_list|
# Same arrangement for cat-file's stdin (the local `r` is reused).
r, cat_file_in = IO.pipe
IO.popen(cat_file_cmd, in: r) do |cat_file|
# oid => object name as reported by rev-list, consumed when cat-file
# reports the same oid.
names = {}
# Streams still being read; each is removed once it reaches EOF.
rs = [
boundary,
rev_list,
cat_file,
]
until rs.empty?
# Block until at least one stream is readable, then handle one line from
# each ready stream.
ready, = IO.select(rs)
ready.each do |f|
case f
when boundary
begin
# rev-list --boundary outputs all the commits it finds as
# "<oid>" and the boundaries as "-<oid>". We want to feed this
# back into the next rev-list as "<oid>" and "^<oid>" so that
# we can see which objects come from just the new commits.
oid = boundary.readline
rev_list_in.print(oid.tr("-", "^"))
rescue EOFError
# No more commits: stop watching this stream and signal EOF to the
# second rev-list by closing its stdin.
rs.delete(boundary)
rev_list_in.close
end
when rev_list
begin
# capture the object name, if any, so we can use it later, and
# send the oid along to cat-file.
oid, name = rev_list.readline.strip.split(" ", 2)
names[oid] = name if name
cat_file_in.puts oid
rescue EOFError
# No more objects: stop watching and close cat-file's stdin.
rs.delete(rev_list)
cat_file_in.close
end
when cat_file
begin
# process the output.
# Each line is "<oid> <type> <size>"; the size is converted with to_i
# and the remembered name (nil if none was recorded) is removed from
# `names` as it is handed over.
oid, type, size = cat_file.readline.strip.split(" ", 3)
mode.add \
name: names.delete(oid),
oid: oid,
type: type,
size: size.to_i
rescue EOFError
rs.delete(cat_file)
end
end
end
end
end
end
end
# show the stats
mode.finish \
list_all: list_all
end
# Rough shell sketch of the same idea, total size only:
# git rev-list --objects --no-object-names --since-as-filter=1.week.ago | git cat-file --batch-check='%(objectsize:disk)' | perl -lne '$sum += $_ for /-?\d+/g; END { print $sum }'
# The collector classes below break the growth down further: per top-level tree, per file extension, and per object type.
# Collector that keeps a single running sum of all object sizes.
class Total
  def initialize
    @total_size = 0
  end

  # Adds +size+ to the running sum; every other keyword is ignored.
  # Returns the updated sum.
  def add(size:, **)
    @total_size = @total_size + size
  end

  # Prints the accumulated sum. +list_all+ is accepted for interface
  # compatibility with the other collectors and has no effect here.
  def finish(list_all:)
    $stdout.puts "Total growth:"
    $stdout.printf(" %15d bytes\n", @total_size)
  end
end
# Collector that counts how many objects of each type were seen.
class CountPerType
  def initialize
    @counts = Hash.new(0)
  end

  # Bumps the counter for +type+; every other keyword is ignored.
  # Returns the updated count for that type.
  def add(type:, **)
    @counts[type] = @counts[type].succ
  end

  # Prints one line per object type, ordered by type. +list_all+ is accepted
  # for interface compatibility and has no effect here.
  def finish(list_all:)
    $stdout.puts "New objects:"
    @counts.sort_by(&:first).each do |kind, seen|
      $stdout.printf(" %15d %s\n", seen, kind)
    end
  end
end
# Collector that sums object sizes per first path component of the object
# name. Objects without a name are summed separately and reported as "(other)".
class PerTree
  def initialize
    @toplevel = 0
    @subdirs = Hash.new(0)
  end

  # Adds +size+ to the bucket for the part of +name+ before the first "/",
  # or to the unnamed bucket when +name+ is nil. Other keywords are ignored.
  def add(size:, name:, **)
    return (@toplevel += size) if name.nil?

    bucket = name.split("/", 2).first
    @subdirs[bucket] += size
  end

  # Prints the buckets from largest to smallest; only the first ten unless
  # +list_all+ is truthy.
  def finish(list_all:)
    @subdirs[:"(other)"] = @toplevel
    puts "Growth per tree:"
    ranked = @subdirs.sort_by { |_, size| -size }
    rows = list_all ? ranked : ranked.take(10)
    rows.each { |subdir, size| printf " %15d bytes - %s\n", size, subdir }
  end
end
# Collector that sums blob sizes per file extension. Names without an
# extension are grouped under "(none)".
class PerExtension
  def initialize
    @extensions = Hash.new(0)
  end

  # Adds +size+ to the bucket for the extension of +name+. Anything that is
  # not a blob, or has no name, is skipped and nil is returned.
  def add(size:, name:, type:, **)
    return unless type == "blob"
    return if name.nil?

    suffix = File.extname(name)
    bucket = suffix == "" ? :"(none)" : suffix
    @extensions[bucket] += size
  end

  # Prints the buckets from largest to smallest; only the first ten unless
  # +list_all+ is truthy.
  def finish(list_all:)
    puts "Growth per extension:"
    ranked = @extensions.sort_by { |_, size| -size }
    rows = list_all ? ranked : ranked.take(10)
    rows.each { |ext, size| printf " %15d bytes - %s\n", size, ext }
  end
end
# Collector that sums object sizes per object type.
class PerType
  def initialize
    @types = Hash.new(0)
  end

  # Adds +size+ to the bucket for +type+; other keywords are ignored.
  # Returns the updated sum for that type.
  def add(size:, type:, **)
    @types[type] = @types[type] + size
  end

  # Prints one line per type, ordered by type; only the first ten unless
  # +list_all+ is truthy.
  def finish(list_all:)
    puts "Growth per object type:"
    ordered = @types.sort_by(&:first)
    rows = list_all ? ordered : ordered.take(10)
    rows.each { |kind, bytes| printf " %15d bytes - %s\n", bytes, kind }
  end
end
# Collector that fans every call out to a list of other collectors, in the
# order they were given.
class Multi
  def initialize(*collectors)
    @collectors = collectors
  end

  # Passes the same keyword arguments to each wrapped collector's add.
  def add(**opts)
    @collectors.each { |collector| collector.add(**opts) }
  end

  # Asks each wrapped collector to print its summary.
  def finish(list_all:)
    @collectors.each { |collector| collector.finish(list_all: list_all) }
  end
end
# Prints the usage text embedded in this script and exits with status 1.
#
# The usage text is every line of the running script ($0) that starts with
# "#/", with its first three characters (normally "#/ ") removed. The file is
# read directly in Ruby rather than through a shell pipeline, so a script path
# containing spaces or shell metacharacters is handled safely and no external
# cat/grep/cut is needed.
#
# Never returns: always terminates via `exit 1`, even if the script file
# cannot be read (in which case the read error is reported on stderr).
def usage_and_exit
  begin
    File.foreach($0) do |line|
      next unless line.start_with?("#/")

      # A bare "#/" line is shorter than three characters; to_s turns the
      # resulting nil into an empty output line.
      puts line.chomp[3..-1].to_s
    end
  rescue SystemCallError => e
    warn "cannot read usage text from #{$0}: #{e.message}"
  end
  exit 1
end
# Command-line entry point: parse ARGV, then run main.

# Default report: total growth, object counts per type, growth per object
# type, and growth per file extension. `--per` replaces it with one collector.
mode = Multi.new(
  Total.new,
  CountPerType.new,
  PerType.new,
  PerExtension.new
)
git_dir = nil
filters = []
commits = []
list_all = false

# Takes the value that must follow an option. When the command line ends
# before the value, show the usage and exit instead of pushing nil into the
# git argument list (IO.popen would raise TypeError) or silently ignoring
# the option.
option_value = -> { ARGV.shift || usage_and_exit }

until ARGV.empty?
  case arg = ARGV.shift
  when /^--since/, /^--max-count/, /^--until/
    # Accepts both "--since=<date>" and "--since <date>".
    filters << arg
    filters << option_value.call unless arg.include?("=")
  when "-n"
    filters << arg
    filters << option_value.call
  when "--all", "--branches", "--tags", "--remotes"
    commits << arg
  when "--git-dir"
    git_dir = option_value.call
  when "--per"
    # Any value starting with "ext" selects the per-extension report.
    case ARGV.shift
    when "tree"
      mode = PerTree.new
    when /^ext/
      mode = PerExtension.new
    else
      usage_and_exit
    end
  when "--list-all"
    list_all = true
  when /^-/
    # Unknown option.
    usage_and_exit
  else
    commits << arg
  end
end

# With no explicit revisions, look at every ref.
commits << "--all" if commits.empty?

main(
  mode: mode,
  git_dir: git_dir,
  filters: filters,
  commits: commits,
  list_all: list_all
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment