Skip to content

Instantly share code, notes, and snippets.

@s1037989
Last active November 12, 2025 10:46
Show Gist options
  • Select an option

  • Save s1037989/de71da25eed16f133ca6ec3fbd15b37c to your computer and use it in GitHub Desktop.

Select an option

Save s1037989/de71da25eed16f133ca6ec3fbd15b37c to your computer and use it in GitHub Desktop.
repository software inventory generator
#!/usr/bin/env perl
use Mojo::Base -strict, -signatures;
use Mojo::File qw(path);
use Mojo::Collection qw(c);
use Mojo::ByteStream qw(b);
use Mojo::Util qw(getopt trim decamelize);
use constant DEFAULT_THRESHOLD => 0.60;
# ---- CLI -------------------------------------------------------------
my ($root, $min_depth, $max_depth, $threshold, $show_rest, $only_archy, $quiet);
$threshold = DEFAULT_THRESHOLD;
$min_depth = 0;
$max_depth = 99;
getopt \@ARGV,
'root|r=s' => \$root,
'min-depth|m=i' => \$min_depth,
'max-depth|M=i' => \$max_depth,
'threshold|t=f' => \$threshold,
'show-rest!' => \$show_rest, # --show-rest to also print the non-matching siblings
'only-archives!' => \$only_archy, # --only-archives to limit candidates to (likely) software-ish names
'quiet|q!' => \$quiet;
die "Usage: $0 --root /path/to/repo [--min-depth N] [--max-depth N] [--threshold 0.60] [--show-rest] [--only-archives]\n"
unless $root && -d $root;
# ---- Helpers ---------------------------------------------------------
# Decide if a filename/dir looks like an archive or extracted archive label
sub looks_softwareish ($name) {
# common archive extensions or extracted dir traits
return 1 if $name =~ /\.(?:tar(?:\.(?:gz|bz2|xz|zst))?|t[gbx]z|zip|7z|rar|deb|rpm|whl|jar|war|msi|exe)$/i;
return 1 if $name =~ /\b(?:dist|release|bin|lib|include|src|setup|pkg|install|app|plugin|module)s?\b/i;
# name-version like patterns (foo-1.2.3, Foo_2024.10, foo-v1, foo-1.2.3-rc1)
return 1 if $name =~ /[-_\.]v?\d+(?:\.\d+){0,3}(?:[-_\.]?(?:rc|beta|alpha|pre|b|a)\d*)?\b/i;
return 0;
}
# Normalize a sibling name for “similarity” grouping:
# - lowercase
# - strip multi-extensions (.tar.gz, .tar.bz2, .zip, etc.)
# - drop typical arch tags (x86_64, amd64, arm64, win64, linux, macos)
# - trim probable version suffixes and build metadata
# - collapse separators/delim noise
sub canonical_label ($name) {
my $base = $name;
# remove container/container+comp extensions
$base =~ s/\.(?:tar\.(?:gz|bz2|xz|zst)|t(?:gz|bz|xz)|zip|7z|rar|deb|rpm|whl|jar|war|msi|exe)\z//i;
# single ext (fallback)
$base =~ s/\.(?:gz|bz2|xz|zst)\z//i;
my $lc = lc $base;
# arch/platform tokens
$lc =~ s/(?:[-_.])?(?:x86_64|amd64|arm64|aarch64|x64|x86|i386|i686|ppc64le|s390x|win(?:32|64)?|linux|macos|darwin|osx|ubuntu|rhel|centos|alpine|debian|fedora|el\d+)\b//g;
# common distro/pkg noise
$lc =~ s/\b(?:release|bin|lib|src|static|shared|debug|symbols|portable|standalone|minimal|full|enterprise|community)\b//g;
# version-ish tails (keep core name): -1.2.3, _v2, .20241010, -rc1, -beta2, +build.45
$lc =~ s/(?:[-_.])?(?:v?\d+(?:\.\d+){0,4})(?:[-_.]?(?:rc|beta|alpha|pre|b|a)\d*)?(?:\+build[-_.]?\w+)?\b//g;
# dates like 20241010 or 2024-10-10
$lc =~ s/(?:[-_.])?(?:20\d{2}[-_.]?\d{2}[-_.]?\d{2})\b//g;
# collapse separators and whitespace
$lc =~ s/[^a-z0-9]+/ /g;
$lc = trim $lc;
# if it collapses to nothing, fall back to decamelized original base name
$lc = trim decamelize($base) || $lc;
$lc =~ s/[^a-z0-9]+/ /g;
$lc = trim $lc;
# final guard
return $lc || '∅';
}
# Return immediate children entries of a directory as Mojo::Collection of Mojo::File
sub children_of ($dir) {
return c(sort { $a->to_string cmp $b->to_string } grep { $_->basename ne '.' && $_->basename ne '..' } $dir->list->each);
}
# Derive a shallow "depth" relative to root
sub depth_of ($root_path, $abs_dir) {
my $rel = path($abs_dir)->to_abs->to_string;
my $base = path($root_path)->to_abs->to_string;
$rel =~ s/^\Q$base\E\/?//;
return 0 if $rel eq '' || $rel eq '.';
return scalar grep { length } split m{/+}, $rel;
}
# Nice printing helpers
sub say_header ($dir, $label, $ratio, $count, $total) {
my $pct = sprintf '%.0f%%', $ratio * 100;
say b("==> $dir")->cyan->bold, " [$pct similar: '$label' $count/$total]";
}
sub say_item ($entry, $mark='*') {
my $t = $entry->basename;
say " $mark $t";
}
# ---- Core scan -------------------------------------------------------
my $ROOT = path($root)->to_abs;
# Traverse directories up to max_depth; Mojo::File::list_tree supports max_depth.
$ROOT->list_tree({max_depth => $max_depth})
->grep(sub ($p) { -d $p }) # only directories
->each(sub ($dir) {
my $d = depth_of($ROOT, $dir);
return if $d < $min_depth; # not deep enough yet
# inspect immediate children only (avoid contents of packages)
my $kids = children_of($dir);
return unless $kids->size; # empty dir
# Consider “candidate” names: either all, or only the ones that look software-ish
my $consider = $kids->grep(sub ($f) {
my $n = $f->basename;
return looks_softwareish($n) || !$only_archy;
});
return unless $consider->size;
# Build canonical groups
my %groups;
my %raw_lists;
for my $f ($consider->each) {
my $label = canonical_label($f->basename);
push @{$groups{$label}}, $f;
}
# choose the largest group
my ($best_label, $best_group) = ('', []);
while (my ($label, $files) = each %groups) {
if (@$files > @$best_group) {
($best_label, $best_group) = ($label, $files);
}
}
my $total = $kids->size; # denominator is whole folder inventory
my $similar = scalar(@$best_group);
my $ratio = $total ? ($similar / $total) : 0;
# Only report “coherent” directories (≥ threshold)
return if $ratio < $threshold;
say_header($dir, $best_label, $ratio, $similar, $total) unless $quiet;
# Print similar ones
for my $f (sort { $a->basename cmp $b->basename } @$best_group) {
say_item($f, '*');
}
if ($show_rest) {
# show the remaining siblings in this folder (non-matching)
my %in_best = map { $_->to_abs->to_string => 1 } @$best_group;
my @rest = grep { !$in_best{ $_->to_abs->to_string } } $kids->each;
if (@rest) {
say " -- other items --" unless $quiet;
for my $f (sort { $a->basename cmp $b->basename } @rest) {
say_item($f, '-');
}
}
}
say '' unless $quiet;
});
__END__
=pod
=head1 NAME
repo_similar.pl - Find directories whose contents are ≥ N% similarly-named siblings (software archives & extracted packages)
=head1 SYNOPSIS
perl repo_similar.pl --root /repo \
--min-depth 1 --max-depth 4 \
--threshold 0.60 \
--show-rest \
--only-archives
=head1 OPTIONS
--root|-r PATH Root directory to scan (required)
--min-depth|-m N Minimum depth relative to root (default 0)
--max-depth|-M N Maximum depth relative to root (default 99)
--threshold|-t F Similarity threshold (0..1, default 0.60)
--show-rest Also list non-matching siblings in matched dirs
--only-archives Only consider entries that look like archives/extracted
--quiet|-q Suppress headers for scripting
=head1 NOTES
Similarity is based on a normalized label that:
• lowercases
• strips multi-extensions (.tar.gz, .zip, .whl, .jar, .msi, etc.)
• removes arch/platform tokens (x86_64, amd64, arm64, win64, linux, macos…)
• removes version-ish suffixes (v1.2.3, -rc1, 20241010, +build.45)
• collapses separators and trims
The directory is reported when the largest canonical label group,
measured against ALL items in that directory, meets the threshold.
This keeps the report small and avoids digging inside package contents.
=cut
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment