Last active
November 12, 2025 10:46
-
-
Save s1037989/de71da25eed16f133ca6ec3fbd15b37c to your computer and use it in GitHub Desktop.
repository software inventory generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env perl | |
| use Mojo::Base -strict, -signatures; | |
| use Mojo::File qw(path); | |
| use Mojo::Collection qw(c); | |
| use Mojo::ByteStream qw(b); | |
| use Mojo::Util qw(getopt trim decamelize); | |
| use constant DEFAULT_THRESHOLD => 0.60; | |
| # ---- CLI ------------------------------------------------------------- | |
| my ($root, $min_depth, $max_depth, $threshold, $show_rest, $only_archy, $quiet); | |
| $threshold = DEFAULT_THRESHOLD; | |
| $min_depth = 0; | |
| $max_depth = 99; | |
| getopt \@ARGV, | |
| 'root|r=s' => \$root, | |
| 'min-depth|m=i' => \$min_depth, | |
| 'max-depth|M=i' => \$max_depth, | |
| 'threshold|t=f' => \$threshold, | |
| 'show-rest!' => \$show_rest, # --show-rest to also print the non-matching siblings | |
| 'only-archives!' => \$only_archy, # --only-archives to limit candidates to (likely) software-ish names | |
| 'quiet|q!' => \$quiet; | |
| die "Usage: $0 --root /path/to/repo [--min-depth N] [--max-depth N] [--threshold 0.60] [--show-rest] [--only-archives]\n" | |
| unless $root && -d $root; | |
| # ---- Helpers --------------------------------------------------------- | |
| # Decide if a filename/dir looks like an archive or extracted archive label | |
| sub looks_softwareish ($name) { | |
| # common archive extensions or extracted dir traits | |
| return 1 if $name =~ /\.(?:tar(?:\.(?:gz|bz2|xz|zst))?|t[gbx]z|zip|7z|rar|deb|rpm|whl|jar|war|msi|exe)$/i; | |
| return 1 if $name =~ /\b(?:dist|release|bin|lib|include|src|setup|pkg|install|app|plugin|module)s?\b/i; | |
| # name-version like patterns (foo-1.2.3, Foo_2024.10, foo-v1, foo-1.2.3-rc1) | |
| return 1 if $name =~ /[-_\.]v?\d+(?:\.\d+){0,3}(?:[-_\.]?(?:rc|beta|alpha|pre|b|a)\d*)?\b/i; | |
| return 0; | |
| } | |
| # Normalize a sibling name for “similarity” grouping: | |
| # - lowercase | |
| # - strip multi-extensions (.tar.gz, .tar.bz2, .zip, etc.) | |
| # - drop typical arch tags (x86_64, amd64, arm64, win64, linux, macos) | |
| # - trim probable version suffixes and build metadata | |
| # - collapse separators/delim noise | |
| sub canonical_label ($name) { | |
| my $base = $name; | |
| # remove container/container+comp extensions | |
| $base =~ s/\.(?:tar\.(?:gz|bz2|xz|zst)|t(?:gz|bz|xz)|zip|7z|rar|deb|rpm|whl|jar|war|msi|exe)\z//i; | |
| # single ext (fallback) | |
| $base =~ s/\.(?:gz|bz2|xz|zst)\z//i; | |
| my $lc = lc $base; | |
| # arch/platform tokens | |
| $lc =~ s/(?:[-_.])?(?:x86_64|amd64|arm64|aarch64|x64|x86|i386|i686|ppc64le|s390x|win(?:32|64)?|linux|macos|darwin|osx|ubuntu|rhel|centos|alpine|debian|fedora|el\d+)\b//g; | |
| # common distro/pkg noise | |
| $lc =~ s/\b(?:release|bin|lib|src|static|shared|debug|symbols|portable|standalone|minimal|full|enterprise|community)\b//g; | |
| # version-ish tails (keep core name): -1.2.3, _v2, .20241010, -rc1, -beta2, +build.45 | |
| $lc =~ s/(?:[-_.])?(?:v?\d+(?:\.\d+){0,4})(?:[-_.]?(?:rc|beta|alpha|pre|b|a)\d*)?(?:\+build[-_.]?\w+)?\b//g; | |
| # dates like 20241010 or 2024-10-10 | |
| $lc =~ s/(?:[-_.])?(?:20\d{2}[-_.]?\d{2}[-_.]?\d{2})\b//g; | |
| # collapse separators and whitespace | |
| $lc =~ s/[^a-z0-9]+/ /g; | |
| $lc = trim $lc; | |
| # if it collapses to nothing, fall back to decamelized original base name | |
| $lc = trim decamelize($base) || $lc; | |
| $lc =~ s/[^a-z0-9]+/ /g; | |
| $lc = trim $lc; | |
| # final guard | |
| return $lc || '∅'; | |
| } | |
| # Return immediate children entries of a directory as Mojo::Collection of Mojo::File | |
| sub children_of ($dir) { | |
| return c(sort { $a->to_string cmp $b->to_string } grep { $_->basename ne '.' && $_->basename ne '..' } $dir->list->each); | |
| } | |
| # Derive a shallow "depth" relative to root | |
| sub depth_of ($root_path, $abs_dir) { | |
| my $rel = path($abs_dir)->to_abs->to_string; | |
| my $base = path($root_path)->to_abs->to_string; | |
| $rel =~ s/^\Q$base\E\/?//; | |
| return 0 if $rel eq '' || $rel eq '.'; | |
| return scalar grep { length } split m{/+}, $rel; | |
| } | |
| # Nice printing helpers | |
| sub say_header ($dir, $label, $ratio, $count, $total) { | |
| my $pct = sprintf '%.0f%%', $ratio * 100; | |
| say b("==> $dir")->cyan->bold, " [$pct similar: '$label' $count/$total]"; | |
| } | |
| sub say_item ($entry, $mark='*') { | |
| my $t = $entry->basename; | |
| say " $mark $t"; | |
| } | |
| # ---- Core scan ------------------------------------------------------- | |
| my $ROOT = path($root)->to_abs; | |
| # Traverse directories up to max_depth; Mojo::File::list_tree supports max_depth. | |
| $ROOT->list_tree({max_depth => $max_depth}) | |
| ->grep(sub ($p) { -d $p }) # only directories | |
| ->each(sub ($dir) { | |
| my $d = depth_of($ROOT, $dir); | |
| return if $d < $min_depth; # not deep enough yet | |
| # inspect immediate children only (avoid contents of packages) | |
| my $kids = children_of($dir); | |
| return unless $kids->size; # empty dir | |
| # Consider “candidate” names: either all, or only the ones that look software-ish | |
| my $consider = $kids->grep(sub ($f) { | |
| my $n = $f->basename; | |
| return looks_softwareish($n) || !$only_archy; | |
| }); | |
| return unless $consider->size; | |
| # Build canonical groups | |
| my %groups; | |
| my %raw_lists; | |
| for my $f ($consider->each) { | |
| my $label = canonical_label($f->basename); | |
| push @{$groups{$label}}, $f; | |
| } | |
| # choose the largest group | |
| my ($best_label, $best_group) = ('', []); | |
| while (my ($label, $files) = each %groups) { | |
| if (@$files > @$best_group) { | |
| ($best_label, $best_group) = ($label, $files); | |
| } | |
| } | |
| my $total = $kids->size; # denominator is whole folder inventory | |
| my $similar = scalar(@$best_group); | |
| my $ratio = $total ? ($similar / $total) : 0; | |
| # Only report “coherent” directories (≥ threshold) | |
| return if $ratio < $threshold; | |
| say_header($dir, $best_label, $ratio, $similar, $total) unless $quiet; | |
| # Print similar ones | |
| for my $f (sort { $a->basename cmp $b->basename } @$best_group) { | |
| say_item($f, '*'); | |
| } | |
| if ($show_rest) { | |
| # show the remaining siblings in this folder (non-matching) | |
| my %in_best = map { $_->to_abs->to_string => 1 } @$best_group; | |
| my @rest = grep { !$in_best{ $_->to_abs->to_string } } $kids->each; | |
| if (@rest) { | |
| say " -- other items --" unless $quiet; | |
| for my $f (sort { $a->basename cmp $b->basename } @rest) { | |
| say_item($f, '-'); | |
| } | |
| } | |
| } | |
| say '' unless $quiet; | |
| }); | |
| __END__ | |
| =pod | |
| =head1 NAME | |
| repo_similar.pl - Find directories whose contents are ≥ N% similarly-named siblings (software archives & extracted packages) | |
| =head1 SYNOPSIS | |
| perl repo_similar.pl --root /repo \ | |
| --min-depth 1 --max-depth 4 \ | |
| --threshold 0.60 \ | |
| --show-rest \ | |
| --only-archives | |
| =head1 OPTIONS | |
| --root|-r PATH Root directory to scan (required) | |
| --min-depth|-m N Minimum depth relative to root (default 0) | |
| --max-depth|-M N Maximum depth relative to root (default 99) | |
| --threshold|-t F Similarity threshold (0..1, default 0.60) | |
| --show-rest Also list non-matching siblings in matched dirs | |
| --only-archives Only consider entries that look like archives/extracted | |
| --quiet|-q Suppress headers for scripting | |
| =head1 NOTES | |
| Similarity is based on a normalized label that: | |
| • lowercases | |
| • strips multi-extensions (.tar.gz, .zip, .whl, .jar, .msi, etc.) | |
| • removes arch/platform tokens (x86_64, amd64, arm64, win64, linux, macos…) | |
| • removes version-ish suffixes (v1.2.3, -rc1, 20241010, +build.45) | |
| • collapses separators and trims | |
| The directory is reported when the largest canonical label group, | |
| measured against ALL items in that directory, meets the threshold. | |
| This keeps the report small and avoids digging inside package contents. | |
| =cut |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment