Last active
June 6, 2026 02:16
-
-
Save ernstki/e8ff3e4dd4b205a9c2ee4c48703ea3b6 to your computer and use it in GitHub Desktop.
GNU Awk script to report large directories, N levels deep in a hierarchy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/gawk -E | |
| ## | |
| ## List the top N directories (N levels deep) by cumulative file size | |
| ## | |
| ## Usage: | |
| ## | |
| ## find . -type f -printf "%p\t%s\n" | gawk -E bigdirs [options] | |
| ## | |
| ## # or, if `bigdirs` is in your search path | |
| ## find some/path -type f -printf "%p\t%s\n" | bigdirs [options] | |
| ## | |
| ## # or, if you don't have GNU AWK | |
| ## find . -type f -printf "%p\t%s\n" | awk -f bigdirs -- [options] | |
| ## | |
| ## Author: Kevin Ernst (ernstki -at- mail.uc.edu) | |
| ## Date: 5 June 2026 | |
| ## Assisted by: Kagi Quick (06-2026); gemini-3.1-pro-preview | |
| ## License: Zero-Clause BSD | |
| ## Homepage: https://gist.github.com/ernstki/e8ff3e4dd4b205a9c2ee4c48703ea3b6 | |
| ## | |
##
##  Option parsing. Runs before any input is read.
##
##  Recognized options are removed from ARGV so that awk does not try to open
##  them as input files; whatever is left over is treated as input by awk.
##
##  Sets the globals used by the main and END rules:
##    depth     how many directory levels below the common base to report
##    top_n     how many directories to print
##    skip      number of leading input records to ignore
##    humanize  1 to print sizes with K/M/G/T suffixes
##    verbose   1 to print extra diagnostics on stderr
##
##  Malformed options (trailing garbage such as '-d3x', or a missing or
##  non-numeric value after '-d', '-n' or '--skip') are rejected with an error
##  instead of being silently accepted or replaced by a default.
##
BEGIN {
    FS = "\t"

    DEFAULT_DEPTH    = 3
    DEFAULT_TOP_N    = 10
    DEFAULT_HUMANIZE = 0
    HOMEPAGE = "https://gist.github.com/ernstki/e8ff3e4dd4b205a9c2ee4c48703ea3b6"

    for (i = 1; i < ARGC; i++) {
        arg = ARGV[i]

        if (arg == "-d" || arg == "--depth") {
            val = ARGV[i+1]
            if (val !~ /^[1-9][0-9]*$/) {
                print "Option " arg " requires a positive integer argument" \
                    > "/dev/stderr"
                exit 1
            }
            depth = val + 0
            delete ARGV[i]
            delete ARGV[i+1]
            i++                         # the value has been consumed too
        } else if (arg ~ /^-d[1-9][0-9]*$/) {
            depth = substr(arg, 3) + 0  # digits follow the 2-char "-d"
            delete ARGV[i]
        } else if (arg ~ /^--depth=[1-9][0-9]*$/) {
            depth = substr(arg, 9) + 0  # digits follow the 8-char "--depth="
            delete ARGV[i]
        } else if (arg == "-n" || arg == "--top-n") {
            val = ARGV[i+1]
            if (val !~ /^[1-9][0-9]*$/) {
                print "Option " arg " requires a positive integer argument" \
                    > "/dev/stderr"
                exit 1
            }
            top_n = val + 0
            delete ARGV[i]
            delete ARGV[i+1]
            i++
        } else if (arg ~ /^-n[1-9][0-9]*$/) {
            top_n = substr(arg, 3) + 0
            delete ARGV[i]
        } else if (arg ~ /^--top-n=[1-9][0-9]*$/) {
            top_n = substr(arg, 9) + 0  # digits follow the 8-char "--top-n="
            delete ARGV[i]
        } else if (arg == "--skip") {
            val = ARGV[i+1]
            if (val !~ /^[0-9]+$/) {    # zero is a valid "skip nothing"
                print "Option " arg " requires a non-negative integer argument" \
                    > "/dev/stderr"
                exit 1
            }
            skip = val + 0
            delete ARGV[i]
            delete ARGV[i+1]
            i++
        } else if (arg ~ /^--skip=[0-9]+$/) {
            skip = substr(arg, 8) + 0   # digits follow the 7-char "--skip="
            delete ARGV[i]
        } else if (arg == "-h" || arg ~ /^--human/) {
            # any spelling starting with --human (--humanize,
            # --human-readable, ...) is accepted, as before
            humanize = 1
            delete ARGV[i]
        } else if (arg == "-v" || arg == "--verbose") {
            verbose = 1
            delete ARGV[i]
        } else if (arg == "--help") {
            print " usage:"
            print " awk -f bigdirs -- --help"
            print " find . -type f -printf \"%p\\t%s\\n\" | awk -f bigdirs -- [OPTIONS]"
            print " # or, if you have GNU Awk"
            print " gawk -E bigdirs --help"
            print " find . -type f -printf \"%p\\t%s\\n\" | gawk -E bigdirs [OPTIONS]"
            print " options:"
            print " --help you're looking at it!"
            print " --skip N skip this many (header) rows when reading input"
            print " -d, --depth N descend up to N subdirs (default: "DEFAULT_DEPTH")"
            print " -n, --top-n N show top N dirs by total disk usage (default: "DEFAULT_TOP_N")"
            print " -h, --humanize show human-readable sizes? (default: no)"
            print " -v, --verbose show more detail while processing (default: no)"
            print " ¿hay problemas? report them at:"
            print " " HOMEPAGE
            exit 0
        } else if (arg ~ /^-/) {
            print "Unknown option: " arg > "/dev/stderr"
            exit 1
        }
    }

    # Fall back to defaults for anything not given on the command line.
    if (!depth) {
        depth = DEFAULT_DEPTH
        if (verbose)
            print "Using default depth of " depth > "/dev/stderr"
    }
    if (!top_n) {
        top_n = DEFAULT_TOP_N
        if (verbose)
            print "Showing default "top_n" directories by usage" > "/dev/stderr"
    }
    if (!humanize) {
        humanize = DEFAULT_HUMANIZE
        if (verbose)
            print "Not using human-readable figures; try '--human-readable'" \
                > "/dev/stderr"
    }
}
##
##  Per-record rule: remember each path and size, and maintain `base`, the
##  longest directory prefix common to every path seen so far.
##
##  Input records are "<path>\t<size>". Records are stored under their NR so
##  that paths[] and sizes[] stay aligned; skipped records leave gaps.
##
{
    if (NR <= skip) next        # skip header rows, if --skip given
    if ($1 == "") next          # a blank path carries no information

    # `have_base` (not the value of `base`) records whether the base has been
    # initialized: an empty common prefix is a legitimate result (e.g. inputs
    # "a/x" and "b/y") and must not cause the base to be re-seeded.
    if (!have_base) {
        if (!warned) {
            # The path comes from untrusted input and is handed to a shell:
            # close the quote, emit an escaped quote, and reopen, so that an
            # embedded single quote cannot break out of the quoting.
            quoted = $1
            gsub(/'/, "'\\''", quoted)
            if (system("test -d '" quoted "'") == 0) {
                print "WARNING: Expected input from 'find -type f' (i.e., no " \
                    "directories)." > "/dev/stderr"
                print "Continuing anyway, but counts will be inaccurate…" \
                    > "/dev/stderr"
                warned = 1
                next
            }
        }
        base = $1                   # use as base for future L.C.P. comparisons
        sub(/[^/]*$/, "", base)     # remove the filename part
        have_base = 1
    } else if (base != "") {
        # once the common prefix is empty it can never grow again
        base = longestcommonprefix(base, $1)
    }

    paths[NR] = $1
    sizes[NR] = $2 + 0
}
##
##  Final rule: roll every file's size up to its ancestor directory `depth`
##  levels below the common base, then print the `top_n` largest directories
##  as a tab-separated table (directory, byte_size, num_files).
##
END {
    if (NR == 0) exit

    # Directory names are rebuilt on top of the base path without its trailing
    # slash. A base of exactly "/" is kept as is so that absolute paths stay
    # absolute in the report.
    root = base
    if (root != "/") sub(/\/$/, "", root)

    for (k = 1; k <= NR; k++) {
        # Records dropped by the main rule (header rows, the directory
        # warning) have no entry. Passing their empty path on would make
        # trimprefix() fall back to $0 and count the last record again.
        if (!(k in paths) || paths[k] == "") continue

        rel = trimprefix(base, paths[k])

        # split path into components; remember the last part is the filename
        n = split(rel, parts, "/") - 1
        limit = n < depth ? n : depth

        dir = root
        for (i = 1; i <= limit; i++) {
            if (dir == "" || dir == "/") dir = dir parts[i]
            else dir = dir "/" parts[i]
        }

        totals[dir] += sizes[k]
        counts[dir]++
    }

    # collect directory names into an array for sorting
    n = 0
    for (d in totals) {
        n++
        dirs[n] = d
    }

    # Partial selection sort: only the first top_n slots need to be in order
    # (descending by total size). Kept hand-rolled so that non-GNU awks work.
    for (i = 1; i <= n && i <= top_n; i++) {
        best = i
        for (j = i + 1; j <= n; j++) {
            if (totals[dirs[j]] > totals[dirs[best]]) {
                best = j
            }
        }
        tmp = dirs[i]; dirs[i] = dirs[best]; dirs[best] = tmp
    }

    printf "%s\t%s\t%s\n", "directory", "byte_size", "num_files"
    for (i = 1; i <= n && i <= top_n; i++) {
        d = dirs[i]
        printf "%s\t%s\t%d\n", d, \
            humanize ? humansize(totals[d]) : totals[d], counts[d]
    }
}
##
##  Format a byte count with a binary-unit suffix.
##
##  b: size in bytes
##
##  Returns b unchanged when its magnitude is below 1024; otherwise a string
##  with one decimal place and a K, M, G or T suffix (powers of 1024).
##  Anything of 1024^4 or more is reported in T.
##
function humansize(b,    mag) {
    mag = (b < 0) ? -b : b      # thresholds apply to the magnitude

    if (mag < 1024)   return b
    if (mag < 1024^2) return sprintf("%0.1fK", b / 1024)
    if (mag < 1024^3) return sprintf("%0.1fM", b / 1024^2)
    if (mag < 1024^4) return sprintf("%0.1fG", b / 1024^3)
    return sprintf("%0.1fT", b / 1024^4)
}
##
##  Longest common *directory* prefix of two paths.
##
##  p1, p2: the paths to compare
##  (pos, shorter, common are locals)
##
##  If either path is empty the other one is returned as is. Otherwise the
##  shared leading characters are found and then cut back to just after the
##  last "/", so the result is either empty or ends with a slash.
##
function longestcommonprefix(p1, p2,    pos, shorter, common) {
    if (p1 == "") return p2
    if (p2 == "") return p1

    # no point comparing past the end of the shorter string
    shorter = length(p2)
    if (length(p1) < shorter) shorter = length(p1)

    # advance while the characters agree
    pos = 1
    while (pos <= shorter && substr(p1, pos, 1) == substr(p2, pos, 1))
        pos++

    common = substr(p1, 1, pos - 1)

    # drop a partially matched final component ("a/bc" vs "a/bd" -> "a/")
    sub(/[^/]*$/, "", common)
    return common
}
##
##  Remove a leading prefix from a string.
##
##  pref: the prefix to strip
##  s:    the string to strip it from; when false/empty, the current
##        record ($0) is used instead
##
##  Returns s without the prefix. If s does not start with pref, a warning
##  is written to stderr and s is returned unchanged.
##
function trimprefix(pref, s) {
    if (!s) s = $0

    # unhappy path first: complain and hand the input back untouched
    if (!startswith(pref, s)) {
        print "WARNING: input '"s"' didn't match prefix '"pref"'" \
            > "/dev/stderr"
        print "Maybe you should skip the header with '--skip=1'?" \
            > "/dev/stderr"
        return s
    }

    return substr(s, length(pref) + 1)
}
##
##  Does string s begin with pref?
##
##  pref: the prefix to look for
##  s:    the string to test; when false/empty, the current record ($0)
##        is tested instead
##
##  Returns 1 if the first length(pref) characters of s equal pref, else 0.
##
function startswith(pref, s) {
    s = s ? s : $0
    return pref == substr(s, 1, length(pref))
}
##
##  Does string s end with suff?
##
##  suff: the suffix to look for
##  s:    the string to test; when false/empty, the current record ($0)
##        is tested instead
##  (slen, tlen are locals)
##
##  Returns 1 if the last length(suff) characters of s equal suff, else 0.
##  An empty suffix always matches.
##
function endswith(suff, s,    slen, tlen) {
    if (!s) s = $0

    slen = length(s)
    tlen = length(suff)

    # a suffix longer than the string can never match
    if (tlen > slen) return 0

    # substr() is 1-based: the final tlen characters start at slen - tlen + 1
    return substr(s, slen - tlen + 1) == suff
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment