Skip to content

Instantly share code, notes, and snippets.

@ernstki
Last active June 6, 2026 02:16
Show Gist options
  • Select an option

  • Save ernstki/e8ff3e4dd4b205a9c2ee4c48703ea3b6 to your computer and use it in GitHub Desktop.

Select an option

Save ernstki/e8ff3e4dd4b205a9c2ee4c48703ea3b6 to your computer and use it in GitHub Desktop.
GNU Awk script to report large directories, N levels deep in a hierarchy
#!/usr/bin/gawk -E
##
## List the top N directories (grouped up to D levels deep) by cumulative file size
##
## Usage:
##
## find . -type f -printf "%p\t%s\n" | gawk -E bigdirs [options]
##
## # or, if `bigdirs` is in your search path
## find some/path -type f -printf "%p\t%s\n" | bigdirs [options]
##
## # or, if you don't have GNU AWK
## find . -type f -printf "%p\t%s\n" | awk -f bigdirs -- [options]
##
## Author: Kevin Ernst (ernstki -at- mail.uc.edu)
## Date: 5 June 2026
## Assisted by: Kagi Quick (06-2026); gemini-3.1-pro-preview
## License: Zero-Clause BSD
## Homepage: https://gist.github.com/ernstki/e8ff3e4dd4b205a9c2ee4c48703ea3b6
##
##
## BEGIN: set defaults and parse command-line options out of ARGV.
##
## Every recognized option is deleted from ARGV so that awk does not try to
## open it as an input file; anything left over is treated as an input file
## by awk itself. A lone "-" is left alone (it conventionally means stdin).
##
## Globals set here: FS, depth, top_n, skip, humanize, verbose, plus the
## DEFAULT_* and HOMEPAGE constants.
##
BEGIN {
    FS = "\t"
    DEFAULT_DEPTH = 3
    DEFAULT_TOP_N = 10
    DEFAULT_HUMANIZE = 0
    HOMEPAGE = "https://gist.github.com/ernstki/e8ff3e4dd4b205a9c2ee4c48703ea3b6"

    for (i = 1; i < ARGC; i++) {
        arg = ARGV[i]
        if (arg == "-d" || arg == "--depth" ||
            arg == "-n" || arg == "--top-n" || arg == "--skip") {
            # "option VALUE" form: the value must exist and be numeric,
            # otherwise we would silently swallow the next argument (which
            # could be an input file) and fall back to the default
            if (i + 1 >= ARGC || ARGV[i+1] !~ /^[0-9]+$/) {
                print "Option " arg " requires a numeric argument" > "/dev/stderr"
                exit 1
            }
            val = ARGV[i+1] + 0
            if (arg == "--skip")
                skip = val
            else if (arg == "-d" || arg == "--depth")
                depth = val
            else
                top_n = val
            delete ARGV[i]
            delete ARGV[i+1]
            i++
        # the patterns below are anchored with "$" so that trailing garbage
        # (e.g., "-d3x") is reported instead of being silently truncated
        } else if (arg ~ /^-d[1-9][0-9]*$/) {
            depth = substr(arg, 3) + 0
            delete ARGV[i]
        } else if (arg ~ /^--depth=[1-9][0-9]*$/) {
            depth = substr(arg, 9) + 0
            delete ARGV[i]
        } else if (arg ~ /^-n[1-9][0-9]*$/) {
            top_n = substr(arg, 3) + 0
            delete ARGV[i]
        } else if (arg ~ /^--top-n=[1-9][0-9]*$/) {
            top_n = substr(arg, 9) + 0
            delete ARGV[i]
        } else if (arg ~ /^--skip=[1-9][0-9]*$/) {
            skip = substr(arg, 8) + 0
            delete ARGV[i]
        } else if (arg ~ /^(-h|--human.*)$/) {
            # accepts -h, --humanize, --human-readable, ...
            humanize = 1
            delete ARGV[i]
        } else if (arg ~ /^(-v|--verbose)$/) {
            verbose = 1
            delete ARGV[i]
        } else if (arg == "--help") {
            print
            print " usage:"
            print " awk -f bigdirs -- --help"
            print " find . -type f -printf \"%p\\t%s\\n\" | awk -f bigdirs -- [OPTIONS]"
            print
            print " # or, if you have GNU Awk"
            print " gawk -E bigdirs --help"
            print " find . -type f -printf \"%p\\t%s\\n\" | gawk -E bigdirs [OPTIONS]"
            print
            print " options:"
            print " --help you're looking at it!"
            print " --skip N skip this many (header) rows when reading input"
            print " -d, --depth N descend up to N subdirs (default: "DEFAULT_DEPTH")"
            print " -n, --top-n N show top N dirs by total disk usage (default: "DEFAULT_TOP_N")"
            print " -h, --humanize show human-readable sizes? (default: no)"
            print " -v, --verbose show more detail while processing (default: no)"
            print
            print " ¿hay problemas? report them at:"
            print " " HOMEPAGE
            print
            exit 0
        } else if (arg ~ /^-./) {
            # a bare "-" is not an option; leave it in ARGV as the stdin operand
            print "Unknown option: " arg > "/dev/stderr"
            exit 1
        }
    }

    # a value of 0 (or no value at all) selects the default
    if (!depth) {
        depth = DEFAULT_DEPTH
        if (verbose)
            print "Using default depth of " depth > "/dev/stderr"
    }
    if (!top_n) {
        top_n = DEFAULT_TOP_N
        if (verbose)
            print "Showing default "top_n" directories by usage" > "/dev/stderr"
    }
    if (!humanize) {
        humanize = DEFAULT_HUMANIZE
        if (verbose)
            print "Not using human-readable figures; try '--human-readable'" \
                > "/dev/stderr"
    }
}
##
## Main rule: for every input record "<path>\t<size>", remember the path and
## size, and keep `base` equal to the longest directory prefix shared by all
## paths seen so far.
##
## Globals written: base, have_base, warned, paths[NR], sizes[NR].
##
{
    if (NR <= skip) next        # skip header rows, if --skip given
    if ($1 == "") next          # ignore blank lines / records with no path

    if (!have_base) {
        # Sanity-check (once) that the input is a list of files, not
        # directories. The path comes from untrusted input, so embedded
        # single quotes are escaped before it is handed to the shell.
        if (!warned) {
            quoted = $1
            gsub(/'/, "'\"'\"'", quoted)    # ' -> '"'"'
            if (system("test -d '" quoted "'") == 0) {
                print "WARNING: Expected input from 'find -type f' (i.e., no " \
                    "directories)." > "/dev/stderr"
                print "Continuing anyway, but counts will be inaccurate…" \
                    > "/dev/stderr"
                warned = 1
                next
            }
        }
        base = $1                   # use as base for future L.C.P. comparisons
        sub(/[^/]*$/, "", base)     # remove the filename part
        # Track initialization separately: an empty `base` is a legitimate
        # result (no common directory) and must not trigger re-seeding.
        have_base = 1
    } else if (base != "") {
        # once the common prefix is empty it can never grow again
        base = longestcommonprefix(base, $1)
    }

    paths[NR] = $1
    sizes[NR] = $2 + 0
}
##
## END: roll every recorded file up into its ancestor directory (at most
## `depth` levels below the common `base`), then print the `top_n` directories
## by cumulative size as a tab-separated table with a header row.
##
END {
    if (NR == 0) exit

    for (k = 1; k <= NR; k++) {
        # Records that were skipped while reading (header rows, blank lines,
        # the directory that triggered the warning) have no usable entry in
        # paths[]; they must not be counted as files.
        if (!(k in paths) || paths[k] == "") continue

        path = trimprefix(base, paths[k])
        size = sizes[k]

        # split path into components; remember the last part is the filename
        n = split(path, parts, "/") - 1
        limit = n < depth ? n : depth

        # `base` is either empty or ends with a slash, so directory names
        # are built with a trailing slash that is trimmed afterwards
        dir = base
        for (i = 1; i <= limit; i++)
            dir = dir parts[i] "/"
        if (dir != "/") sub(/\/$/, "", dir)     # keep the root dir as "/"
        if (dir == "") dir = "."                # files directly in the cwd

        totals[dir] += size
        counts[dir]++
    }

    # collect directory names into an array for sorting
    n = 0
    for (d in totals) {
        n++
        dirs[n] = d
    }

    # partial selection sort: only the first top_n slots need to be in order
    # (descending by size; ties broken by name so output is deterministic)
    for (i = 1; i <= n && i <= top_n; i++) {
        best = i
        for (j = i + 1; j <= n; j++) {
            if (totals[dirs[j]] > totals[dirs[best]] ||
                (totals[dirs[j]] == totals[dirs[best]] && dirs[j] < dirs[best])) {
                best = j
            }
        }
        tmp = dirs[i]; dirs[i] = dirs[best]; dirs[best] = tmp
    }

    printf "%s\t%s\t%s\n", "directory", "byte_size", "num_files"
    for (i = 1; i <= n && i <= top_n; i++) {
        d = dirs[i]
        printf "%s\t%s\t%d\n", d, \
            humanize ? humansize(totals[d]) : totals[d], counts[d]
    }
}
##
## humansize(b) - format a byte count using binary (1024-based) units
##
##   b        number of bytes
##   returns  `b` unchanged when its magnitude is below 1024, otherwise a
##            string with one decimal place and a K/M/G/T suffix
##
## The unit is chosen on the *rounded* figure, so a value just below a unit
## boundary is promoted (1048575 -> "1.0M", not "1024.0K"). Values beyond the
## largest unit stay in T.
##
function humansize(b,    units, nunits, u, scaled, text, mag) {
    if (b % 1024 == b) return b     # |b| < 1024: plain byte count

    nunits = split("K M G T", units, " ")
    scaled = b
    for (u = 1; u <= nunits; u++) {
        scaled /= 1024
        text = sprintf("%0.1f", scaled)
        mag = text + 0              # force numeric; `text` is a string
        if (mag < 0) mag = -mag
        if (u == nunits || mag < 1024) break
    }
    return text units[u]
}
##
## longestcommonprefix(p1, p2) - longest shared leading *directory* of two paths
##
## The character-wise common prefix is cut back to the last "/" so the result
## is either empty or ends with a slash. If either argument is empty, the
## other one is returned as-is.
##
function longestcommonprefix(p1, p2,    pos, shortest, common) {
    if (p1 == "") return p2
    if (p2 == "") return p1

    # only the overlap of the two strings can possibly match
    shortest = length(p2)
    if (length(p1) < shortest)
        shortest = length(p1)

    # advance past every position where both strings agree
    pos = 1
    while (pos <= shortest && substr(p1, pos, 1) == substr(p2, pos, 1))
        pos++

    common = substr(p1, 1, pos - 1)
    # drop a partially matched final component, keeping the trailing slash
    sub(/[^/]*$/, "", common)
    return common
}
##
## trimprefix(pref, s) - remove leading `pref` from `s`
##
##   pref     prefix to strip
##   s        string to strip it from; defaults to $0 when omitted or empty
##   returns  `s` without the prefix, or `s` unchanged (after a warning on
##            stderr) if it does not start with `pref`
##
function trimprefix(pref, s) {
    # Compare against "" rather than testing truthiness: a path read from
    # input that looks like the number zero (e.g., "0") is falsy in awk and
    # would otherwise be mistaken for a missing argument.
    if (s == "") s = $0

    if (substr(s, 1, length(pref)) == pref) {
        return substr(s, length(pref) + 1)
    } else {
        print "WARNING: input '"s"' didn't match prefix '"pref"'" \
            > "/dev/stderr"
        print "Maybe you should skip the header with '--skip=1'?" \
            > "/dev/stderr"
        return s
    }
}
##
## startswith(pref, s) - true (1) if `s` begins with `pref`, else false (0)
##
##   s defaults to $0 when omitted or empty
##
function startswith(pref, s) {
    # `s == ""` instead of `!s`: a numeric-looking input value such as "0"
    # is falsy in awk but is still a real argument
    if (s == "") s = $0
    return substr(s, 1, length(pref)) == pref
}
##
## endswith(suff, s) - true (1) if `s` ends with `suff`, else false (0)
##
##   s defaults to $0 when omitted or empty
##
function endswith(suff, s) {
    # `s == ""` instead of `!s`: a numeric-looking input value such as "0"
    # is falsy in awk but is still a real argument
    if (s == "") s = $0
    if (length(suff) > length(s)) return 0
    # substr() is 1-indexed: the last length(suff) characters start at
    # length(s) - length(suff) + 1
    return substr(s, length(s) - length(suff) + 1) == suff
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment