Skip to content

Instantly share code, notes, and snippets.

@alexfazio
Last active October 1, 2025 16:26
Show Gist options
  • Select an option

  • Save alexfazio/274fb08ad6266eefe9d329df2be80425 to your computer and use it in GitHub Desktop.

Select an option

Save alexfazio/274fb08ad6266eefe9d329df2be80425 to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
#
# count_tokens.sh - Count tokens in developer files using Claude API
#
# Usage: ./count_tokens.sh [file1.py] [file2.js] [folder/] ...
#
# Arguments:
# - Individual files (any text-based developer file)
# - Directories (recursively finds code, config, markup, and docs)
# - Mix of files and directories
#
# Supported file types:
# Code: .py, .js, .ts, .go, .rs, .java, .c, .cpp, .sh, .rb, .php, and 40+ more
# Config: .yaml, .json, .toml, .ini, .env, Dockerfile, Makefile, etc.
# Markup: .md, .rst, .html, .xml, .txt
# Styles: .css, .scss, .sass, .less
#
# Environment Variables:
# ANTHROPIC_API_KEY - Required API key for authentication (loaded from .env)
# Strict mode: -e exit on unhandled error, -u error on unset variables,
# pipefail makes a pipeline fail if any stage fails.
set -euo pipefail
# Load environment variables from .env file
# (resolved relative to this script's location, not the caller's CWD)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ENV_FILE="${SCRIPT_DIR}/.env"
if [ -f "$ENV_FILE" ]; then
# set -a auto-exports every variable assigned while sourcing .env,
# so ANTHROPIC_API_KEY becomes visible to curl and child shells.
set -a
# shellcheck disable=SC1090
source "$ENV_FILE"
set +a
fi
# Constants
readonly API_URL="https://api.anthropic.com/v1/messages/count_tokens"
readonly API_VERSION="2023-06-01"
readonly MODEL="claude-sonnet-4-5-20250929"
# Context-window sizes used for the percentage/warning columns
readonly CONTEXT_200K=200000
readonly CONTEXT_1M=1000000
# Colors for output (ANSI escapes; printed with echo -e / printf)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly CYAN='\033[0;36m'
readonly BOLD='\033[1m'
readonly NC='\033[0m' # No Color
# Check for required dependencies
# Exits 1 with a message listing every missing tool; returns 0 otherwise.
check_dependencies() {
  local -a absent=()
  local tool
  # Probe each required tool; collect the ones that are not installed.
  for tool in curl jq; do
    command -v "$tool" &> /dev/null || absent+=("$tool")
  done
  if [ ${#absent[@]} -ne 0 ]; then
    echo -e "${RED}Error: Missing required dependencies: ${absent[*]}${NC}" >&2
    echo "Please install them and try again." >&2
    exit 1
  fi
}
# Check for API key
# Returns 0 when ANTHROPIC_API_KEY is set and non-empty; exits 1 otherwise.
check_api_key() {
  local key="${ANTHROPIC_API_KEY:-}"
  # Guard clause: nothing to do when the key is present.
  if [ -n "$key" ]; then
    return 0
  fi
  echo -e "${RED}Error: ANTHROPIC_API_KEY environment variable is not set${NC}" >&2
  echo "Please add your API key to ${ENV_FILE}" >&2
  exit 1
}
# Find all developer files in a directory recursively
# Includes: code, config, markup, and documentation files
# Args: directory path
# Returns: list of matching file paths, sorted, one per line
find_developer_files() {
  local dir="$1"
  if [ ! -d "$dir" ]; then
    echo -e "${RED}Error: Not a directory: $dir${NC}" >&2
    return 1
  fi
  # Filename patterns grouped by category; extend this list to add support.
  local -a patterns=(
    # Code
    '*.py' '*.pyw' '*.pyi'
    '*.js' '*.mjs' '*.cjs'
    '*.ts' '*.tsx' '*.jsx'
    '*.go' '*.rs' '*.java'
    '*.c' '*.cpp' '*.cc' '*.cxx'
    '*.h' '*.hpp' '*.hh' '*.hxx'
    '*.cs' '*.sh' '*.bash' '*.zsh'
    '*.rb' '*.php' '*.swift' '*.kt'
    '*.scala' '*.clj' '*.ex' '*.exs'
    '*.erl' '*.hrl' '*.hs' '*.elm'
    '*.ml' '*.fs' '*.r' '*.lua'
    '*.pl' '*.pm' '*.tcl' '*.vim'
    # Docs / markup
    '*.md' '*.markdown' '*.rst' '*.txt'
    # Config
    '*.yaml' '*.yml' '*.json' '*.jsonc'
    '*.xml' '*.toml' '*.ini' '*.conf'
    '*.cfg' '*.config' '*.properties'
    # Web / styles
    '*.html' '*.htm' '*.css' '*.scss'
    '*.sass' '*.less' '*.vue' '*.svelte'
    # Misc
    '*.sql' '*.dockerfile' 'Dockerfile*'
    'Makefile*' '*.mk' '.gitignore'
    '.dockerignore' '.editorconfig' '*.env*'
  )
  # Assemble the find predicate: -name p1 -o -name p2 -o ...
  local -a predicate=()
  local pat
  for pat in "${patterns[@]}"; do
    if [ ${#predicate[@]} -gt 0 ]; then
      predicate+=(-o)
    fi
    predicate+=(-name "$pat")
  done
  find "$dir" -type f \( "${predicate[@]}" \) | sort
}
# Count tokens for a single file
# Returns token count to stdout only
count_tokens() {
local file="$1"
# Check if file exists and is readable
if [ ! -f "$file" ]; then
echo -e "${RED}Error: File not found: $file${NC}" >&2
return 1
fi
if [ ! -r "$file" ]; then
echo -e "${RED}Error: File not readable: $file${NC}" >&2
return 1
fi
# Read file content and escape for JSON
local content
content=$(jq -Rs . < "$file")
# Build JSON request
local request
request=$(jq -n \
--arg model "$MODEL" \
--argjson content "$content" \
'{
model: $model,
messages: [
{
role: "user",
content: $content
}
]
}')
# Make API request
local response
response=$(curl -s -w "\n%{http_code}" \
-X POST "$API_URL" \
-H "x-api-key: $ANTHROPIC_API_KEY" \
-H "anthropic-version: $API_VERSION" \
-H "content-type: application/json" \
-d "$request")
# Extract HTTP status code and body
local http_code
http_code=$(echo "$response" | tail -n1)
local body
body=$(echo "$response" | sed '$d')
# Check for errors
if [ "$http_code" != "200" ]; then
echo -e "${RED}Error: API request failed with status $http_code${NC}" >&2
echo "$body" | jq -r '.error.message // .' >&2
return 1
fi
# Extract and return token count
local token_count
token_count=$(echo "$body" | jq -r '.input_tokens')
echo "$token_count"
}
# Print table separator
# Args: $1 - width (number of '─' characters to print)
print_separator() {
  local width="$1"
  local line
  # Build a run of $width spaces, then swap each for '─' with a bash
  # parameter expansion. The previous `tr ' ' '─'` was byte-oriented and
  # mapped the space to only the FIRST byte (0xE2) of the multibyte '─',
  # emitting mojibake instead of a line.
  printf -v line '%*s' "$width" ''
  printf '%s\n' "${line// /─}"
}
# Calculate percentages and warning for tokens
# Args: tokens
# Returns: "pct_200k pct_1m warning" (space-separated; warning may be empty)
# Reads globals: CONTEXT_200K, CONTEXT_1M
calculate_stats() {
  local tokens=$1
  local pct_200k pct_1m
  # Pass values via -v instead of interpolating them into the awk program
  # text: an empty or malformed $tokens previously produced an awk syntax
  # error (and interpolation is an injection hazard in general).
  pct_200k=$(awk -v t="$tokens" -v c="$CONTEXT_200K" 'BEGIN {printf "%.2f", (t / c) * 100}')
  pct_1m=$(awk -v t="$tokens" -v c="$CONTEXT_1M" 'BEGIN {printf "%.2f", (t / c) * 100}')
  local warning=""
  # Warn when usage exceeds 25% of either context window.
  if awk -v p="$pct_200k" 'BEGIN {exit !(p > 25)}'; then
    warning="! 200K"
  fi
  if awk -v p="$pct_1m" 'BEGIN {exit !(p > 25)}'; then
    if [ -n "$warning" ]; then
      warning="! Both"
    else
      warning="! 1M"
    fi
  fi
  echo "$pct_200k $pct_1m $warning"
}
# Calculate relative path from base to target
# Args: base_path target_path
# Returns: relative path (or the full target when it is not under base)
get_relative_path() {
  local base="$1"
  local target="$2"
  local normalized_base
  # Normalize paths (remove trailing slashes, resolve to absolute)
  normalized_base=$(cd "$base" 2>/dev/null && pwd) || normalized_base="$base"
  normalized_base="${normalized_base%/}"
  # Require a '/' boundary after the base. The old test
  # [[ "$target" == "$normalized_base"* ]] falsely matched SIBLING paths:
  # base /tmp/foo vs target /tmp/foobar stripped to "bar".
  if [[ "$target" == "$normalized_base"/* ]]; then
    echo "${target#"$normalized_base"/}"
  elif [[ "$target" == "$normalized_base" ]]; then
    # Target IS the base; preserve the original empty-string result.
    echo ""
  else
    # Not a subpath, return full target
    echo "$target"
  fi
}
# Extract last N meaningful segments from path
# Args: path, num_segments (default 3)
# Returns: abbreviated path containing at most the last N segments,
# with any full UUID segment shortened to its first 8 hex chars
get_last_n_segments() {
  local path="$1"
  local keep="${2:-3}"
  # Empty input abbreviates to the current directory.
  if [ -z "$path" ]; then
    echo "."
    return
  fi
  # Split on '/' into an array.
  local IFS='/'
  local -a parts=()
  read -r -a parts <<< "$path"
  local count=${#parts[@]}
  if [ "$count" -eq 0 ]; then
    echo "$path"
    return
  fi
  # Keep only the trailing $keep segments (all of them if fewer exist).
  local first=$((count - keep))
  if [ "$first" -lt 0 ]; then
    first=0
  fi
  local -a kept=()
  local seg
  for seg in "${parts[@]:$first}"; do
    # Collapse full UUIDs to their first 8 hex characters.
    kept+=("$(echo "$seg" | sed -E 's/([0-9a-f]{8})-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/\1/g')")
  done
  # IFS is still '/', so [*] joins the kept segments with slashes.
  echo "${kept[*]}"
}
# Simplify filename to just show type and extension
# Args: filename
# Returns: "extraction.<ext>" for *_extraction.* files, else the name unchanged
simplify_filename() {
  local name="$1"
  case "$name" in
    *_extraction.*)
      # ${name##*.} strips everything up to the last dot -> extension.
      echo "extraction.${name##*.}"
      ;;
    *)
      echo "$name"
      ;;
  esac
}
# Check if input contains any directories
# Args: $1 - NAME of an array variable (nameref, bash 4.3+)
# Returns: 0 if any element is a directory, 1 otherwise
has_directories() {
  local -n _candidates=$1
  local entry
  for entry in "${_candidates[@]}"; do
    if [ -d "$entry" ]; then
      return 0
    fi
  done
  return 1
}
# Display results in tree-style hierarchical format
# Compatible with bash 3.2+ (no associative arrays)
# Args: $1 - grand total token count across all files
# Reads globals: files, token_counts, scan_base_path
# Outputs: formatted table on stderr
display_tree_table() {
  local total_tokens=$1
  local file_count=${#files[@]}
  # Print table header
  echo "" >&2
  if [ -n "$scan_base_path" ]; then
    echo -e "${BOLD}${CYAN}Token Count Summary${NC} ${CYAN}(relative to: $scan_base_path)${NC}" >&2
  else
    echo -e "${BOLD}${CYAN}Token Count Summary${NC}" >&2
  fi
  print_separator 80 >&2
  printf "${BOLD}%-23s %10s %12s %12s %s${NC}\n" \
    "File" "Tokens" "200K %" "1M %" "Warning" >&2
  print_separator 80 >&2
  # Files arrive pre-sorted by path, so entries from the same directory
  # are adjacent; emit a section each time the directory changes.
  local current_dir=""
  local dir_total=0
  local dir_file_count=0
  local -a dir_file_data=()
  local dir_count=0
  local processed_file_count=0
  local i
  for ((i = 0; i < file_count; i++)); do
    local file="${files[$i]}"
    local tokens="${token_counts[$i]}"
    local file_dir
    file_dir=$(dirname "$file")
    if [ "$file_dir" != "$current_dir" ] && [ -n "$current_dir" ]; then
      # Flush the previous directory's section.
      print_directory_section "$current_dir" "$dir_total" "$dir_file_count" "${dir_file_data[@]}"
      # BUG FIX: ((dir_count++)) returns status 1 when the pre-increment
      # value is 0, which aborts the whole script under `set -e`.
      dir_count=$((dir_count + 1))
      # Add spacing between directories
      if [ "$i" -lt $((file_count - 1)) ]; then
        echo "" >&2
      fi
      # Reset accumulators for the new directory.
      dir_total=0
      dir_file_count=0
      dir_file_data=()
    fi
    current_dir="$file_dir"
    # Accumulate file data as "basename|tokens".
    local basename
    basename=$(basename "$file")
    dir_file_data+=("$basename|$tokens")
    dir_total=$((dir_total + tokens))
    dir_file_count=$((dir_file_count + 1))
    processed_file_count=$((processed_file_count + 1))
  done
  # Flush the final directory.
  if [ -n "$current_dir" ]; then
    print_directory_section "$current_dir" "$dir_total" "$dir_file_count" "${dir_file_data[@]}"
    dir_count=$((dir_count + 1))
  fi
  # Print grand total if multiple directories or multiple files.
  if [ "$dir_count" -gt 1 ] || [ "$file_count" -gt 1 ]; then
    print_separator 80 >&2
    local total_stats
    total_stats=$(calculate_stats "$total_tokens")
    read -r total_pct_200k total_pct_1m total_warning <<<"$total_stats"
    if [ -n "$total_warning" ]; then
      printf "${BOLD}%-23s ${YELLOW}%10s${NC} %11s%% %11s%% ${YELLOW}%s${NC}\n" \
        "GRAND TOTAL" "$total_tokens" "$total_pct_200k" "$total_pct_1m" "$total_warning" >&2
    else
      printf "${BOLD}%-23s ${YELLOW}%10s${NC} %11s%% %11s%% %s${NC}\n" \
        "GRAND TOTAL" "$total_tokens" "$total_pct_200k" "$total_pct_1m" "-" >&2
    fi
  fi
  print_separator 80 >&2
  echo "" >&2
}
# Print a directory section with its files
# Args: dir_path dir_total file_count file_data_array...
#   file_data entries are "basename|tokens" strings.
# Reads global: scan_base_path. Outputs: table rows on stderr.
print_directory_section() {
  local dir_path=$1
  local dir_total=$2
  local num_files=$3
  shift 3
  local -a file_data=("$@")
  # Calculate directory stats
  local dir_stats
  dir_stats=$(calculate_stats "$dir_total")
  read -r dir_pct_200k dir_pct_1m dir_warning <<<"$dir_stats"
  # Abbreviate the directory path to its last 3 segments for display.
  local display_path="$dir_path"
  if [ -n "$scan_base_path" ]; then
    display_path=$(get_relative_path "$scan_base_path" "$dir_path")
  fi
  display_path=$(get_last_n_segments "$display_path" 3)
  # Print directory header
  echo -e "${BOLD}${BLUE}${display_path}/${NC}" >&2
  # Print each file
  local file_num=0
  local file_entry
  for file_entry in "${file_data[@]}"; do
    # BUG FIX: ((file_num++)) returns status 1 on the first iteration
    # (pre-increment value 0) and aborts the script under `set -e`.
    file_num=$((file_num + 1))
    # Parse: basename|tokens
    local basename="${file_entry%|*}"
    local tokens="${file_entry#*|}"
    local display_filename
    display_filename=$(simplify_filename "$basename")
    local file_stats
    file_stats=$(calculate_stats "$tokens")
    read -r pct_200k pct_1m warning <<<"$file_stats"
    # Last file in the section gets the closing tree branch.
    local tree_char="├─"
    if [ "$file_num" -eq "$num_files" ]; then
      tree_char="└─"
    fi
    if [ -n "$warning" ]; then
      printf "${GREEN}%s %-20s${NC} ${YELLOW}%10s${NC} %11s%% %11s%% ${YELLOW}%s${NC}\n" \
        "$tree_char" "$display_filename" "$tokens" "$pct_200k" "$pct_1m" "$warning" >&2
    else
      printf "${GREEN}%s %-20s${NC} ${YELLOW}%10s${NC} %11s%% %11s%% %s\n" \
        "$tree_char" "$display_filename" "$tokens" "$pct_200k" "$pct_1m" "-" >&2
    fi
  done
  # Print directory subtotal (only useful with more than one file).
  if [ "$num_files" -gt 1 ]; then
    if [ -n "$dir_warning" ]; then
      printf "${BOLD} └─ Subtotal%9s ${YELLOW}%10s${NC} %11s%% %11s%% ${YELLOW}%s${NC}\n" \
        "" "$dir_total" "$dir_pct_200k" "$dir_pct_1m" "$dir_warning" >&2
    else
      printf "${BOLD} └─ Subtotal%9s ${YELLOW}%10s${NC} %11s%% %11s%% %s${NC}\n" \
        "" "$dir_total" "$dir_pct_200k" "$dir_pct_1m" "-" >&2
    fi
  fi
}
# Check if GNU parallel is available
# Returns 0 only when a `parallel` binary exists AND identifies itself as
# GNU parallel (other implementations lack the flags used by this script).
has_parallel() {
  command -v parallel &> /dev/null || return 1
  parallel --version 2>&1 | grep -q "GNU parallel"
}
# Process files in parallel using GNU parallel
# Args: array of file paths
# Appends to globals: files, token_counts; updates: total_tokens, successful_files
process_files_parallel() {
  local -a file_list=("$@")
  local temp_dir
  temp_dir=$(mktemp -d)
  local results_file="$temp_dir/results.txt"
  # Child shells spawned by parallel need these to run count_tokens.
  export API_URL API_VERSION MODEL ANTHROPIC_API_KEY
  export -f count_tokens
  # Wrapper emits "file|tokens" only on success, so failed files are
  # silently skipped rather than corrupting the results file.
  count_tokens_wrapper() {
    local file="$1"
    local tokens
    if tokens=$(count_tokens "$file" 2>/dev/null); then
      echo "$file|$tokens"
    fi
  }
  export -f count_tokens_wrapper
  # Run parallel processing (max 6 concurrent jobs to respect API limits)
  # Suppress progress bar to avoid ANSI escape code pollution
  printf "%s\n" "${file_list[@]}" | \
    parallel --will-cite --jobs 6 count_tokens_wrapper {} 2>/dev/null > "$results_file"
  # Fold results into the shared globals.
  while IFS='|' read -r file tokens; do
    if [ -n "$file" ] && [ -n "$tokens" ]; then
      files+=("$file")
      token_counts+=("$tokens")
      total_tokens=$((total_tokens + tokens))
      # BUG FIX: ((successful_files++)) returns status 1 when incrementing
      # from 0, which aborts the whole script under `set -e`.
      successful_files=$((successful_files + 1))
    fi
  done < "$results_file"
  # Parallel output arrives in completion order; re-sort by path (keeping
  # token_counts correspondence) so the display groups directories correctly.
  if [ "${#files[@]}" -gt 0 ]; then
    local -a combined=()
    local idx=0
    while [ "$idx" -lt "${#files[@]}" ]; do
      combined+=("${files[$idx]}|${token_counts[$idx]}")
      idx=$((idx + 1))
    done
    local -a sorted=()
    while IFS= read -r line; do
      sorted+=("$line")
    done < <(printf "%s\n" "${combined[@]}" | sort)
    # Rebuild both arrays from the sorted "path|tokens" entries.
    files=()
    token_counts=()
    local entry
    for entry in "${sorted[@]}"; do
      files+=("${entry%|*}")
      token_counts+=("${entry#*|}")
    done
  fi
  # Cleanup
  rm -rf "$temp_dir"
}
# Main function
# Validates arguments, expands directories into files, counts tokens for
# each file (in parallel when GNU parallel is available), then renders
# the summary table. Exits 1 on usage error or when no files are found.
main() {
  if [ $# -eq 0 ]; then
    echo "Usage: $0 [file1.py] [file2.js] [folder/] ..." >&2
    echo "" >&2
    echo "Count tokens in developer files using Claude API" >&2
    echo "Accepts individual files, directories, or a mix of both" >&2
    echo "Supports: code, config, markup, style files (50+ extensions)" >&2
    exit 1
  fi
  check_dependencies
  check_api_key
  # Globals shared with the display and parallel-processing functions.
  files=()
  token_counts=()
  scan_base_path=""
  local total_tokens=0
  local successful_files=0
  # Base path for relative display: first directory argument, else PWD.
  local arg
  for arg in "$@"; do
    if [ -d "$arg" ]; then
      scan_base_path=$(cd "$arg" && pwd)
      break
    fi
  done
  if [ -z "$scan_base_path" ]; then
    scan_base_path=$(pwd)
  fi
  # Expand directory arguments into their contained developer files.
  local -a all_files=()
  for arg in "$@"; do
    if [ -d "$arg" ]; then
      echo -e "${CYAN}Scanning directory${NC}: $arg" >&2
      while IFS= read -r file; do
        # Use paths as-is from find (avoids path length issues)
        all_files+=("$file")
      done < <(find_developer_files "$arg")
    elif [ -f "$arg" ]; then
      all_files+=("$arg")
    else
      echo -e "${RED}Warning: Skipping non-existent path: $arg${NC}" >&2
    fi
  done
  if [ ${#all_files[@]} -eq 0 ]; then
    echo -e "${RED}Error: No supported files found${NC}" >&2
    exit 1
  fi
  echo -e "${CYAN}Found ${#all_files[@]} file(s) to process${NC}" >&2
  # Parallel processing only pays off past a handful of files.
  if has_parallel && [ ${#all_files[@]} -gt 3 ]; then
    echo -e "${CYAN}Using parallel processing (6 concurrent jobs)${NC}" >&2
    echo "" >&2
    process_files_parallel "${all_files[@]}"
  else
    if [ ${#all_files[@]} -gt 3 ] && ! has_parallel; then
      echo -e "${YELLOW}Note: Install GNU parallel for faster processing${NC}" >&2
    fi
    echo "" >&2
    # Sequential processing
    local file
    for file in "${all_files[@]}"; do
      echo -e "${BLUE}Processing${NC}: $file" >&2
      local file_tokens
      if file_tokens=$(count_tokens "$file"); then
        files+=("$file")
        token_counts+=("$file_tokens")
        total_tokens=$((total_tokens + file_tokens))
        # BUG FIX: ((successful_files++)) returns status 1 when incrementing
        # from 0, which aborted the script under `set -e` after the very
        # first successful file.
        successful_files=$((successful_files + 1))
      fi
    done
  fi
  # Display results in tree format
  if [ $successful_files -gt 0 ]; then
    display_tree_table "$total_tokens"
  fi
}
main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment