Last active
October 1, 2025 16:26
-
-
Save alexfazio/274fb08ad6266eefe9d329df2be80425 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# count_tokens.sh - Count tokens in developer files using Claude API
#
# Usage: ./count_tokens.sh [file1.py] [file2.js] [folder/] ...
#
# Arguments:
#   - Individual files (any text-based developer file)
#   - Directories (recursively finds code, config, markup, and docs)
#   - Mix of files and directories
#
# Supported file types:
#   Code: .py, .js, .ts, .go, .rs, .java, .c, .cpp, .sh, .rb, .php, and 40+ more
#   Config: .yaml, .json, .toml, .ini, .env, Dockerfile, Makefile, etc.
#   Markup: .md, .rst, .html, .xml, .txt
#   Styles: .css, .scss, .sass, .less
#
# Environment Variables:
#   ANTHROPIC_API_KEY - Required API key for authentication (loaded from .env)

# Strict mode: abort on command failure (-e), unset variables (-u), and
# failures anywhere in a pipeline (-o pipefail).
set -euo pipefail
# Load environment variables from a .env file sitting next to this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ENV_FILE="${SCRIPT_DIR}/.env"
if [[ -f "$ENV_FILE" ]]; then
  # Auto-export every variable assigned while the env file is sourced.
  set -a
  # shellcheck disable=SC1090
  source "$ENV_FILE"
  set +a
fi
# Constants
# Token-counting endpoint of the Anthropic Messages API.
readonly API_URL="https://api.anthropic.com/v1/messages/count_tokens"
# Value sent in the required "anthropic-version" request header.
readonly API_VERSION="2023-06-01"
# Model whose tokenizer performs the count.
readonly MODEL="claude-sonnet-4-5-20250929"
# Context-window sizes used for the percentage columns in the report.
readonly CONTEXT_200K=200000
readonly CONTEXT_1M=1000000

# ANSI escape sequences for colored terminal output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly CYAN='\033[0;36m'
readonly BOLD='\033[1m'
readonly NC='\033[0m' # No Color
# Verify that every external tool this script relies on is installed.
# Exits with status 1 and a message on stderr when something is missing.
check_dependencies() {
  local -a missing_deps=()
  local tool
  for tool in curl jq; do
    command -v "$tool" > /dev/null 2>&1 || missing_deps+=("$tool")
  done
  if [ ${#missing_deps[@]} -ne 0 ]; then
    echo -e "${RED}Error: Missing required dependencies: ${missing_deps[*]}${NC}" >&2
    echo "Please install them and try again." >&2
    exit 1
  fi
}
# Ensure ANTHROPIC_API_KEY is set and non-empty; otherwise exit 1 with
# guidance on stderr pointing at the expected .env location.
check_api_key() {
  [ -n "${ANTHROPIC_API_KEY:-}" ] && return 0
  echo -e "${RED}Error: ANTHROPIC_API_KEY environment variable is not set${NC}" >&2
  echo "Please add your API key to ${ENV_FILE}" >&2
  exit 1
}
# Find all developer files in a directory recursively
# Includes: code, config, markup, and documentation files
# Args: directory path
# Returns: list of matching file paths (one per line), sorted lexically so
#          files in the same directory are adjacent (the display code relies
#          on this ordering for grouping).
# NOTE(review): -name matching is case-sensitive, so e.g. "*.r" will not
# match "analysis.R" — confirm whether uppercase extensions are intended.
find_developer_files() {
  local dir="$1"
  if [ ! -d "$dir" ]; then
    echo -e "${RED}Error: Not a directory: $dir${NC}" >&2
    return 1
  fi
  # Find all developer files by extension
  # Organized by category for maintainability
  find "$dir" -type f \( \
    -name "*.py" -o -name "*.pyw" -o -name "*.pyi" -o \
    -name "*.js" -o -name "*.mjs" -o -name "*.cjs" -o \
    -name "*.ts" -o -name "*.tsx" -o -name "*.jsx" -o \
    -name "*.go" -o -name "*.rs" -o -name "*.java" -o \
    -name "*.c" -o -name "*.cpp" -o -name "*.cc" -o -name "*.cxx" -o \
    -name "*.h" -o -name "*.hpp" -o -name "*.hh" -o -name "*.hxx" -o \
    -name "*.cs" -o -name "*.sh" -o -name "*.bash" -o -name "*.zsh" -o \
    -name "*.rb" -o -name "*.php" -o -name "*.swift" -o -name "*.kt" -o \
    -name "*.scala" -o -name "*.clj" -o -name "*.ex" -o -name "*.exs" -o \
    -name "*.erl" -o -name "*.hrl" -o -name "*.hs" -o -name "*.elm" -o \
    -name "*.ml" -o -name "*.fs" -o -name "*.r" -o -name "*.lua" -o \
    -name "*.pl" -o -name "*.pm" -o -name "*.tcl" -o -name "*.vim" -o \
    -name "*.md" -o -name "*.markdown" -o -name "*.rst" -o -name "*.txt" -o \
    -name "*.yaml" -o -name "*.yml" -o -name "*.json" -o -name "*.jsonc" -o \
    -name "*.xml" -o -name "*.toml" -o -name "*.ini" -o -name "*.conf" -o \
    -name "*.cfg" -o -name "*.config" -o -name "*.properties" -o \
    -name "*.html" -o -name "*.htm" -o -name "*.css" -o -name "*.scss" -o \
    -name "*.sass" -o -name "*.less" -o -name "*.vue" -o -name "*.svelte" -o \
    -name "*.sql" -o -name "*.dockerfile" -o -name "Dockerfile*" -o \
    -name "Makefile*" -o -name "*.mk" -o -name ".gitignore" -o \
    -name ".dockerignore" -o -name ".editorconfig" -o -name "*.env*" \
    \) | sort
}
# Count tokens in a single file via the Anthropic count_tokens endpoint.
# Args:    $1 - path to the file to measure
# Outputs: the token count (integer) on stdout; diagnostics on stderr
# Returns: 0 on success; 1 on unreadable file, network, or API error
count_tokens() {
  local file="$1"

  # Check if file exists and is readable
  if [ ! -f "$file" ]; then
    echo -e "${RED}Error: File not found: $file${NC}" >&2
    return 1
  fi
  if [ ! -r "$file" ]; then
    echo -e "${RED}Error: File not readable: $file${NC}" >&2
    return 1
  fi

  # Read file content and escape it as a single JSON string
  local content
  content=$(jq -Rs . < "$file") || {
    echo -e "${RED}Error: Failed to JSON-encode file: $file${NC}" >&2
    return 1
  }

  # Build the request payload; $content is already valid JSON, hence --argjson
  local request
  request=$(jq -n \
    --arg model "$MODEL" \
    --argjson content "$content" \
    '{
      model: $model,
      messages: [
        {
          role: "user",
          content: $content
        }
      ]
    }')

  # Make the API request; -w appends the HTTP status code on its own line.
  # Guard explicitly: curl can fail at the transport level (DNS, TLS, ...)
  local response
  if ! response=$(curl -s -w "\n%{http_code}" \
    -X POST "$API_URL" \
    -H "x-api-key: $ANTHROPIC_API_KEY" \
    -H "anthropic-version: $API_VERSION" \
    -H "content-type: application/json" \
    -d "$request"); then
    echo -e "${RED}Error: API request failed (network error)${NC}" >&2
    return 1
  fi

  # Split the HTTP status (last line) from the response body
  local http_code
  http_code=$(printf '%s\n' "$response" | tail -n1)
  local body
  body=$(printf '%s\n' "$response" | sed '$d')

  # Check for API-level errors
  if [ "$http_code" != "200" ]; then
    echo -e "${RED}Error: API request failed with status $http_code${NC}" >&2
    printf '%s\n' "$body" | jq -r '.error.message // .' >&2
    return 1
  fi

  # Extract and validate the token count (reject missing/null field instead
  # of silently emitting the string "null")
  local token_count
  token_count=$(printf '%s\n' "$body" | jq -r '.input_tokens')
  if [ -z "$token_count" ] || [ "$token_count" = "null" ]; then
    echo -e "${RED}Error: Unexpected API response (missing input_tokens)${NC}" >&2
    return 1
  fi
  echo "$token_count"
}
# Print a horizontal rule of `width` box-drawing characters (U+2500).
# GNU tr is byte-oriented, so the previous `tr ' ' '─'` translated spaces to
# only the FIRST byte of the multi-byte character, producing mojibake on
# Linux; build the line with a loop instead, which is multibyte-safe.
# Args: $1 - number of characters
print_separator() {
  local width="$1"
  local line=""
  local i
  for ((i = 0; i < width; i++)); do
    line+='─'
  done
  printf '%s\n' "$line"
}
# Compute context-window usage percentages and a warning tag.
# Args:    $1 - token count (integer)
# Globals: CONTEXT_200K, CONTEXT_1M (read)
# Outputs: "pct_200k pct_1m warning" on stdout; warning is empty, "! 200K",
#          "! 1M", or "! Both" when usage exceeds 25% of a window
calculate_stats() {
  local tokens=$1
  local pct_200k pct_1m
  # Pass values with -v instead of interpolating shell variables into the
  # awk program text (robust against unexpected content in $tokens).
  pct_200k=$(awk -v t="$tokens" -v c="$CONTEXT_200K" 'BEGIN {printf "%.2f", (t / c) * 100}')
  pct_1m=$(awk -v t="$tokens" -v c="$CONTEXT_1M" 'BEGIN {printf "%.2f", (t / c) * 100}')
  local warning=""
  if awk -v p="$pct_200k" 'BEGIN {exit !(p > 25)}'; then
    warning="! 200K"
  fi
  if awk -v p="$pct_1m" 'BEGIN {exit !(p > 25)}'; then
    if [ -n "$warning" ]; then
      warning="! Both"
    else
      warning="! 1M"
    fi
  fi
  echo "$pct_200k $pct_1m $warning"
}
# Compute the path of `target` relative to `base`.
# Args:    $1 - base directory, $2 - target path
# Outputs: target with the base prefix stripped when target lives under
#          base; otherwise the unchanged target path
get_relative_path() {
  local base="$1" target="$2"
  local abs_base
  # Resolve base to an absolute path; fall back to the raw value when the
  # directory cannot be entered.
  if ! abs_base=$(cd "$base" 2>/dev/null && pwd); then
    abs_base="$base"
  fi
  abs_base="${abs_base%/}"
  case "$target" in
    "$abs_base"*)
      # Subpath: drop the base prefix and any leading slash
      local trimmed="${target#"$abs_base"}"
      echo "${trimmed#/}"
      ;;
    *)
      # Not a subpath: return the target untouched
      echo "$target"
      ;;
  esac
}
# Return the last N segments of a path, joined with "/".
# Full UUIDs inside a segment are abbreviated to their first 8 hex digits.
# Args: $1 - path, $2 - number of segments to keep (default 3)
get_last_n_segments() {
  local path="$1"
  local keep="${2:-3}"
  # Empty path shows as the current directory
  if [ -z "$path" ]; then
    echo "."
    return
  fi
  local IFS='/'
  local -a parts=()
  read -r -a parts <<< "$path"
  local count=${#parts[@]}
  if [ "$count" -eq 0 ]; then
    echo "$path"
    return
  fi
  # First index to keep; clamp at the start of the array
  local first=$((count - keep))
  if [ "$first" -lt 0 ]; then
    first=0
  fi
  local out="" seg idx
  for ((idx = first; idx < count; idx++)); do
    # Abbreviate any full UUID to its leading 8-hex-digit group
    seg=$(printf '%s\n' "${parts[$idx]}" | sed -E 's/([0-9a-f]{8})-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/\1/g')
    if [ -z "$out" ]; then
      out="$seg"
    else
      out="$out/$seg"
    fi
  done
  echo "$out"
}
# Shorten generated "*_extraction.<ext>" filenames to "extraction.<ext>"
# for display; every other filename is returned untouched.
# Args: $1 - basename to simplify
simplify_filename() {
  local name="$1"
  case "$name" in
    *_extraction.*)
      # Keep only the extension after the last dot
      echo "extraction.${name##*.}"
      ;;
    *)
      echo "$name"
      ;;
  esac
}
# Return success (0) if the named array contains at least one directory.
# Args: $1 - NAME of an array variable (passed by reference)
# NOTE(review): `local -n` needs bash 4.3+, while the display code claims
# bash 3.2 compatibility — confirm the minimum supported bash version.
has_directories() {
  # Use an unlikely nameref name so it cannot collide with the caller's
  # array name; the loop variable is now `local` so it no longer leaks
  # into the global scope.
  local -n _hd_items=$1
  local entry
  for entry in "${_hd_items[@]}"; do
    if [ -d "$entry" ]; then
      return 0
    fi
  done
  return 1
}
# Display results in a tree-style hierarchical table on stderr.
# Reads globals: files, token_counts, scan_base_path (populated by main).
# Args: $1 - grand-total token count across all files
# Compatible with bash 3.2+ (no associative arrays).
display_tree_table() {
  local total_tokens=$1
  local file_count=${#files[@]}

  # Print table header
  echo "" >&2
  if [ -n "$scan_base_path" ]; then
    echo -e "${BOLD}${CYAN}Token Count Summary${NC} ${CYAN}(relative to: $scan_base_path)${NC}" >&2
  else
    echo -e "${BOLD}${CYAN}Token Count Summary${NC}" >&2
  fi
  print_separator 80 >&2
  printf "${BOLD}%-23s %10s %12s %12s %s${NC}\n" \
    "File" "Tokens" "200K %" "1M %" "Warning" >&2
  print_separator 80 >&2

  # Process files grouped by directory. Files are pre-sorted by path, so
  # entries of the same directory are adjacent.
  local current_dir=""
  local dir_total=0
  local dir_file_count=0
  local -a dir_file_data=()
  local dir_count=0
  local i
  for ((i = 0; i < file_count; i++)); do
    local file="${files[$i]}"
    local tokens="${token_counts[$i]}"
    local file_dir
    file_dir=$(dirname "$file")

    # Flush the previous directory's section when the directory changes
    if [ "$file_dir" != "$current_dir" ] && [ -n "$current_dir" ]; then
      print_directory_section "$current_dir" "$dir_total" "$dir_file_count" "${dir_file_data[@]}"
      # BUGFIX: ((dir_count++)) returns status 1 when the value is 0 and
      # aborts the whole script under `set -e`; use plain assignment.
      dir_count=$((dir_count + 1))
      # Blank line between directory sections
      if [ "$i" -lt $((file_count - 1)) ]; then
        echo "" >&2
      fi
      # Reset accumulators for the new directory
      dir_total=0
      dir_file_count=0
      dir_file_data=()
    fi

    current_dir="$file_dir"
    # Accumulate file data as "basename|tokens"
    local base
    base=$(basename "$file")
    dir_file_data+=("$base|$tokens")
    dir_total=$((dir_total + tokens))
    dir_file_count=$((dir_file_count + 1))
  done

  # Flush the final directory
  if [ -n "$current_dir" ]; then
    print_directory_section "$current_dir" "$dir_total" "$dir_file_count" "${dir_file_data[@]}"
    dir_count=$((dir_count + 1))
  fi

  # Grand total when more than one directory or file was processed
  if [ "$dir_count" -gt 1 ] || [ "$file_count" -gt 1 ]; then
    print_separator 80 >&2
    local total_stats
    total_stats=$(calculate_stats "$total_tokens")
    local total_pct_200k total_pct_1m total_warning
    read -r total_pct_200k total_pct_1m total_warning <<<"$total_stats"
    if [ -n "$total_warning" ]; then
      printf "${BOLD}%-23s ${YELLOW}%10s${NC} %11s%% %11s%% ${YELLOW}%s${NC}\n" \
        "GRAND TOTAL" "$total_tokens" "$total_pct_200k" "$total_pct_1m" "$total_warning" >&2
    else
      printf "${BOLD}%-23s ${YELLOW}%10s${NC} %11s%% %11s%% %s${NC}\n" \
        "GRAND TOTAL" "$total_tokens" "$total_pct_200k" "$total_pct_1m" "-" >&2
    fi
  fi
  print_separator 80 >&2
  echo "" >&2
}
# Print one directory section: header, per-file rows, optional subtotal.
# Reads global: scan_base_path.
# Args: $1 - dir path, $2 - dir token total, $3 - file count,
#       $4.. - "basename|tokens" entries
print_directory_section() {
  local dir_path=$1
  local dir_total=$2
  local num_files=$3
  shift 3
  local -a file_data=("$@")

  # Directory-level stats
  local dir_stats
  dir_stats=$(calculate_stats "$dir_total")
  local dir_pct_200k dir_pct_1m dir_warning
  read -r dir_pct_200k dir_pct_1m dir_warning <<<"$dir_stats"

  # Show at most the last 3 path segments, relative to the scan base
  local display_path="$dir_path"
  if [ -n "$scan_base_path" ]; then
    display_path=$(get_relative_path "$scan_base_path" "$dir_path")
  fi
  display_path=$(get_last_n_segments "$display_path" 3)

  # Print directory header
  echo -e "${BOLD}${BLUE}${display_path}/${NC}" >&2

  # Print each file row
  local file_num=0
  local file_entry
  for file_entry in "${file_data[@]}"; do
    # BUGFIX: ((file_num++)) returns status 1 on the first iteration
    # (value 0) and aborts the script under `set -e`; use assignment.
    file_num=$((file_num + 1))
    # Parse "basename|tokens" (renamed local so it no longer shadows the
    # basename command)
    local name="${file_entry%|*}"
    local tokens="${file_entry#*|}"
    local display_filename
    display_filename=$(simplify_filename "$name")
    local file_stats
    file_stats=$(calculate_stats "$tokens")
    local pct_200k pct_1m warning
    read -r pct_200k pct_1m warning <<<"$file_stats"
    # Last file in the directory gets the closing tree branch
    local tree_char="├─"
    if [ "$file_num" -eq "$num_files" ]; then
      tree_char="└─"
    fi
    if [ -n "$warning" ]; then
      printf "${GREEN}%s %-20s${NC} ${YELLOW}%10s${NC} %11s%% %11s%% ${YELLOW}%s${NC}\n" \
        "$tree_char" "$display_filename" "$tokens" "$pct_200k" "$pct_1m" "$warning" >&2
    else
      printf "${GREEN}%s %-20s${NC} ${YELLOW}%10s${NC} %11s%% %11s%% %s\n" \
        "$tree_char" "$display_filename" "$tokens" "$pct_200k" "$pct_1m" "-" >&2
    fi
  done

  # Subtotal only when the directory contains more than one file
  if [ "$num_files" -gt 1 ]; then
    if [ -n "$dir_warning" ]; then
      printf "${BOLD} └─ Subtotal%9s ${YELLOW}%10s${NC} %11s%% %11s%% ${YELLOW}%s${NC}\n" \
        "" "$dir_total" "$dir_pct_200k" "$dir_pct_1m" "$dir_warning" >&2
    else
      printf "${BOLD} └─ Subtotal%9s ${YELLOW}%10s${NC} %11s%% %11s%% %s${NC}\n" \
        "" "$dir_total" "$dir_pct_200k" "$dir_pct_1m" "-" >&2
    fi
  fi
}
# Detect GNU parallel specifically (the moreutils tool of the same name
# would not work here, so the version banner is checked too).
has_parallel() {
  command -v parallel > /dev/null 2>&1 \
    && parallel --version 2>&1 | grep -q "GNU parallel"
}
# Count tokens for many files concurrently with GNU parallel.
# Appends to the global files/token_counts arrays and updates the caller's
# total_tokens and successful_files (dynamic scoping).
# Args: file paths to process
process_files_parallel() {
  local -a file_list=("$@")
  local temp_dir
  temp_dir=$(mktemp -d)
  local results_file="$temp_dir/results.txt"

  # The parallel workers run in fresh shells, so everything count_tokens
  # needs must be exported.
  export API_URL API_VERSION MODEL ANTHROPIC_API_KEY
  export -f count_tokens

  # Wrapper: emit "file|tokens" only on success so failed files simply
  # drop out of the results.
  count_tokens_wrapper() {
    local file="$1"
    local tokens
    if tokens=$(count_tokens "$file" 2>/dev/null); then
      echo "$file|$tokens"
    fi
  }
  export -f count_tokens_wrapper

  # Max 6 concurrent jobs to respect API rate limits; stderr suppressed to
  # avoid progress/ANSI noise. `|| true` keeps `set -e` from killing the
  # whole run if parallel itself exits non-zero.
  printf "%s\n" "${file_list[@]}" | \
    parallel --will-cite --jobs 6 count_tokens_wrapper {} 2>/dev/null > "$results_file" || true

  # Collect results into the global arrays
  while IFS='|' read -r file tokens; do
    if [ -n "$file" ] && [ -n "$tokens" ]; then
      files+=("$file")
      token_counts+=("$tokens")
      total_tokens=$((total_tokens + tokens))
      # BUGFIX: ((successful_files++)) returns status 1 when the value is
      # 0 and aborts the script under `set -e`; use plain assignment.
      successful_files=$((successful_files + 1))
    fi
  done < "$results_file"

  # Parallel finishes out of order: re-sort by path while keeping
  # files/token_counts aligned, so directory grouping works in the display.
  if [ "${#files[@]}" -gt 0 ]; then
    local -a combined=()
    local idx
    for ((idx = 0; idx < ${#files[@]}; idx++)); do
      combined+=("${files[$idx]}|${token_counts[$idx]}")
    done
    local -a sorted_entries=()
    local line
    while IFS= read -r line; do
      sorted_entries+=("$line")
    done < <(printf "%s\n" "${combined[@]}" | sort)
    # Rebuild the parallel arrays from the sorted "path|tokens" entries
    files=()
    token_counts=()
    local entry
    for entry in "${sorted_entries[@]}"; do
      files+=("${entry%|*}")
      token_counts+=("${entry#*|}")
    done
  fi

  # Cleanup
  rm -rf "$temp_dir"
}
# Main entry point: validate environment, expand arguments into a file
# list, count tokens (parallel when possible), and print the report.
main() {
  if [ $# -eq 0 ]; then
    echo "Usage: $0 [file1.py] [file2.js] [folder/] ..." >&2
    echo "" >&2
    echo "Count tokens in developer files using Claude API" >&2
    echo "Accepts individual files, directories, or a mix of both" >&2
    echo "Supports: code, config, markup, style files (50+ extensions)" >&2
    exit 1
  fi

  check_dependencies
  check_api_key

  # Globals shared with the display and parallel-processing functions
  files=()
  token_counts=()
  scan_base_path=""
  local total_tokens=0
  local successful_files=0

  # Base path for relative display: first directory argument, else PWD
  local arg
  for arg in "$@"; do
    if [ -d "$arg" ]; then
      scan_base_path=$(cd "$arg" && pwd)
      break
    fi
  done
  if [ -z "$scan_base_path" ]; then
    scan_base_path=$(pwd)
  fi

  # Expand directory arguments into individual files
  local -a all_files=()
  local file
  for arg in "$@"; do
    if [ -d "$arg" ]; then
      echo -e "${CYAN}Scanning directory${NC}: $arg" >&2
      while IFS= read -r file; do
        # Use paths as-is from find (avoids path length issues)
        all_files+=("$file")
      done < <(find_developer_files "$arg")
    elif [ -f "$arg" ]; then
      all_files+=("$arg")
    else
      echo -e "${RED}Warning: Skipping non-existent path: $arg${NC}" >&2
    fi
  done

  # Check if we found any files
  if [ ${#all_files[@]} -eq 0 ]; then
    echo -e "${RED}Error: No supported files found${NC}" >&2
    exit 1
  fi
  echo -e "${CYAN}Found ${#all_files[@]} file(s) to process${NC}" >&2

  # Parallel when GNU parallel is available and the batch is big enough
  if has_parallel && [ ${#all_files[@]} -gt 3 ]; then
    echo -e "${CYAN}Using parallel processing (6 concurrent jobs)${NC}" >&2
    echo "" >&2
    process_files_parallel "${all_files[@]}"
  else
    if [ ${#all_files[@]} -gt 3 ] && ! has_parallel; then
      echo -e "${YELLOW}Note: Install GNU parallel for faster processing${NC}" >&2
    fi
    echo "" >&2
    # Sequential processing
    local file_tokens
    for file in "${all_files[@]}"; do
      echo -e "${BLUE}Processing${NC}: $file" >&2
      if file_tokens=$(count_tokens "$file"); then
        files+=("$file")
        token_counts+=("$file_tokens")
        total_tokens=$((total_tokens + file_tokens))
        # BUGFIX: ((successful_files++)) returns status 1 when the value
        # is 0 and aborts the script under `set -e`; use assignment.
        successful_files=$((successful_files + 1))
      fi
    done
  fi

  # Display results in tree format
  if [ "$successful_files" -gt 0 ]; then
    display_tree_table "$total_tokens"
  fi
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment