andrewginns · June 3, 2025 16:00
diff --git a/code-listing.sh b/code-listing.sh
 #!/bin/bash

 # Markdown can lag behind the code; set to false to leave .md files out.
 include_md_files=true

 # When true, README files are written ahead of all other file contents.
 prioritize_readme=true

 # Per-file cap in bytes (1 MiB): anything larger is skipped.
 max_file_size=1048576
 # Cap in bytes (50 MiB) on the combined size of the files written out.
 max_total_size=52428800

 # Switches belonging to the whitelist_* arrays defined further down.
 enable_whitelist_files=true
 enable_whitelist_extensions=true
 enable_whitelist_patterns=true

 # When true, discovery and include/exclude decisions are traced on stderr.
 debug_mode=false

 ########################   EXCLUSION CONFIG   ##################################
 # Names and globs that keep files out of the listing.
 #
 # Entries without a '*' (such as 'venv') are handled as directory names and
 # are filtered out of the file search itself. Entries containing a '*' (and
 # dot-prefixed names that contain a second dot) are glob patterns matched
 # against each path after the files have been found.
 #
 exclude_patterns=(
    # Bulky directories
    'venv'
    '.venv'
    '.git'
    '__pycache__'
    'node_modules'

    # File types that are never listed
    '*.csv'
    '*.pyc'
    '*.lock'
    '*.egg-info'
    '*.log'

    # Hidden files and environment files
    '.*'
    '.env'
 )
 ######################################################################

 # Exact file names that are always candidates for inclusion.
 whitelist_files=(
    "Dockerfile"
    "Makefile"
    "gcp-config.json"
    "package.json"
    "pyproject.toml"
    "requirements.txt"
 )

 # File extensions (without the dot) that are candidates for inclusion.
 whitelist_extensions=("py" "txt" "toml" "yaml" "yml" "json" "sh")

 # Glob patterns matched against the relative path.
 whitelist_patterns=(
    ".github/workflows/*.yaml"
    ".github/workflows/*.yml"
    "*.cfg"
    "*.ini"
 )


 # Associative arrays give constant-time membership tests for the two lists.
 declare -A extension_lookup
 for ext in "${whitelist_extensions[@]}"; do extension_lookup["$ext"]=1; done

 declare -A filename_lookup
 for file in "${whitelist_files[@]}"; do filename_lookup["$file"]=1; done

 # Succeed when the file's size can be read and does not exceed max_file_size.
 # Arguments: $1 - path of the file to measure
 # Globals:   max_file_size (read), OSTYPE (read)
 # Returns:   0 if within the limit; 1 if stat fails or the file is too large
 #            (the latter also prints a notice on stderr)
 check_file_size() {
    local target="$1"
    local bytes

    # BSD stat (macOS) and GNU stat spell the "size in bytes" format differently.
    case "$OSTYPE" in
        darwin*) bytes=$(stat -f%z "$target" 2>/dev/null) || return 1 ;;
        *)       bytes=$(stat -c%s "$target" 2>/dev/null) || return 1 ;;
    esac

    if [[ $bytes -le $max_file_size ]]; then
        return 0
    fi

    echo "Skipping large file: $target (${bytes} bytes)" >&2
    return 1
 }

 # Decide whether a file's contents can safely be printed as text.
 #
 # The MIME type alone is not enough: file(1) reports JSON and several other
 # textual formats as application/*, which would wrongly mark whitelisted files
 # as binary. So the charset reported alongside the type is consulted as well.
 #
 # Arguments: $1 - path of the file to inspect
 # Returns:   0 for text/* types or any non-binary charset, 1 otherwise
 #            (including when file(1) cannot read the path)
 is_text_file() {
    local filepath="$1"
    local mime

    # Typical output: "application/json; charset=us-ascii"
    mime=$(file -b --mime "$filepath" 2>/dev/null) || return 1

    case "$mime" in
        text/*) return 0 ;;
        *charset=binary*|*charset=unknown*) return 1 ;;
        *charset=*) return 0 ;;
        *) return 1 ;;
    esac
 }

 # Write a file's contents to stdout, or a fixed placeholder line when
 # is_text_file does not recognise it as text.
 # Arguments: $1 - path of the file to print
 safe_cat() {
    local target="$1"

    if ! is_text_file "$target"; then
        echo "[Binary file - content not displayed]"
        return
    fi

    cat "$target"
 }

 # Print every regular file below the current directory, sorted, one per line,
 # skipping any file or directory whose name matches an exclude_patterns entry.
 # Used in place of `tree -I` on systems where tree is not installed.
 #
 # The exclude entries are shell globs, so they are handed to find's -name test
 # rather than to a regular-expression matcher (as a regex, '.*' would match
 # every path and empty the listing).
 #
 # Globals: exclude_patterns (read)
 # Outputs: the sorted list of paths on stdout
 list_files_fallback() {
    local pattern
    local -a name_tests=()

    for pattern in "${exclude_patterns[@]}"; do
        (( ${#name_tests[@]} )) && name_tests+=("-o")
        name_tests+=("-name" "$pattern")
    done

    if (( ${#name_tests[@]} )); then
        # -mindepth 1 stops the '.*' pattern from matching (and pruning) the
        # starting directory '.' itself.
        find . -mindepth 1 \( "${name_tests[@]}" \) -prune -o -type f -print | sort
    else
        find . -type f | sort
    fi
 }

 current_folder_name=$(basename "$PWD")
 output_file="${current_folder_name}_code_listing.txt"

 # Never overwrite an earlier listing: switch to a timestamped name instead.
 if [[ -f "$output_file" ]]; then
    echo "Warning: $output_file already exists. Creating new file with datestamp."
    timestamp=$(date +"%Y%m%d_%H%M%S")
    output_file="${current_folder_name}_code_listing_${timestamp}.txt"
    echo "New output file: $output_file"
 fi

 current_dir=$(pwd)

 echo "### Folder structure of $current_dir ###" > "$output_file"
 echo "Generated on: $(date)" >> "$output_file"
 echo "" >> "$output_file"

 # tree -I expects a single '|'-separated list of patterns.
 exclude_string=""
 for pattern in "${exclude_patterns[@]}"; do
    exclude_string="${exclude_string:+${exclude_string}|}${pattern}"
 done

 if command -v tree >/dev/null 2>&1; then
    tree -I "$exclude_string" >> "$output_file"
 else
    echo "Warning: tree command not found. Using find as fallback." >&2
    list_files_fallback >> "$output_file"
 fi

 echo "" >> "$output_file"
 echo "### File Contents ###" >> "$output_file"
 echo "" >> "$output_file"

 # Running total of the bytes of file content written so far.
 total_size=0

 # Decide whether a path found by the file search belongs in the listing.
 #
 # Arguments: $1 - path relative to the current directory (no leading "./")
 # Globals:   output_file, exclude_patterns, whitelist_patterns,
 #            filename_lookup, extension_lookup, include_md_files,
 #            enable_whitelist_files, enable_whitelist_extensions,
 #            enable_whitelist_patterns (all read)
 # Returns:   0 to include the file, 1 to leave it out
 #
 # A file is rejected when it is a directory, empty, a generated listing,
 # hidden, matched by exclude_patterns, or rejected by check_file_size.
 # Otherwise it must match the file-name, extension, markdown, or pattern
 # whitelist to be included.
 should_include_file() {
    local filepath="$1"
    local filename="${filepath##*/}"
    local extension=""
    local pattern
    local allow_hidden=false

    # Take the extension from the basename only, and only when it has a dot;
    # otherwise a file literally named "json" would count as a .json file.
    [[ "$filename" == *.* ]] && extension="${filename##*.}"

    [[ -d "$filepath" ]] && return 1
    [[ "$filepath" == "$output_file" ]] && return 1
    [[ "$filepath" == *"_code_listing"*.txt ]] && return 1
    [[ "$filepath" == .env* ]] && return 1
    [[ ! -s "$filepath" ]] && return 1

    # A whitelist pattern that itself names a hidden path (for example
    # ".github/workflows/*.yaml") is an explicit opt-in; without this the
    # hidden-path rules below would make such entries unreachable.
    if [[ "$enable_whitelist_patterns" == "true" ]]; then
        for pattern in "${whitelist_patterns[@]}"; do
            if [[ "$pattern" == .* && "$filepath" == $pattern ]]; then
                allow_hidden=true
                break
            fi
        done
    fi

    if [[ "$allow_hidden" != "true" ]]; then
        [[ "$filepath" == .* || "$filepath" == */.* ]] && return 1
    fi

    for pattern in "${exclude_patterns[@]}"; do
        if [[ "$pattern" != *"*"* && "$pattern" != .*\.* ]]; then
            # Directory name: reject anything located beneath it.
            [[ "$filepath" == *"/${pattern}/"* ]] && return 1
            [[ "$filepath" == ${pattern}/* ]] && return 1
        else
            # Hidden-file globs do not apply to explicitly whitelisted paths.
            [[ "$allow_hidden" == "true" && "$pattern" == .* ]] && continue
            case "$filepath" in
                $pattern) return 1 ;;
            esac
        fi
    done

    check_file_size "$filepath" || return 1

    # The two switches default to enabled when unset.
    if [[ "${enable_whitelist_files:-true}" == "true" ]]; then
        [[ -n "${filename_lookup[$filename]}" ]] && return 0
    fi

    if [[ "${enable_whitelist_extensions:-true}" == "true" && -n "$extension" ]]; then
        [[ -n "${extension_lookup[$extension]}" ]] && return 0
    fi

    [[ "$include_md_files" == "true" && "$extension" == "md" ]] && return 0

    if [[ "$enable_whitelist_patterns" == "true" ]]; then
        for pattern in "${whitelist_patterns[@]}"; do
            [[ "$filepath" == $pattern ]] && return 0
        done
    fi

    return 1
 }

 # Append one file, wrapped in header and footer markers, to the listing.
 #
 # Arguments: $1 - path of the file, relative to the current directory
 # Globals:   output_file, max_total_size (read); total_size, file_count (updated)
 # Returns:   0 when the file was written or skipped because its size could
 #            not be read; 1 when adding it would exceed max_total_size
 #            (callers stop processing on a non-zero status)
 process_file() {
    local rel_path="$1"
    local file_size

    # Try BSD stat first and fall back to GNU stat. Declared separately so the
    # result of the command substitution is not hidden by `local`.
    file_size=$(stat -f%z "$rel_path" 2>/dev/null || stat -c%s "$rel_path" 2>/dev/null)

    # An unreadable or vanished file yields no number; skip it instead of
    # feeding an empty value into the arithmetic below.
    if [[ ! "$file_size" =~ ^[0-9]+$ ]]; then
        echo "Warning: Could not determine size of $rel_path. Skipping." >&2
        return 0
    fi

    if (( total_size + file_size > max_total_size )); then
        echo "Warning: Reached maximum total output size limit. Stopping file processing." >&2
        return 1
    fi

    {
        echo "### Contents of: $rel_path ###"
        safe_cat "$rel_path"
        echo ""
        echo "### End of: $rel_path ###"
        echo ""
    } >> "$output_file"

    total_size=$((total_size + file_size))
    file_count=$((file_count + 1))
    return 0
 }

 declare -a files_to_process=()
 declare -a readme_files=()
 declare -a all_files_found=()

 directory_patterns=()
 file_patterns=()

 # Fill all_files_found with every regular file below the current directory,
 # without descending into excluded directories.
 #
 # exclude_patterns entries with no '*' (and not of the form ".x.y") are taken
 # as directory names. They are pruned during traversal, so large trees such as
 # node_modules are never walked. A plain `-not -path` test only hides the
 # results and still descends into them. The remaining entries are collected in
 # file_patterns.
 #
 # Globals: exclude_patterns (read); directory_patterns, file_patterns,
 #          all_files_found (reset and filled)
 collect_candidate_files() {
    local pattern dir filepath
    local -a prune_tests=()
    local -a find_args=(".")

    directory_patterns=()
    file_patterns=()
    all_files_found=()

    for pattern in "${exclude_patterns[@]}"; do
        if [[ "$pattern" != *"*"* && "$pattern" != .*\.* ]]; then
            directory_patterns+=("$pattern")
        else
            file_patterns+=("$pattern")
        fi
    done

    # "*/name" matches the directory at any depth, including directly under '.'.
    for dir in "${directory_patterns[@]}"; do
        (( ${#prune_tests[@]} )) && prune_tests+=("-o")
        prune_tests+=("-path" "*/${dir}")
    done
    if (( ${#prune_tests[@]} )); then
        find_args+=("-type" "d" "(" "${prune_tests[@]}" ")" "-prune" "-o")
    fi
    find_args+=("-type" "f" "-print0")

    # NUL-delimited so that unusual file names survive intact.
    while IFS= read -r -d '' filepath; do
        all_files_found+=("${filepath#./}")
    done < <(find "${find_args[@]}")
 }

 collect_candidate_files

 if [[ "$debug_mode" == "true" ]]; then
    echo "=== DEBUG: All files found (${#all_files_found[@]} total) ===" >&2
    for file in "${all_files_found[@]}"; do
        echo "Found: $file" >&2
    done
    echo "=== DEBUG: Checking inclusion criteria ===" >&2
 fi

 # Split the accepted files into READMEs (written first) and everything else.
 for rel_path in "${all_files_found[@]}"; do
    if ! should_include_file "$rel_path"; then
        if [[ "$debug_mode" == "true" ]]; then
            echo "EXCLUDE: $rel_path" >&2
        fi
        continue
    fi

    if [[ "$debug_mode" == "true" ]]; then
        echo "INCLUDE: $rel_path" >&2
    fi

    base_name="${rel_path##*/}"
    if [[ "$prioritize_readme" == "true" ]] && [[ "${base_name,,}" =~ ^readme(\.(md|txt|rst))?$ ]]; then
        readme_files+=("$rel_path")
    else
        files_to_process+=("$rel_path")
    fi
 done

 if [[ "$debug_mode" == "true" ]]; then
    echo "=== DEBUG: Final file counts ===" >&2
    echo "README files: ${#readme_files[@]}" >&2
    echo "Other files: ${#files_to_process[@]}" >&2
    echo "Total to process: $((${#readme_files[@]} + ${#files_to_process[@]}))" >&2
    echo "=== DEBUG: Starting file processing ===" >&2
 fi

 # process_file returns non-zero once the total size limit is hit; stop there.
 file_count=0
 if [[ "$prioritize_readme" == "true" ]]; then
    for readme_file in "${readme_files[@]}"; do
        if ! process_file "$readme_file"; then
            break
        fi
    done
 fi

 for file_path in "${files_to_process[@]}"; do
    if ! process_file "$file_path"; then
        break
    fi
 done

 echo "### Summary ###"
 echo "Total files processed: $file_count"
 echo "Total output size: $total_size bytes"
 echo "Generated on: $(date)"

 echo "Code listing generated successfully: $output_file"
 echo "Files processed: $file_count"
 echo "Total size: $total_size bytes"
	#!/bin/bash

	# Markdown can lag behind the code; set to false to leave .md files out.
	include_md_files=true

	# When true, README files are written ahead of all other file contents.
	prioritize_readme=true

	# Per-file cap in bytes (1 MiB): anything larger is skipped.
	max_file_size=1048576
	# Cap in bytes (50 MiB) on the combined size of the files written out.
	max_total_size=52428800

	# Switches belonging to the whitelist_* arrays defined further down.
	enable_whitelist_files=true
	enable_whitelist_extensions=true
	enable_whitelist_patterns=true

	# When true, discovery and include/exclude decisions are traced on stderr.
	debug_mode=false

	######################## EXCLUSION CONFIG ##################################
	# Names and globs that keep files out of the listing.
	#
	# Entries without a '*' (such as 'venv') are handled as directory names and
	# are filtered out of the file search itself. Entries containing a '*' (such
	# as '*.pyc') are glob patterns matched against each path after the files
	# have been found. The leading '*' is required: a bare '.pyc' would be read
	# as a directory name.
	#
	exclude_patterns=(
	    # Bulky directories
	    'venv' '.venv' '.git' '__pycache__' 'node_modules'

	    # File types that are never listed
	    '*.csv' '*.pyc' '*.lock' '*.egg-info' '*.log'

	    # Hidden files and environment files
	    '.*' '.env'
	)
	######################################################################

	# Exact file names that are always candidates for inclusion.
	whitelist_files=("Dockerfile" "Makefile" "gcp-config.json" "package.json" "pyproject.toml" "requirements.txt")

	# File extensions (without the dot) that are candidates for inclusion.
	whitelist_extensions=("py" "txt" "toml" "yaml" "yml" "json" "sh")

	# Glob patterns matched against the relative path.
	whitelist_patterns=(".github/workflows/*.yaml" ".github/workflows/*.yml" "*.cfg" "*.ini")


	# Associative arrays give constant-time membership tests for the two lists.
	declare -A extension_lookup
	for ext in "${whitelist_extensions[@]}"; do
	    extension_lookup["$ext"]=1
	done

	declare -A filename_lookup
	for file in "${whitelist_files[@]}"; do
	    filename_lookup["$file"]=1
	done

	# Succeed when the file's size can be read and does not exceed max_file_size.
	# Arguments: $1 - path of the file to measure
	# Globals:   max_file_size (read), OSTYPE (read)
	# Returns:   0 if within the limit; 1 if stat fails or the file is too large
	#            (the latter also prints a notice on stderr)
	check_file_size() {
	    local filepath="$1"
	    local file_size

	    # BSD stat (macOS) and GNU stat spell the "size in bytes" format differently.
	    # The operator must be a real `||`; an escaped `\|\|` is run as a command.
	    if [[ "$OSTYPE" == "darwin"* ]]; then
	        file_size=$(stat -f%z "$filepath" 2>/dev/null) || return 1
	    else
	        file_size=$(stat -c%s "$filepath" 2>/dev/null) || return 1
	    fi

	    if [[ $file_size -gt $max_file_size ]]; then
	        echo "Skipping large file: $filepath (${file_size} bytes)" >&2
	        return 1
	    fi
	    return 0
	}

	# Decide whether a file's contents can safely be printed as text.
	#
	# The MIME type alone is not enough: file(1) reports JSON and several other
	# textual formats as application/*, which would wrongly mark whitelisted files
	# as binary. So the charset reported alongside the type is consulted as well.
	#
	# Arguments: $1 - path of the file to inspect
	# Returns:   0 for text/* types or any non-binary charset, 1 otherwise
	#            (including when file(1) cannot read the path)
	is_text_file() {
	    local filepath="$1"
	    local mime

	    # Typical output: "application/json; charset=us-ascii"
	    mime=$(file -b --mime "$filepath" 2>/dev/null) || return 1

	    case "$mime" in
	        text/*) return 0 ;;
	        *charset=binary*|*charset=unknown*) return 1 ;;
	        *charset=*) return 0 ;;
	        *) return 1 ;;
	    esac
	}

	# Write a file's contents to stdout, or a fixed placeholder line when
	# is_text_file does not recognise it as text.
	# Arguments: $1 - path of the file to print
	safe_cat() {
	    local target="$1"

	    if ! is_text_file "$target"; then
	        echo "[Binary file - content not displayed]"
	        return
	    fi

	    cat "$target"
	}

	# Print every regular file below the current directory, sorted, one per line,
	# skipping any file or directory whose name matches an exclude_patterns entry.
	# Used in place of `tree -I` on systems where tree is not installed.
	#
	# The exclude entries are shell globs, so they are handed to find's -name test
	# rather than to a regular-expression matcher (as a regex, '.*' would match
	# every path and empty the listing).
	#
	# Globals: exclude_patterns (read)
	# Outputs: the sorted list of paths on stdout
	list_files_fallback() {
	    local pattern
	    local -a name_tests=()

	    for pattern in "${exclude_patterns[@]}"; do
	        (( ${#name_tests[@]} )) && name_tests+=("-o")
	        name_tests+=("-name" "$pattern")
	    done

	    if (( ${#name_tests[@]} )); then
	        # -mindepth 1 stops the '.*' pattern from matching (and pruning) the
	        # starting directory '.' itself.
	        find . -mindepth 1 \( "${name_tests[@]}" \) -prune -o -type f -print | sort
	    else
	        find . -type f | sort
	    fi
	}

	current_folder_name=$(basename "$PWD")
	output_file="${current_folder_name}_code_listing.txt"

	# Never overwrite an earlier listing: switch to a timestamped name instead.
	if [[ -f "$output_file" ]]; then
	    echo "Warning: $output_file already exists. Creating new file with datestamp."
	    timestamp=$(date +"%Y%m%d_%H%M%S")
	    output_file="${current_folder_name}_code_listing_${timestamp}.txt"
	    echo "New output file: $output_file"
	fi

	current_dir=$(pwd)

	echo "### Folder structure of $current_dir ###" > "$output_file"
	echo "Generated on: $(date)" >> "$output_file"
	echo "" >> "$output_file"

	# tree -I expects a single '|'-separated list of patterns (a plain '|',
	# not a backslash-escaped one).
	exclude_string=""
	for pattern in "${exclude_patterns[@]}"; do
	    exclude_string="${exclude_string:+${exclude_string}|}${pattern}"
	done

	if command -v tree >/dev/null 2>&1; then
	    tree -I "$exclude_string" >> "$output_file"
	else
	    echo "Warning: tree command not found. Using find as fallback." >&2
	    list_files_fallback >> "$output_file"
	fi

	echo "" >> "$output_file"
	echo "### File Contents ###" >> "$output_file"
	echo "" >> "$output_file"

	# Running total of the bytes of file content written so far.
	total_size=0

	# Decide whether a path found by the file search belongs in the listing.
	#
	# Arguments: $1 - path relative to the current directory (no leading "./")
	# Globals:   output_file, exclude_patterns, whitelist_patterns,
	#            filename_lookup, extension_lookup, include_md_files,
	#            enable_whitelist_files, enable_whitelist_extensions,
	#            enable_whitelist_patterns (all read)
	# Returns:   0 to include the file, 1 to leave it out
	#
	# A file is rejected when it is a directory, empty, a generated listing,
	# hidden, matched by exclude_patterns, or rejected by check_file_size.
	# Otherwise it must match the file-name, extension, markdown, or pattern
	# whitelist to be included.
	should_include_file() {
	    local filepath="$1"
	    local filename="${filepath##*/}"
	    local extension=""
	    local pattern
	    local allow_hidden=false

	    # Take the extension from the basename only, and only when it has a dot;
	    # otherwise a file literally named "json" would count as a .json file.
	    [[ "$filename" == *.* ]] && extension="${filename##*.}"

	    [[ -d "$filepath" ]] && return 1
	    [[ "$filepath" == "$output_file" ]] && return 1
	    [[ "$filepath" == *"_code_listing"*.txt ]] && return 1
	    [[ "$filepath" == .env* ]] && return 1
	    [[ ! -s "$filepath" ]] && return 1

	    # A whitelist pattern that itself names a hidden path (for example
	    # ".github/workflows/*.yaml") is an explicit opt-in; without this the
	    # hidden-path rules below would make such entries unreachable.
	    if [[ "$enable_whitelist_patterns" == "true" ]]; then
	        for pattern in "${whitelist_patterns[@]}"; do
	            if [[ "$pattern" == .* && "$filepath" == $pattern ]]; then
	                allow_hidden=true
	                break
	            fi
	        done
	    fi

	    if [[ "$allow_hidden" != "true" ]]; then
	        [[ "$filepath" == .* || "$filepath" == */.* ]] && return 1
	    fi

	    for pattern in "${exclude_patterns[@]}"; do
	        if [[ "$pattern" != *"*"* && "$pattern" != .*\.* ]]; then
	            # Directory name: reject anything located beneath it.
	            [[ "$filepath" == *"/${pattern}/"* ]] && return 1
	            [[ "$filepath" == ${pattern}/* ]] && return 1
	        else
	            # Hidden-file globs do not apply to explicitly whitelisted paths.
	            [[ "$allow_hidden" == "true" && "$pattern" == .* ]] && continue
	            case "$filepath" in
	                $pattern) return 1 ;;
	            esac
	        fi
	    done

	    check_file_size "$filepath" || return 1

	    # The two switches default to enabled when unset.
	    if [[ "${enable_whitelist_files:-true}" == "true" ]]; then
	        [[ -n "${filename_lookup[$filename]}" ]] && return 0
	    fi

	    if [[ "${enable_whitelist_extensions:-true}" == "true" && -n "$extension" ]]; then
	        [[ -n "${extension_lookup[$extension]}" ]] && return 0
	    fi

	    [[ "$include_md_files" == "true" && "$extension" == "md" ]] && return 0

	    if [[ "$enable_whitelist_patterns" == "true" ]]; then
	        for pattern in "${whitelist_patterns[@]}"; do
	            [[ "$filepath" == $pattern ]] && return 0
	        done
	    fi

	    return 1
	}

	# Append one file, wrapped in header and footer markers, to the listing.
	#
	# Arguments: $1 - path of the file, relative to the current directory
	# Globals:   output_file, max_total_size (read); total_size, file_count (updated)
	# Returns:   0 when the file was written or skipped because its size could
	#            not be read; 1 when adding it would exceed max_total_size
	#            (callers stop processing on a non-zero status)
	process_file() {
	    local rel_path="$1"
	    local file_size

	    # Try BSD stat first and fall back to GNU stat. Declared separately so the
	    # result of the command substitution is not hidden by `local`.
	    file_size=$(stat -f%z "$rel_path" 2>/dev/null || stat -c%s "$rel_path" 2>/dev/null)

	    # An unreadable or vanished file yields no number; skip it instead of
	    # feeding an empty value into the arithmetic below.
	    if [[ ! "$file_size" =~ ^[0-9]+$ ]]; then
	        echo "Warning: Could not determine size of $rel_path. Skipping." >&2
	        return 0
	    fi

	    if (( total_size + file_size > max_total_size )); then
	        echo "Warning: Reached maximum total output size limit. Stopping file processing." >&2
	        return 1
	    fi

	    {
	        echo "### Contents of: $rel_path ###"
	        safe_cat "$rel_path"
	        echo ""
	        echo "### End of: $rel_path ###"
	        echo ""
	    } >> "$output_file"

	    total_size=$((total_size + file_size))
	    file_count=$((file_count + 1))
	    return 0
	}

	declare -a files_to_process=()
	declare -a readme_files=()
	declare -a all_files_found=()

	directory_patterns=()
	file_patterns=()

	# Fill all_files_found with every regular file below the current directory,
	# without descending into excluded directories.
	#
	# exclude_patterns entries with no '*' (and not of the form ".x.y") are taken
	# as directory names. They are pruned during traversal, so large trees such as
	# node_modules are never walked. A plain `-not -path` test only hides the
	# results and still descends into them. The remaining entries are collected in
	# file_patterns.
	#
	# Globals: exclude_patterns (read); directory_patterns, file_patterns,
	#          all_files_found (reset and filled)
	collect_candidate_files() {
	    local pattern dir filepath
	    local -a prune_tests=()
	    local -a find_args=(".")

	    directory_patterns=()
	    file_patterns=()
	    all_files_found=()

	    for pattern in "${exclude_patterns[@]}"; do
	        if [[ "$pattern" != *"*"* && "$pattern" != .*\.* ]]; then
	            directory_patterns+=("$pattern")
	        else
	            file_patterns+=("$pattern")
	        fi
	    done

	    # "*/name" matches the directory at any depth, including directly under '.'.
	    for dir in "${directory_patterns[@]}"; do
	        (( ${#prune_tests[@]} )) && prune_tests+=("-o")
	        prune_tests+=("-path" "*/${dir}")
	    done
	    if (( ${#prune_tests[@]} )); then
	        find_args+=("-type" "d" "(" "${prune_tests[@]}" ")" "-prune" "-o")
	    fi
	    find_args+=("-type" "f" "-print0")

	    # NUL-delimited so that unusual file names survive intact.
	    while IFS= read -r -d '' filepath; do
	        all_files_found+=("${filepath#./}")
	    done < <(find "${find_args[@]}")
	}

	collect_candidate_files

	if [[ "$debug_mode" == "true" ]]; then
	    echo "=== DEBUG: All files found (${#all_files_found[@]} total) ===" >&2
	    for file in "${all_files_found[@]}"; do
	        echo "Found: $file" >&2
	    done
	    echo "=== DEBUG: Checking inclusion criteria ===" >&2
	fi

	# Split the accepted files into READMEs (written first) and everything else.
	for rel_path in "${all_files_found[@]}"; do
	    if ! should_include_file "$rel_path"; then
	        if [[ "$debug_mode" == "true" ]]; then
	            echo "EXCLUDE: $rel_path" >&2
	        fi
	        continue
	    fi

	    if [[ "$debug_mode" == "true" ]]; then
	        echo "INCLUDE: $rel_path" >&2
	    fi

	    base_name="${rel_path##*/}"
	    if [[ "$prioritize_readme" == "true" ]] && [[ "${base_name,,}" =~ ^readme(\.(md|txt|rst))?$ ]]; then
	        readme_files+=("$rel_path")
	    else
	        files_to_process+=("$rel_path")
	    fi
	done

	if [[ "$debug_mode" == "true" ]]; then
	    echo "=== DEBUG: Final file counts ===" >&2
	    echo "README files: ${#readme_files[@]}" >&2
	    echo "Other files: ${#files_to_process[@]}" >&2
	    echo "Total to process: $((${#readme_files[@]} + ${#files_to_process[@]}))" >&2
	    echo "=== DEBUG: Starting file processing ===" >&2
	fi

	# process_file returns non-zero once the total size limit is hit; stop there.
	file_count=0
	if [[ "$prioritize_readme" == "true" ]]; then
	    for readme_file in "${readme_files[@]}"; do
	        if ! process_file "$readme_file"; then
	            break
	        fi
	    done
	fi

	for file_path in "${files_to_process[@]}"; do
	    if ! process_file "$file_path"; then
	        break
	    fi
	done

	echo "### Summary ###"
	echo "Total files processed: $file_count"
	echo "Total output size: $total_size bytes"
	echo "Generated on: $(date)"

	echo "Code listing generated successfully: $output_file"
	echo "Files processed: $file_count"
	echo "Total size: $total_size bytes"