Skip to content

Instantly share code, notes, and snippets.

@andrewssobral
Created July 31, 2024 12:56
Show Gist options
  • Save andrewssobral/f0a693dc1cb340e9e712d0ba994d2bc3 to your computer and use it in GitHub Desktop.
Bash Script for Codebase Generation to be used with LLMs
#!/bin/bash
# Uncomment the next line for command-by-command tracing:
# set -x
set -e  # Abort as soon as any command exits with a non-zero status.
echo "Script started" >&2
# Resolve a (possibly relative) path to an absolute one.
# macOS ships BSD readlink, which lacks -f, so the path is assembled by hand
# there; every other platform defers to readlink -f.
get_abs_path() {
  local path="$1"
  if [[ "$OSTYPE" == "darwin"* ]]; then
    if [[ $path = /* ]]; then
      echo "$path"
    else
      echo "$PWD/${path#./}"
    fi
  else
    readlink -f "$path"
  fi
}
# Emit a diagnostic line on stderr, but only when --verbose was requested.
# Kept as an explicit 'if' (not '[[ ]] &&') so the function returns 0 when
# verbose is off; a non-zero return here would trip 'set -e' at call sites.
debug() {
  if [[ "$verbose" == true ]]; then
    echo "Debug: $1" >&2
  fi
}
# Print the help text on stdout, then terminate with a non-zero status.
usage() {
  cat << EOF
Usage: $0 <project_name> [options]
Options:
 --exclude-dirs dir1,dir2,... Comma-separated list of directories to exclude
 (default: .git,.vscode,build,dist,node_modules)
 --max-file-size SIZE Maximum file size to include (in bytes, default: 1000000)
 --follow-symlinks Follow symbolic links (default: false)
 --generate-checksums Generate and include file checksums (default: false)
 --output-format FORMAT Output format: text or json (default: text)
 --output-file FILE Specify the output file (default: <project_name>.codebase.<format>)
 --parallel Use parallel processing (requires GNU Parallel, default: false)
 --verbose Enable verbose output
 -h, --help Display this help message
EOF
  exit 1
}
# --- Default configuration --------------------------------------------------
exclude_dirs=(.git .vscode build dist node_modules)  # pruned from the walk
verbose=false
max_file_size=1000000   # bytes (1 MB)
follow_symlinks=false
generate_checksums=false
output_format="text"    # "text" or "json"
use_parallel=false
output_file=""          # derived from the project name when left empty
# --- Command-line argument parsing ------------------------------------------
# The first bare word becomes the project name; any second bare word is an
# error. Options that take a value fail loudly when the value is missing
# (previously 'shift 2' failed silently under 'set -e').
require_value() {
  # $1 = option name, $2 = number of remaining arguments
  if [ "$2" -lt 2 ]; then
    echo "Error: $1 requires a value" >&2
    usage
  fi
}
while [[ $# -gt 0 ]]; do
  case $1 in
    --exclude-dirs)
      require_value "$1" $#
      IFS=',' read -ra exclude_dirs <<< "$2"
      shift 2
      ;;
    --max-file-size)
      require_value "$1" $#
      max_file_size=$2
      # Reject non-numeric sizes here instead of failing later in a -gt test.
      if ! [[ "$max_file_size" =~ ^[0-9]+$ ]]; then
        echo "Error: --max-file-size must be a non-negative integer" >&2
        usage
      fi
      shift 2
      ;;
    --follow-symlinks)
      follow_symlinks=true
      shift
      ;;
    --generate-checksums)
      generate_checksums=true
      shift
      ;;
    --output-format)
      require_value "$1" $#
      output_format=$2
      shift 2
      ;;
    --output-file)
      require_value "$1" $#
      output_file=$2
      shift 2
      ;;
    --parallel)
      use_parallel=true
      shift
      ;;
    --verbose)
      verbose=true
      shift
      ;;
    -h|--help)
      usage
      ;;
    *)
      if [ -z "$project_name" ]; then
        project_name=$1
      else
        # Diagnostics belong on stderr, matching the rest of the script.
        echo "Error: Unexpected argument $1" >&2
        usage
      fi
      shift
      ;;
  esac
done
# Check if project name is provided (the only required positional argument)
if [ -z "$project_name" ]; then
echo "Error: No project name provided"
usage
fi
# Validate output format
if [[ "$output_format" != "text" && "$output_format" != "json" ]]; then
echo "Error: Invalid output format. Use 'text' or 'json'."
exit 1
fi
# Check for GNU Parallel if parallel processing is requested
if [ "$use_parallel" = true ] && ! command -v parallel &> /dev/null; then
echo "Error: GNU Parallel is not installed. Please install it or run without --parallel option."
exit 1
fi
# Check for jq if JSON output is requested
if [ "$output_format" = "json" ] && ! command -v jq &> /dev/null; then
echo "Error: 'jq' command is not available. Please install it for JSON output."
exit 1
fi
# Convert project name to uppercase (used in the generated file header)
project_name_upper=$(echo "$project_name" | tr '[:lower:]' '[:upper:]')
# Set output file if not specified: <cwd>/<project_name>.codebase.<format>
if [ -z "$output_file" ]; then
output_file="$(pwd)/${project_name}.codebase.${output_format}"
fi
# Canonical absolute path of the output file; process_file later compares
# each candidate against this so the output never ingests itself.
output_file_path=$(get_abs_path "$output_file")
debug "Project name: $project_name"
debug "Current working directory: $(pwd)"
# NOTE(review): this prints the as-given $output_file, not the resolved
# $output_file_path that the label suggests — confirm which is intended.
debug "Output file path: $output_file"
debug "Excluded directories: ${exclude_dirs[*]}"
debug "Maximum file size: $max_file_size bytes"
debug "Follow symlinks: $follow_symlinks"
debug "Generate checksums: $generate_checksums"
debug "Output format: $output_format"
debug "Use parallel processing: $use_parallel"
# Check if we have write permissions in the output directory
output_dir=$(dirname "$output_file_path")
if [ ! -w "$output_dir" ]; then
echo "Error: No write permission in the output directory: $output_dir"
exit 1
fi
# Check if 'file' command is available (used below to detect text files)
if ! command -v file &> /dev/null; then
echo "Error: 'file' command is not available on this system"
exit 1
fi
# Get the absolute path of the script (so the walk can skip the script itself)
script_path=$(get_abs_path "$0")
# Remove existing output file so repeated runs do not append to stale output
rm -f "$output_file_path"
# Explicitly create the output file
touch "$output_file_path" || { echo "Error: Unable to create output file"; exit 1; }
debug "Output file created"
# Add descriptive prompt at the beginning of the file.
# Writes a header describing the output layout — a JSON object or a
# plain-text preamble, depending on $output_format.
# NOTE(review): the header is written to $output_file while creation/removal
# above used $output_file_path; the two resolve to the same file — confirm.
add_descriptive_prompt() {
debug "Adding descriptive prompt"
if [ "$output_format" = "json" ]; then
# NOTE(review): this header object plus the per-file objects appended later
# form a concatenated JSON stream (readable with 'jq -s'), not one single
# JSON document — confirm downstream consumers expect that.
if ! jq -n \
--arg project "$project_name_upper" \
--arg excluded "${exclude_dirs[*]}" \
'{
project: $project,
description: "This file contains the full codebase of the project.",
excluded_directories: ($excluded | split(" ")),
file_structure: "Each file in the codebase is represented as a JSON object with \"file\", \"content\", and optional \"checksum\" fields."
}' > "$output_file"; then
echo "Error: Failed to create JSON header" >&2
exit 1
fi
else
# Plain-text header. No comments may appear between the 'cat' line and EOF:
# every line of the heredoc below becomes literal file content (with $vars
# and $(...) expanded, since the EOF delimiter is unquoted).
if ! cat << EOF > "$output_file"
--#-- ${project_name_upper} CODEBASE --#--
This file contains the full codebase of the $project_name.
It includes all source files, excluding those in the following directories:
${exclude_dirs[*]}
File structure:
Each file in the codebase is represented in the following format:
--#-- START /path/to/file --#--
[File contents]
$([ "$generate_checksums" = true ] && echo "Checksum: [SHA256 checksum]")
--#-- END /path/to/file --#--
This structure allows for easy identification of individual files within the codebase.
--#-- END OF PROMPT --#--
EOF
then
echo "Error: Failed to create text header" >&2
exit 1
fi
fi
}
add_descriptive_prompt
# Append one file to the output, unless a skip rule applies.
# Reads globals: script_path, output_file_path, exclude_dirs, max_file_size,
#                generate_checksums, output_format, output_file, verbose.
# Side effects:  appends to $output_file; echoes the processed file name on
#                stderr for progress tracking.
process_file() {
  local file="$1"
  debug "Attempting to process file: $file"
  # Never ingest the script itself or the file we are writing to.
  if [[ "$(get_abs_path "$file")" == "$script_path" || "$(get_abs_path "$file")" == "$output_file_path" ]]; then
    debug "Skipping the script itself or the output file: $file"
    return
  fi
  # Excluded directories are also pruned by find; this re-check matters in
  # --parallel mode, where each invocation runs in a fresh shell.
  local dir
  for dir in "${exclude_dirs[@]}"; do
    if [[ $file == *"/$dir/"* ]]; then
      debug "Skipping file in excluded directory: $file"
      return
    fi
  done
  # Compute the basename once (it was previously derived three times).
  local base
  base=$(basename "$file")
  # Skip hidden files and files starting with an underscore.
  if [[ $base == _* || $base == .* ]]; then
    debug "Skipping hidden or underscore file: $file"
    return
  fi
  # Skip well-known OS metadata files.
  case $base in
    .DS_Store|Thumbs.db|desktop.ini)
      debug "Skipping system file: $file"
      return
      ;;
  esac
  # Determine size portably (BSD stat on macOS, GNU stat elsewhere).
  # Declaration is separated from assignment so a double stat failure
  # (e.g. file vanished mid-walk) is not masked by 'local'; in that case
  # file_size is empty and we skip instead of erroring on '[ "" -gt N ]'.
  local file_size
  file_size=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null) || true
  if [ -z "$file_size" ]; then
    debug "Skipping file whose size could not be determined: $file"
    return
  fi
  if [ "$file_size" -gt "$max_file_size" ]; then
    debug "Skipping file exceeding size limit: $file ($file_size bytes)"
    return
  fi
  # Use 'file' to include only what it classifies as text (or empty).
  if file "$file" | grep -qE 'text|empty'; then
    debug "Processing file: $file"
    local content checksum=""
    content=$(cat "$file")
    if [ "$generate_checksums" = true ]; then
      # macOS has no sha256sum in the base system; fall back to shasum.
      if command -v sha256sum &> /dev/null; then
        checksum=$(sha256sum "$file" | cut -d' ' -f1)
      else
        checksum=$(shasum -a 256 "$file" | cut -d' ' -f1)
      fi
    fi
    if [ "$output_format" = "json" ]; then
      jq -n --arg file "$file" --arg content "$content" --arg checksum "$checksum" \
        '{file: $file, content: $content, checksum: $checksum}' >> "$output_file"
    else
      {
        echo "--#-- START $file --#--"
        echo "$content"
        if [ -n "$checksum" ]; then
          echo "Checksum: $checksum"
        fi
        echo "--#-- END $file --#--"
        echo ""
      } >> "$output_file"
    fi
    echo "$file" >&2 # Print to stderr for progress tracking
  else
    debug "Skipping non-text file: $file"
  fi
}
# --- File discovery and processing ------------------------------------------
export -f process_file debug
# NOTE(review): bash cannot export arrays, so exclude_dirs is NOT visible in
# the fresh shells GNU Parallel spawns; in --parallel mode only find's
# pruning excludes those directories. The scalars process_file reads are
# exported below (script_path and output_file_path were previously missing,
# leaving them unset inside parallel workers).
export max_file_size generate_checksums output_format output_file verbose
export script_path output_file_path
# Build the find command. -L is an option, and find requires options to
# precede the start path (appending it after the expression is an error).
find_cmd="find"
if [ "$follow_symlinks" = true ]; then
  find_cmd+=" -L"
fi
find_cmd+=" ."
for dir in "${exclude_dirs[@]}"; do
  find_cmd+=" -not -path './$dir/*'"
done
find_cmd+=" -type f"
debug "Constructed find command: $find_cmd"
# Count total files (for the progress bar denominator)
total_files=$(eval "$find_cmd" | wc -l)
debug "Total files to process: $total_files"
# Process files, NUL-delimited so arbitrary file names survive the pipe
if [ "$use_parallel" = true ]; then
  eval "$find_cmd" -print0 | parallel -0 --bar process_file
else
  processed_count=0
  eval "$find_cmd" -print0 | while IFS= read -r -d '' file; do
    process_file "$file"
    processed_count=$((processed_count + 1))
    # Guard the division: total_files can be 0 in an empty tree, and a
    # division by zero would abort the script under 'set -e'.
    if [ "$total_files" -gt 0 ]; then
      printf "\rProgress: [%-50s] %d%%" "$(printf '#%.0s' $(seq 1 $((processed_count * 50 / total_files))))" $((processed_count * 100 / total_files))
    fi
  done
fi
echo -e '\nProcessing complete.'
if [ -f "$output_file" ]; then
  echo "Codebase has been processed into $output_file"
  debug "File size: $(du -h "$output_file" | cut -f1)"
  debug "Number of lines in output file: $(wc -l < "$output_file")"
else
  echo "Error: Output file was not created"
fi
@andrewssobral
Copy link
Author

Here are some proposals to enhance the script's functionality, efficiency, and robustness:

  1. Error Handling Improvement:
# Function for error handling
   error_exit() {
       echo "Error: $1" >&2
       exit 1
   }

   # Example usage:
   command_that_may_fail || error_exit "Command failed"

   # Use this function throughout the script for consistent error handling

Implement this error handling function and use it throughout the script for more consistent error management.

  2. Input Validation:
    Add more input validation, especially for numeric inputs like max_file_size. For example:

    if ! [[ "$max_file_size" =~ ^[0-9]+$ ]] ; then
        error_exit "max_file_size must be a positive integer"
    fi
  3. Progress Reporting:
    Improve the progress reporting, especially for large codebases. You could use a function to update progress:

    update_progress() {
        local current=$1
        local total=$2
        local percent=$((current * 100 / total))
        printf "\rProgress: [%-50s] %d%%" "$(printf '#%.0s' $(seq 1 $((percent / 2))))" "$percent"
    }
  4. Temporary File Handling:
    Use mktemp for creating temporary files if needed, and ensure they're cleaned up:

    temp_file=$(mktemp)
    trap 'rm -f "$temp_file"' EXIT
  5. Code Modularization:
    Break down some of the longer functions into smaller, more focused functions. This can improve readability and maintainability.

  6. Use mapfile or readarray:
    Instead of IFS and read for parsing the exclude_dirs, you could use mapfile:

    mapfile -d ',' -t exclude_dirs <<< "$2"
  7. Consistent Quoting:
    Ensure all variables are properly quoted, especially in conditions. For example, change if [ $use_parallel = true ] to if [ "$use_parallel" = true ].

  8. Shellcheck Compliance:
    Run the script through shellcheck and address any warnings or suggestions it provides.

  9. Performance Optimization:

process_files_in_batches() {
       local batch_size=1000
       local files=()
       local count=0

       while IFS= read -r -d '' file; do
           files+=("$file")
           ((count++))

           if [ "$count" -eq "$batch_size" ]; then
               printf '%s\0' "${files[@]}" | xargs -0 -n1 -P "$(nproc)" bash -c 'process_file "$@"' _
               files=()
               count=0
           fi
       done

       if [ "${#files[@]}" -gt 0 ]; then
           printf '%s\0' "${files[@]}" | xargs -0 -n1 -P "$(nproc)" bash -c 'process_file "$@"' _
       fi
   }

   # Use this function instead of the current file processing loop
   eval "$find_cmd" -print0 | process_files_in_batches
   

This batch processing approach can significantly speed up the script for large codebases.

  10. Add a Dry Run Option:
    Implement a --dry-run option that shows what the script would do without actually processing files.

  11. Improved File Type Detection:
    Consider using file --mime-type for more accurate file type detection, which could be more reliable than grep.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment