Created
July 31, 2024 12:56
-
-
Save andrewssobral/f0a693dc1cb340e9e712d0ba994d2bc3 to your computer and use it in GitHub Desktop.
Bash Script for Codebase Generation to be used with LLMs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Collects the text files of a project tree into a single text or JSON file
# (a "codebase" dump) intended to be used with LLMs.
# set -x # Enable debug mode
set -e # Exit immediately if a command exits with a non-zero status.
echo "Script started" >&2
#######################################
# Print an absolute form of a path (works on macOS and Linux).
# Globals:   OSTYPE (read), PWD (read)
# Arguments: $1 - path, absolute or relative to the current directory
# Outputs:   the absolute path on stdout
# Returns:   0
#######################################
get_abs_path() {
  local path="$1"
  local resolved
  # Outside macOS, prefer readlink -f (it also resolves symlinks). It fails
  # when a parent directory is missing; fall through to the manual form then
  # instead of returning an empty string with a non-zero status.
  if [[ "$OSTYPE" != darwin* ]] \
    && resolved=$(readlink -f -- "$path" 2> /dev/null) \
    && [[ -n "$resolved" ]]; then
    printf '%s\n' "$resolved"
  elif [[ "$path" == /* ]]; then
    printf '%s\n' "$path"
  else
    # Drop a leading "./" so paths printed by `find .` compare cleanly.
    printf '%s\n' "$PWD/${path#./}"
  fi
}
#######################################
# Print a debug message to stderr when verbose mode is enabled.
# Globals:   verbose (read) - messages are printed only when it is "true"
# Arguments: $1 - message text
# Outputs:   "Debug: <message>" on stderr
#######################################
debug() {
  if [[ "$verbose" == true ]]; then
    printf 'Debug: %s\n' "$1" >&2
  fi
}
#######################################
# Display usage information and terminate the script.
# Arguments: $1 - exit status (optional, default: 1)
# Outputs:   the usage text on stdout
# Returns:   never; always exits
#######################################
usage() {
  local status="${1:-1}"
  printf '%s\n' \
    "Usage: $0 <project_name> [options]" \
    "Options:" \
    " --exclude-dirs dir1,dir2,... Comma-separated list of directories to exclude" \
    " (default: .git,.vscode,build,dist,node_modules)" \
    " --max-file-size SIZE Maximum file size to include (in bytes, default: 1000000)" \
    " --follow-symlinks Follow symbolic links (default: false)" \
    " --generate-checksums Generate and include file checksums (default: false)" \
    " --output-format FORMAT Output format: text or json (default: text)" \
    " --output-file FILE Specify the output file (default: <project_name>.codebase.<format>)" \
    " --parallel Use parallel processing (requires GNU Parallel, default: false)" \
    " --verbose Enable verbose output" \
    " -h, --help Display this help message"
  exit "$status"
}
# Default values
exclude_dirs=(".git" ".vscode" "build" "dist" "node_modules")
verbose=false
max_file_size=1000000 # 1MB default
follow_symlinks=false
generate_checksums=false
output_format="text"
use_parallel=false
output_file=""
# Start empty so a project_name inherited from the environment cannot make
# the first positional argument look like an unexpected extra one.
project_name=""
# Parse command line arguments

# Abort via usage() when the option in $1 is not followed by a value.
# Without this check `shift 2` fails and `set -e` ends the script silently.
_require_option_value() {
  if [[ $# -lt 2 ]]; then
    echo "Error: Option $1 requires a value" >&2
    usage
  fi
}

#######################################
# Parse the command line into the option variables.
# Globals:   exclude_dirs, max_file_size, follow_symlinks,
#            generate_checksums, output_format, output_file, use_parallel,
#            verbose, project_name (written)
# Arguments: the script's command-line arguments
# Outputs:   error messages on stderr
# Returns:   0 on success; invalid input is handed to usage()
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --exclude-dirs)
        _require_option_value "$@"
        IFS=',' read -ra exclude_dirs <<< "$2"
        shift 2
        ;;
      --max-file-size)
        _require_option_value "$@"
        max_file_size=$2
        shift 2
        ;;
      --follow-symlinks)
        follow_symlinks=true
        shift
        ;;
      --generate-checksums)
        generate_checksums=true
        shift
        ;;
      --output-format)
        _require_option_value "$@"
        output_format=$2
        shift 2
        ;;
      --output-file)
        _require_option_value "$@"
        output_file=$2
        shift 2
        ;;
      --parallel)
        use_parallel=true
        shift
        ;;
      --verbose)
        verbose=true
        shift
        ;;
      -h | --help)
        usage 0
        ;;
      -*)
        # A misspelled option must not be taken for the project name.
        echo "Error: Unknown option $1" >&2
        usage
        ;;
      *)
        if [[ -z "$project_name" ]]; then
          project_name=$1
        else
          echo "Error: Unexpected argument $1" >&2
          usage
        fi
        shift
        ;;
    esac
  done
}

parse_args "$@"
# Check if project name is provided
if [[ -z "$project_name" ]]; then
  echo "Error: No project name provided" >&2
  usage
fi
# Validate the size limit; it is later used in an integer comparison, which
# errors out on anything that is not a whole number.
if ! [[ "$max_file_size" =~ ^[0-9]+$ ]]; then
  echo "Error: Invalid maximum file size. Use a non-negative integer (bytes)." >&2
  exit 1
fi
# Validate output format
if [[ "$output_format" != "text" && "$output_format" != "json" ]]; then
  echo "Error: Invalid output format. Use 'text' or 'json'." >&2
  exit 1
fi
# Check for GNU Parallel if parallel processing is requested
if [[ "$use_parallel" == true ]] && ! command -v parallel &> /dev/null; then
  echo "Error: GNU Parallel is not installed. Please install it or run without --parallel option." >&2
  exit 1
fi
# Check for jq if JSON output is requested
if [[ "$output_format" == "json" ]] && ! command -v jq &> /dev/null; then
  echo "Error: 'jq' command is not available. Please install it for JSON output." >&2
  exit 1
fi
# Convert project name to uppercase
project_name_upper=$(printf '%s' "$project_name" | tr '[:lower:]' '[:upper:]')
# Set output file if not specified
if [[ -z "$output_file" ]]; then
  output_file="$(pwd)/${project_name}.codebase.${output_format}"
fi
# Report a resolution failure explicitly; under `set -e` a failing command
# substitution would otherwise end the script without any message.
output_file_path=$(get_abs_path "$output_file") || {
  echo "Error: Unable to resolve output file path: $output_file" >&2
  exit 1
}
debug "Project name: $project_name"
debug "Current working directory: $(pwd)"
debug "Output file path: $output_file"
debug "Excluded directories: ${exclude_dirs[*]}"
debug "Maximum file size: $max_file_size bytes"
debug "Follow symlinks: $follow_symlinks"
debug "Generate checksums: $generate_checksums"
debug "Output format: $output_format"
debug "Use parallel processing: $use_parallel"
# Check that the output directory exists and that we may write to it
output_dir=$(dirname -- "$output_file_path")
if [[ ! -d "$output_dir" ]]; then
  echo "Error: Output directory does not exist: $output_dir" >&2
  exit 1
fi
if [[ ! -w "$output_dir" ]]; then
  echo "Error: No write permission in the output directory: $output_dir" >&2
  exit 1
fi
# Check if 'file' command is available
if ! command -v file &> /dev/null; then
  echo "Error: 'file' command is not available on this system" >&2
  exit 1
fi
# Get the absolute path of the script
script_path=$(get_abs_path "$0")
# Remove existing output file
rm -f -- "$output_file_path"
# Explicitly create the output file
touch -- "$output_file_path" || {
  echo "Error: Unable to create output file" >&2
  exit 1
}
debug "Output file created"
#######################################
# Write the descriptive prompt (header) at the beginning of the output file,
# replacing any previous content of that file.
# Globals:   output_format, output_file, project_name, project_name_upper,
#            exclude_dirs, generate_checksums (read)
# Outputs:   the header in $output_file; error messages on stderr
# Returns:   0 on success; exits the script with status 1 on failure
#######################################
add_descriptive_prompt() {
  debug "Adding descriptive prompt"
  if [[ "$output_format" == "json" ]]; then
    # Hand the directory names to jq one per line; joining and splitting on
    # spaces would break a name such as "my dir" into two entries.
    local excluded_lines
    excluded_lines=$(printf '%s\n' "${exclude_dirs[@]}")
    if ! jq -n \
      --arg project "$project_name_upper" \
      --arg excluded "$excluded_lines" \
      '{
        project: $project,
        description: "This file contains the full codebase of the project.",
        excluded_directories: ($excluded | split("\n")),
        file_structure: "Each file in the codebase is represented as a JSON object with \"file\", \"content\", and optional \"checksum\" fields."
      }' > "$output_file"; then
      echo "Error: Failed to create JSON header" >&2
      exit 1
    fi
  else
    # The here-document body stays at column 0 so no indentation ends up in
    # the generated header.
    if ! cat > "$output_file" << EOF
--#-- ${project_name_upper} CODEBASE --#--
This file contains the full codebase of the $project_name.
It includes all source files, excluding those in the following directories:
${exclude_dirs[*]}
File structure:
Each file in the codebase is represented in the following format:
--#-- START /path/to/file --#--
[File contents]
$([ "$generate_checksums" = true ] && echo "Checksum: [SHA256 checksum]")
--#-- END /path/to/file --#--
This structure allows for easy identification of individual files within the codebase.
--#-- END OF PROMPT --#--
EOF
    then
      echo "Error: Failed to create text header" >&2
      exit 1
    fi
  fi
}
# Write the header first; file entries are appended after it.
add_descriptive_prompt
#######################################
# Append one file to the output if it passes all filters.
# A file is skipped when it is the script itself or the output file, lies in
# an excluded directory, has a name starting with "_" or ".", is a known
# system file, exceeds the size limit, or is not reported as text by file(1).
# Globals:   script_path, output_file_path, exclude_dirs, max_file_size,
#            generate_checksums, output_format, output_file (read)
# Arguments: $1 - path of the file to process
# Outputs:   appends an entry to $output_file; prints the path of each
#            included file on stderr for progress tracking
# Returns:   0 when the file was included or skipped
#######################################
process_file() {
  local file="$1"
  local abs_path base dir file_size content
  local checksum=""
  debug "Attempting to process file: $file"

  # Check if the file is the script itself or the output file
  abs_path=$(get_abs_path "$file") || abs_path="$file"
  if [[ "$abs_path" == "$script_path" || "$abs_path" == "$output_file_path" ]]; then
    debug "Skipping the script itself or the output file: $file"
    return 0
  fi

  # Check if the file is in an excluded directory
  for dir in "${exclude_dirs[@]}"; do
    if [[ "$file" == *"/$dir/"* ]]; then
      debug "Skipping file in excluded directory: $file"
      return 0
    fi
  done

  # Skip files starting with an underscore or dot
  base=${file##*/}
  if [[ "$base" == _* || "$base" == .* ]]; then
    debug "Skipping hidden or underscore file: $file"
    return 0
  fi

  # Skip specific system files
  case "$base" in
    .DS_Store | Thumbs.db | desktop.ini)
      debug "Skipping system file: $file"
      return 0
      ;;
  esac

  # Check file size. wc -c takes the same arguments on macOS and Linux and
  # measures the link target when symlinks are followed.
  if ! file_size=$(wc -c 2> /dev/null < "$file"); then
    debug "Skipping unreadable file: $file"
    return 0
  fi
  file_size=${file_size//[[:space:]]/} # BSD wc pads its output with spaces
  if [ "$file_size" -gt "$max_file_size" ]; then
    debug "Skipping file exceeding size limit: $file ($file_size bytes)"
    return 0
  fi

  # Use 'file' command to detect text files. -b omits the file name from the
  # output, so a name such as "context.png" cannot match "text" by itself.
  if file -b -- "$file" | grep -qE 'text|empty'; then
    debug "Processing file: $file"
    content=$(cat -- "$file")
    if [[ "$generate_checksums" == true ]]; then
      # Hash via stdin so the tool prints only the digest; use shasum where
      # sha256sum is not installed.
      if command -v sha256sum > /dev/null 2>&1; then
        checksum=$(sha256sum < "$file" | cut -d' ' -f1)
      elif command -v shasum > /dev/null 2>&1; then
        checksum=$(shasum -a 256 < "$file" | cut -d' ' -f1)
      else
        debug "No SHA-256 tool available; checksum omitted for: $file"
      fi
    fi
    if [[ "$output_format" == "json" ]]; then
      jq -n --arg file "$file" --arg content "$content" --arg checksum "$checksum" \
        '{file: $file, content: $content, checksum: $checksum}' >> "$output_file"
    else
      {
        printf '%s\n' "--#-- START $file --#--"
        # printf, not echo: content such as "-n" must be written verbatim.
        printf '%s\n' "$content"
        if [[ -n "$checksum" ]]; then
          printf '%s\n' "Checksum: $checksum"
        fi
        printf '%s\n' "--#-- END $file --#--"
        printf '\n'
      } >> "$output_file"
    fi
    printf '%s\n' "$file" >&2 # Print to stderr for progress tracking
  else
    debug "Skipping non-text file: $file"
  fi
}
# Export the worker function, the helpers it calls and the variables it reads
# so that the shells started by GNU Parallel can run process_file.
export -f process_file debug get_abs_path
# NOTE: exclude_dirs is an array and bash does not pass arrays through the
# environment, so child shells see it empty; in --parallel mode only the
# exclusions given to find apply.
export exclude_dirs max_file_size generate_checksums output_format output_file verbose
export script_path output_file_path
# Construct find command as an argument array: no eval is needed and
# directory names containing spaces or quotes are passed through intact.
find_args=(find)
if [[ "$follow_symlinks" == true ]]; then
  # -L is an option, not a test: it has to come before the starting point.
  find_args+=(-L)
fi
find_args+=(.)
for dir in "${exclude_dirs[@]}"; do
  find_args+=(-not -path "./$dir/*")
done
find_args+=(-type f)
# Shell-quoted rendering of the command, kept for the debug log only.
printf -v find_cmd '%q ' "${find_args[@]}"
debug "Constructed find command: $find_cmd"
# Count total files (count NUL terminators so that file names containing
# newlines are counted once)
total_files=$("${find_args[@]}" -print0 | tr -cd '\0' | wc -c)
total_files=${total_files//[[:space:]]/}
debug "Total files to process: $total_files"
# Process files
if [[ "$use_parallel" == true ]]; then
  "${find_args[@]}" -print0 | parallel -0 --bar process_file
else
  processed_count=0
  while IFS= read -r -d '' file; do
    process_file "$file"
    processed_count=$((processed_count + 1))
    # The tree may have changed since the count was taken: avoid dividing by
    # zero and keep the bar within its 50 columns.
    if ((total_files > 0)); then
      bar_width=$((processed_count * 50 / total_files))
      percent=$((processed_count * 100 / total_files))
    else
      bar_width=50
      percent=100
    fi
    if ((bar_width > 50)); then
      bar_width=50
    fi
    printf -v bar '%*s' "$bar_width" ''
    printf "\rProgress: [%-50s] %d%%" "${bar// /#}" "$percent"
  done < <("${find_args[@]}" -print0)
fi
# Report the outcome; a missing output file is an error and ends the script
# with a non-zero status.
printf '\nProcessing complete.\n'
if [[ -f "$output_file" ]]; then
  echo "Codebase has been processed into $output_file"
  debug "File size: $(du -h "$output_file" | cut -f1)"
  debug "Number of lines in output file: $(wc -l < "$output_file")"
else
  echo "Error: Output file was not created" >&2
  exit 1
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
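Here are some proposals to enhance the script's functionality, efficiency, and robustness:
Error Handling:
Implement a dedicated error-handling function (for example, one that prints a message to stderr and exits with a non-zero status) and use it throughout the script for more consistent error management.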
Input Validation:
Add more input validation, especially for numeric inputs like `max_file_size`. For example, reject any value that is not a non-negative integer before it is used in a size comparison.
Progress Reporting:
Improve the progress reporting, especially for large codebases. You could use a function to update progress:
Temporary File Handling:
Use `mktemp` for creating temporary files if needed, and ensure they're cleaned up (for example, with a `trap` on `EXIT`).
Code Modularization:
Break down some of the longer functions into smaller, more focused functions. This can improve readability and maintainability.
Use `mapfile` or `readarray`:
Instead of IFS and read for parsing the exclude_dirs, you could use `mapfile`.
Consistent Quoting:
Ensure all variables are properly quoted, especially in conditions. For example, change `if [ $use_parallel = true ]` to `if [ "$use_parallel" = true ]`.
Shellcheck Compliance:
Run the script through shellcheck and address any warnings or suggestions it provides.
Performance Optimization:
This batch processing approach can significantly speed up the script for large codebases.
Add a Dry Run Option:
Implement a `--dry-run` option that shows what the script would do without actually processing files.
Improved File Type Detection:
Consider using `file --mime-type` for more accurate file type detection, which could be more reliable than grep.