Skip to content

Instantly share code, notes, and snippets.

@andrewssobral
Created July 31, 2024 12:56
Show Gist options
  • Save andrewssobral/f0a693dc1cb340e9e712d0ba994d2bc3 to your computer and use it in GitHub Desktop.
Bash Script for Codebase Generation to be used with LLMs
#!/bin/bash
# Uncomment the next line for command-by-command tracing:
# set -x
set -e  # Abort as soon as any command exits with a non-zero status.
echo "Script started" >&2
# Resolve a (possibly relative) path to an absolute one.
# macOS ships BSD readlink, which lacks -f, so the path is assembled by hand
# there; every other platform defers to readlink -f.
get_abs_path() {
  local path="$1"
  if [[ "$OSTYPE" == "darwin"* ]]; then
    if [[ $path = /* ]]; then
      echo "$path"
    else
      echo "$PWD/${path#./}"
    fi
  else
    readlink -f "$path"
  fi
}
# Emit a diagnostic line on stderr, but only when --verbose was requested.
# Kept as an explicit 'if' (not '[[ ]] &&') so the function returns 0 when
# verbose is off; a non-zero return here would trip 'set -e' at call sites.
debug() {
  if [[ "$verbose" == true ]]; then
    echo "Debug: $1" >&2
  fi
}
# Print the help text on stdout, then terminate with a non-zero status.
usage() {
  cat << EOF
Usage: $0 <project_name> [options]
Options:
 --exclude-dirs dir1,dir2,... Comma-separated list of directories to exclude
 (default: .git,.vscode,build,dist,node_modules)
 --max-file-size SIZE Maximum file size to include (in bytes, default: 1000000)
 --follow-symlinks Follow symbolic links (default: false)
 --generate-checksums Generate and include file checksums (default: false)
 --output-format FORMAT Output format: text or json (default: text)
 --output-file FILE Specify the output file (default: <project_name>.codebase.<format>)
 --parallel Use parallel processing (requires GNU Parallel, default: false)
 --verbose Enable verbose output
 -h, --help Display this help message
EOF
  exit 1
}
# --- Default configuration --------------------------------------------------
exclude_dirs=(.git .vscode build dist node_modules)  # pruned from the walk
verbose=false
max_file_size=1000000   # bytes (1 MB)
follow_symlinks=false
generate_checksums=false
output_format="text"    # "text" or "json"
use_parallel=false
output_file=""          # derived from the project name when left empty
# --- Command-line argument parsing ------------------------------------------
# The first bare word becomes the project name; any second bare word is an
# error. Options that take a value fail loudly when the value is missing
# (previously 'shift 2' failed silently under 'set -e').
require_value() {
  # $1 = option name, $2 = number of remaining arguments
  if [ "$2" -lt 2 ]; then
    echo "Error: $1 requires a value" >&2
    usage
  fi
}
while [[ $# -gt 0 ]]; do
  case $1 in
    --exclude-dirs)
      require_value "$1" $#
      IFS=',' read -ra exclude_dirs <<< "$2"
      shift 2
      ;;
    --max-file-size)
      require_value "$1" $#
      max_file_size=$2
      # Reject non-numeric sizes here instead of failing later in a -gt test.
      if ! [[ "$max_file_size" =~ ^[0-9]+$ ]]; then
        echo "Error: --max-file-size must be a non-negative integer" >&2
        usage
      fi
      shift 2
      ;;
    --follow-symlinks)
      follow_symlinks=true
      shift
      ;;
    --generate-checksums)
      generate_checksums=true
      shift
      ;;
    --output-format)
      require_value "$1" $#
      output_format=$2
      shift 2
      ;;
    --output-file)
      require_value "$1" $#
      output_file=$2
      shift 2
      ;;
    --parallel)
      use_parallel=true
      shift
      ;;
    --verbose)
      verbose=true
      shift
      ;;
    -h|--help)
      usage
      ;;
    *)
      if [ -z "$project_name" ]; then
        project_name=$1
      else
        # Diagnostics belong on stderr, matching the rest of the script.
        echo "Error: Unexpected argument $1" >&2
        usage
      fi
      shift
      ;;
  esac
done
# Check if project name is provided (the only required positional argument)
if [ -z "$project_name" ]; then
echo "Error: No project name provided"
usage
fi
# Validate output format
if [[ "$output_format" != "text" && "$output_format" != "json" ]]; then
echo "Error: Invalid output format. Use 'text' or 'json'."
exit 1
fi
# Check for GNU Parallel if parallel processing is requested
if [ "$use_parallel" = true ] && ! command -v parallel &> /dev/null; then
echo "Error: GNU Parallel is not installed. Please install it or run without --parallel option."
exit 1
fi
# Check for jq if JSON output is requested
if [ "$output_format" = "json" ] && ! command -v jq &> /dev/null; then
echo "Error: 'jq' command is not available. Please install it for JSON output."
exit 1
fi
# Convert project name to uppercase (used in the generated file header)
project_name_upper=$(echo "$project_name" | tr '[:lower:]' '[:upper:]')
# Set output file if not specified: <cwd>/<project_name>.codebase.<format>
if [ -z "$output_file" ]; then
output_file="$(pwd)/${project_name}.codebase.${output_format}"
fi
# Canonical absolute path of the output file; process_file later compares
# each candidate against this so the output never ingests itself.
output_file_path=$(get_abs_path "$output_file")
debug "Project name: $project_name"
debug "Current working directory: $(pwd)"
# NOTE(review): this prints the as-given $output_file, not the resolved
# $output_file_path that the label suggests — confirm which is intended.
debug "Output file path: $output_file"
debug "Excluded directories: ${exclude_dirs[*]}"
debug "Maximum file size: $max_file_size bytes"
debug "Follow symlinks: $follow_symlinks"
debug "Generate checksums: $generate_checksums"
debug "Output format: $output_format"
debug "Use parallel processing: $use_parallel"
# Check if we have write permissions in the output directory
output_dir=$(dirname "$output_file_path")
if [ ! -w "$output_dir" ]; then
echo "Error: No write permission in the output directory: $output_dir"
exit 1
fi
# Check if 'file' command is available (used below to detect text files)
if ! command -v file &> /dev/null; then
echo "Error: 'file' command is not available on this system"
exit 1
fi
# Get the absolute path of the script (so the walk can skip the script itself)
script_path=$(get_abs_path "$0")
# Remove existing output file so repeated runs do not append to stale output
rm -f "$output_file_path"
# Explicitly create the output file
touch "$output_file_path" || { echo "Error: Unable to create output file"; exit 1; }
debug "Output file created"
# Add descriptive prompt at the beginning of the file.
# Writes a header describing the output layout — a JSON object or a
# plain-text preamble, depending on $output_format.
# NOTE(review): the header is written to $output_file while creation/removal
# above used $output_file_path; the two resolve to the same file — confirm.
add_descriptive_prompt() {
debug "Adding descriptive prompt"
if [ "$output_format" = "json" ]; then
# NOTE(review): this header object plus the per-file objects appended later
# form a concatenated JSON stream (readable with 'jq -s'), not one single
# JSON document — confirm downstream consumers expect that.
if ! jq -n \
--arg project "$project_name_upper" \
--arg excluded "${exclude_dirs[*]}" \
'{
project: $project,
description: "This file contains the full codebase of the project.",
excluded_directories: ($excluded | split(" ")),
file_structure: "Each file in the codebase is represented as a JSON object with \"file\", \"content\", and optional \"checksum\" fields."
}' > "$output_file"; then
echo "Error: Failed to create JSON header" >&2
exit 1
fi
else
# Plain-text header. No comments may appear between the 'cat' line and EOF:
# every line of the heredoc below becomes literal file content (with $vars
# and $(...) expanded, since the EOF delimiter is unquoted).
if ! cat << EOF > "$output_file"
--#-- ${project_name_upper} CODEBASE --#--
This file contains the full codebase of the $project_name.
It includes all source files, excluding those in the following directories:
${exclude_dirs[*]}
File structure:
Each file in the codebase is represented in the following format:
--#-- START /path/to/file --#--
[File contents]
$([ "$generate_checksums" = true ] && echo "Checksum: [SHA256 checksum]")
--#-- END /path/to/file --#--
This structure allows for easy identification of individual files within the codebase.
--#-- END OF PROMPT --#--
EOF
then
echo "Error: Failed to create text header" >&2
exit 1
fi
fi
}
add_descriptive_prompt
# Append one file to the output, unless a skip rule applies.
# Reads globals: script_path, output_file_path, exclude_dirs, max_file_size,
#                generate_checksums, output_format, output_file, verbose.
# Side effects:  appends to $output_file; echoes the processed file name on
#                stderr for progress tracking.
process_file() {
  local file="$1"
  debug "Attempting to process file: $file"
  # Never ingest the script itself or the file we are writing to.
  if [[ "$(get_abs_path "$file")" == "$script_path" || "$(get_abs_path "$file")" == "$output_file_path" ]]; then
    debug "Skipping the script itself or the output file: $file"
    return
  fi
  # Excluded directories are also pruned by find; this re-check matters in
  # --parallel mode, where each invocation runs in a fresh shell.
  local dir
  for dir in "${exclude_dirs[@]}"; do
    if [[ $file == *"/$dir/"* ]]; then
      debug "Skipping file in excluded directory: $file"
      return
    fi
  done
  # Compute the basename once (it was previously derived three times).
  local base
  base=$(basename "$file")
  # Skip hidden files and files starting with an underscore.
  if [[ $base == _* || $base == .* ]]; then
    debug "Skipping hidden or underscore file: $file"
    return
  fi
  # Skip well-known OS metadata files.
  case $base in
    .DS_Store|Thumbs.db|desktop.ini)
      debug "Skipping system file: $file"
      return
      ;;
  esac
  # Determine size portably (BSD stat on macOS, GNU stat elsewhere).
  # Declaration is separated from assignment so a double stat failure
  # (e.g. file vanished mid-walk) is not masked by 'local'; in that case
  # file_size is empty and we skip instead of erroring on '[ "" -gt N ]'.
  local file_size
  file_size=$(stat -f%z "$file" 2>/dev/null || stat -c%s "$file" 2>/dev/null) || true
  if [ -z "$file_size" ]; then
    debug "Skipping file whose size could not be determined: $file"
    return
  fi
  if [ "$file_size" -gt "$max_file_size" ]; then
    debug "Skipping file exceeding size limit: $file ($file_size bytes)"
    return
  fi
  # Use 'file' to include only what it classifies as text (or empty).
  if file "$file" | grep -qE 'text|empty'; then
    debug "Processing file: $file"
    local content checksum=""
    content=$(cat "$file")
    if [ "$generate_checksums" = true ]; then
      # macOS has no sha256sum in the base system; fall back to shasum.
      if command -v sha256sum &> /dev/null; then
        checksum=$(sha256sum "$file" | cut -d' ' -f1)
      else
        checksum=$(shasum -a 256 "$file" | cut -d' ' -f1)
      fi
    fi
    if [ "$output_format" = "json" ]; then
      jq -n --arg file "$file" --arg content "$content" --arg checksum "$checksum" \
        '{file: $file, content: $content, checksum: $checksum}' >> "$output_file"
    else
      {
        echo "--#-- START $file --#--"
        echo "$content"
        if [ -n "$checksum" ]; then
          echo "Checksum: $checksum"
        fi
        echo "--#-- END $file --#--"
        echo ""
      } >> "$output_file"
    fi
    echo "$file" >&2 # Print to stderr for progress tracking
  else
    debug "Skipping non-text file: $file"
  fi
}
# --- File discovery and processing ------------------------------------------
export -f process_file debug
# NOTE(review): bash cannot export arrays, so exclude_dirs is NOT visible in
# the fresh shells GNU Parallel spawns; in --parallel mode only find's
# pruning excludes those directories. The scalars process_file reads are
# exported below (script_path and output_file_path were previously missing,
# leaving them unset inside parallel workers).
export max_file_size generate_checksums output_format output_file verbose
export script_path output_file_path
# Build the find command. -L is an option, and find requires options to
# precede the start path (appending it after the expression is an error).
find_cmd="find"
if [ "$follow_symlinks" = true ]; then
  find_cmd+=" -L"
fi
find_cmd+=" ."
for dir in "${exclude_dirs[@]}"; do
  find_cmd+=" -not -path './$dir/*'"
done
find_cmd+=" -type f"
debug "Constructed find command: $find_cmd"
# Count total files (for the progress bar denominator)
total_files=$(eval "$find_cmd" | wc -l)
debug "Total files to process: $total_files"
# Process files, NUL-delimited so arbitrary file names survive the pipe
if [ "$use_parallel" = true ]; then
  eval "$find_cmd" -print0 | parallel -0 --bar process_file
else
  processed_count=0
  eval "$find_cmd" -print0 | while IFS= read -r -d '' file; do
    process_file "$file"
    processed_count=$((processed_count + 1))
    # Guard the division: total_files can be 0 in an empty tree, and a
    # division by zero would abort the script under 'set -e'.
    if [ "$total_files" -gt 0 ]; then
      printf "\rProgress: [%-50s] %d%%" "$(printf '#%.0s' $(seq 1 $((processed_count * 50 / total_files))))" $((processed_count * 100 / total_files))
    fi
  done
fi
echo -e '\nProcessing complete.'
if [ -f "$output_file" ]; then
  echo "Codebase has been processed into $output_file"
  debug "File size: $(du -h "$output_file" | cut -f1)"
  debug "Number of lines in output file: $(wc -l < "$output_file")"
else
  echo "Error: Output file was not created"
fi
@andrewssobral
Copy link
Author

Here are some proposals to enhance the script's functionality, efficiency, and robustness:

  1. Error Handling Improvement:
# Function for error handling
   error_exit() {
       echo "Error: $1" >&2
       exit 1
   }

   # Example usage:
   command_that_may_fail || error_exit "Command failed"

   # Use this function throughout the script for consistent error handling

Implement this error handling function and use it throughout the script for more consistent error management.

  2. Input Validation:
    Add more input validation, especially for numeric inputs like max_file_size. For example:

    if ! [[ "$max_file_size" =~ ^[0-9]+$ ]] ; then
        error_exit "max_file_size must be a positive integer"
    fi
  3. Progress Reporting:
    Improve the progress reporting, especially for large codebases. You could use a function to update progress:

    update_progress() {
        local current=$1
        local total=$2
        local percent=$((current * 100 / total))
        printf "\rProgress: [%-50s] %d%%" "$(printf '#%.0s' $(seq 1 $((percent / 2))))" "$percent"
    }
  4. Temporary File Handling:
    Use mktemp for creating temporary files if needed, and ensure they're cleaned up:

    temp_file=$(mktemp)
    trap 'rm -f "$temp_file"' EXIT
  5. Code Modularization:
    Break down some of the longer functions into smaller, more focused functions. This can improve readability and maintainability.

  6. Use mapfile or readarray:
    Instead of IFS and read for parsing the exclude_dirs, you could use mapfile:

    mapfile -d ',' -t exclude_dirs <<< "$2"
  7. Consistent Quoting:
    Ensure all variables are properly quoted, especially in conditions. For example, change if [ $use_parallel = true ] to if [ "$use_parallel" = true ].

  8. Shellcheck Compliance:
    Run the script through shellcheck and address any warnings or suggestions it provides.

  9. Performance Optimization:

process_files_in_batches() {
       local batch_size=1000
       local files=()
       local count=0

       while IFS= read -r -d '' file; do
           files+=("$file")
           ((count++))

           if [ "$count" -eq "$batch_size" ]; then
               printf '%s\0' "${files[@]}" | xargs -0 -n1 -P "$(nproc)" bash -c 'process_file "$@"' _
               files=()
               count=0
           fi
       done

       if [ "${#files[@]}" -gt 0 ]; then
           printf '%s\0' "${files[@]}" | xargs -0 -n1 -P "$(nproc)" bash -c 'process_file "$@"' _
       fi
   }

   # Use this function instead of the current file processing loop
   eval "$find_cmd" -print0 | process_files_in_batches
   

This batch processing approach can significantly speed up the script for large codebases.

  10. Add a Dry Run Option:
    Implement a --dry-run option that shows what the script would do without actually processing files.

  11. Improved File Type Detection:
    Consider using file --mime-type for more accurate file type detection, which could be more reliable than grep.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment