Skip to content

Instantly share code, notes, and snippets.

@andrewginns
Created June 3, 2025 16:00
Show Gist options
  • Save andrewginns/01018427242575b07c8e0968b1a8f19b to your computer and use it in GitHub Desktop.
Save andrewginns/01018427242575b07c8e0968b1a8f19b to your computer and use it in GitHub Desktop.
Convert Codebase to LLM prompt
#!/bin/bash
# ----------------------------- GENERAL SETTINGS -----------------------------
# Markdown files can be stale; set to false to keep them out of the listing.
include_md_files=true
# When true, README files are processed before the remaining files.
prioritize_readme=true
# Per-file size cap in bytes (1 MiB); larger files are skipped.
max_file_size=1048576
# Cap in bytes (50 MiB) on the combined size of the files written to the output.
max_total_size=52428800
# Whitelist switches for file names, extensions and path patterns.
# NOTE(review): confirm each switch is actually consulted by the inclusion
# check before relying on it.
enable_whitelist_files=true
enable_whitelist_extensions=true
enable_whitelist_patterns=true
# When true, discovery and include/exclude decisions are traced on stderr.
debug_mode=false

# ----------------------------- EXCLUSION CONFIG -----------------------------
# Names and glob patterns that must never end up in the listing.
#
# Entries without a '*' are treated as directory names and are filtered out by
# the find invocation; the remaining glob entries are matched per file after
# the files have been found.
exclude_patterns=(
  # Large directories
  'venv'
  '.venv'
  '.git'
  '__pycache__'
  'node_modules'
  # File types to skip
  '*.csv'
  '*.pyc'
  '*.lock'
  '*.egg-info'
  '*.log'
  # Hidden files and environment files
  '.*'
  '.env'
)

# ----------------------------- WHITELIST CONFIG -----------------------------
# Exact file names, bare extensions (no leading dot) and path globs that
# qualify a file for inclusion.
whitelist_files=(
  "Dockerfile"
  "Makefile"
  "gcp-config.json"
  "package.json"
  "pyproject.toml"
  "requirements.txt"
)
whitelist_extensions=(
  "py"
  "txt"
  "toml"
  "yaml"
  "yml"
  "json"
  "sh"
)
whitelist_patterns=(
  ".github/workflows/*.yaml"
  ".github/workflows/*.yml"
  "*.cfg"
  "*.ini"
)

# Associative lookup tables so membership tests need no loop per file.
declare -A extension_lookup
declare -A filename_lookup
for ext in "${whitelist_extensions[@]}"; do
  extension_lookup["$ext"]=1
done
for file in "${whitelist_files[@]}"; do
  filename_lookup["$file"]=1
done
#######################################
# Check that a file is no larger than the per-file size cap.
# Globals:   max_file_size (read), OSTYPE (read)
# Arguments: $1 - path of the file to measure
# Outputs:   a "Skipping large file" notice on stderr when the cap is exceeded
# Returns:   0 if the file fits, 1 if it is too large or cannot be stat'ed
#######################################
check_file_size() {
  local filepath="$1"
  local bytes

  # stat takes different format flags on macOS than elsewhere.
  case "$OSTYPE" in
    darwin*)
      bytes=$(stat -f%z "$filepath" 2>/dev/null) || return 1
      ;;
    *)
      bytes=$(stat -c%s "$filepath" 2>/dev/null) || return 1
      ;;
  esac

  if (( bytes <= max_file_size )); then
    return 0
  fi

  echo "Skipping large file: $filepath (${bytes} bytes)" >&2
  return 1
}
#######################################
# Tell whether a file holds text that is safe to print.
#
# The decision is based on the character encoding reported by file(1) rather
# than on a "text/" MIME type: formats such as JSON are reported as
# "application/json" even though they are plain text, and would otherwise be
# replaced by the binary placeholder. When file(1) is not installed, grep -I
# (which treats binary files as non-matching) is used instead.
# Arguments: $1 - path of the file to inspect
# Returns:   0 if the file is a readable regular text file, 1 otherwise
#######################################
is_text_file() {
  local filepath="$1"
  local encoding

  # file(1) prints an error message on stdout for missing files, so reject
  # them up front instead of mistaking that message for an encoding name.
  [[ -f "$filepath" && -r "$filepath" ]] || return 1

  if command -v file >/dev/null 2>&1; then
    encoding=$(file -b --mime-encoding "$filepath" 2>/dev/null) || return 1
    [[ -n "$encoding" && "$encoding" != "binary" ]]
  else
    LC_ALL=C grep -Iq '' "$filepath"
  fi
}
#######################################
# Print a file's content, or a placeholder line when it is not text.
# Arguments: $1 - path of the file to print
# Outputs:   the file content, or "[Binary file - content not displayed]"
#######################################
safe_cat() {
  local filepath="$1"

  if ! is_text_file "$filepath"; then
    echo "[Binary file - content not displayed]"
    return
  fi

  cat "$filepath"
}
# Name the listing after the current folder; never overwrite an existing
# listing, fall back to a timestamped name instead.
current_folder_name=$(basename "$PWD")
output_file="${current_folder_name}_code_listing.txt"
if [[ -f "$output_file" ]]; then
  echo "Warning: $output_file already exists. Creating new file with datestamp."
  timestamp=$(date +"%Y%m%d_%H%M%S")
  output_file="${current_folder_name}_code_listing_${timestamp}.txt"
  echo "New output file: $output_file"
fi

# Header of the listing (this truncates/creates the output file).
current_dir=$(pwd)
echo "### Folder structure of $current_dir ###" > "$output_file"
echo "Generated on: $(date)" >> "$output_file"
echo "" >> "$output_file"

# tree -I expects the ignore patterns joined with '|'.
exclude_string=$(IFS='|'; printf '%s' "${exclude_patterns[*]}")

#######################################
# Fallback for a missing tree command: print the files below the current
# directory, one per line and sorted, skipping every file or directory whose
# name matches one of the exclude patterns (the same name-based matching that
# tree -I applies).
#
# The patterns are shell globs, so they are handed to find -name as-is instead
# of being joined into a regular expression: as a regex the '.*' entry matches
# every line and would empty the whole listing.
# Globals:   exclude_patterns (read)
# Outputs:   sorted relative paths ("./dir/file") on stdout
#######################################
list_files_without_tree() {
  local -a name_tests=()
  local pattern

  for pattern in "${exclude_patterns[@]}"; do
    if (( ${#name_tests[@]} > 0 )); then
      name_tests+=("-o")
    fi
    name_tests+=("-name" "$pattern")
  done

  # -mindepth 1 keeps the starting point "." out of the name tests; otherwise
  # the '.*' pattern would match it and prune the entire tree.
  if (( ${#name_tests[@]} > 0 )); then
    find . -mindepth 1 '(' "${name_tests[@]}" ')' -prune -o -type f -print | sort
  else
    find . -mindepth 1 -type f -print | sort
  fi
}

if command -v tree >/dev/null 2>&1; then
  tree -I "$exclude_string" >> "$output_file"
else
  echo "Warning: tree command not found. Using find as fallback." >&2
  list_files_without_tree >> "$output_file"
fi

echo "" >> "$output_file"
echo "### File Contents ###" >> "$output_file"
echo "" >> "$output_file"

# Running total (bytes) of the files written to the listing.
total_size=0
#######################################
# Decide whether a file belongs in the code listing.
#
# A file is rejected when it is a directory, the listing itself (or an older
# "*_code_listing*.txt"), a top-level ".env*" file, empty, hidden, matched by
# an exclude pattern, or larger than the per-file size cap. A surviving file
# is accepted when its name, extension or path is whitelisted, or when it is
# a markdown file and markdown is enabled.
#
# Notes on the rules:
#   * Hidden paths are rejected unless they match a whitelist pattern that
#     itself starts with a dot (e.g. ".github/workflows/*.yml"); without this
#     exemption such whitelist entries could never match.
#   * Exclude entries without a '*' are directory names; glob entries are
#     matched against the whole path, the basename and every directory
#     component, so files inside e.g. "pkg.egg-info/" are excluded as well.
#   * The extension is taken from the basename only; a file without a dot in
#     its name has no extension.
# Globals:   output_file, exclude_patterns, whitelist_patterns,
#            filename_lookup, extension_lookup, include_md_files,
#            enable_whitelist_files, enable_whitelist_extensions,
#            enable_whitelist_patterns (all read; the first two switches
#            default to "true" when unset)
# Arguments: $1 - path of the candidate file, relative to the current directory
# Returns:   0 if the file should be included, 1 otherwise
#######################################
should_include_file() {
  local filepath="$1"
  local filename="${filepath##*/}"
  local extension=""
  local pattern
  local hidden_allowed=false

  if [[ "$filename" == *.* ]]; then
    extension="${filename##*.}"
  fi

  [[ -d "$filepath" ]] && return 1
  [[ "$filepath" == "$output_file" ]] && return 1
  [[ "$filepath" == *"_code_listing"*.txt ]] && return 1
  [[ "$filepath" == .env* ]] && return 1
  [[ ! -s "$filepath" ]] && return 1

  # A whitelist pattern that explicitly names a hidden path opts that path
  # out of the blanket hidden-file rules below.
  if [[ "$enable_whitelist_patterns" == "true" ]]; then
    for pattern in "${whitelist_patterns[@]}"; do
      # shellcheck disable=SC2053 # right-hand side is intentionally a glob
      if [[ "$pattern" == .* && "$filepath" == $pattern ]]; then
        hidden_allowed=true
        break
      fi
    done
  fi

  if [[ "$hidden_allowed" != "true" ]]; then
    [[ "$filepath" == .* || "$filepath" == */.* ]] && return 1
  fi

  for pattern in "${exclude_patterns[@]}"; do
    if [[ "$pattern" != *"*"* && "$pattern" != .*\.* ]]; then
      # Plain name: excludes a directory of that name anywhere in the path.
      [[ "$filepath" == *"/${pattern}/"* ]] && return 1
      [[ "$filepath" == "${pattern}/"* ]] && return 1
    else
      # The catch-all hidden pattern must not undo the exemption above.
      if [[ "$pattern" == ".*" && "$hidden_allowed" == "true" ]]; then
        continue
      fi
      case "$filepath" in
        $pattern | */$pattern | $pattern/* | */$pattern/*) return 1 ;;
      esac
    fi
  done

  check_file_size "$filepath" || return 1

  if [[ "${enable_whitelist_files:-true}" == "true" && -n "$filename" ]]; then
    [[ -n "${filename_lookup[$filename]}" ]] && return 0
  fi

  # An empty key is not a valid associative-array subscript, so only consult
  # the extension table when there is an extension.
  if [[ -n "$extension" ]]; then
    if [[ "${enable_whitelist_extensions:-true}" == "true" ]]; then
      [[ -n "${extension_lookup[$extension]}" ]] && return 0
    fi
    [[ "$include_md_files" == "true" && "$extension" == "md" ]] && return 0
  fi

  if [[ "$enable_whitelist_patterns" == "true" ]]; then
    for pattern in "${whitelist_patterns[@]}"; do
      # shellcheck disable=SC2053 # right-hand side is intentionally a glob
      [[ "$filepath" == $pattern ]] && return 0
    done
  fi

  return 1
}
#######################################
# Append one file, wrapped in begin/end markers, to the listing.
#
# A file whose size cannot be determined (for example because it disappeared
# after discovery) is skipped with a warning and reported as success, so a
# single unreadable file neither corrupts the size arithmetic nor makes the
# caller stop processing.
# Globals:   output_file, max_total_size (read); total_size, file_count
#            (updated)
# Arguments: $1 - path of the file, relative to the current directory
# Outputs:   appends to $output_file; warnings go to stderr
# Returns:   1 only when adding the file would exceed max_total_size,
#            0 otherwise
#######################################
process_file() {
  local rel_path="$1"
  local file_size

  # Declared separately so `local` cannot mask a failing stat. GNU stat is
  # tried first, BSD/macOS stat second.
  file_size=$(stat -c%s "$rel_path" 2>/dev/null || stat -f%z "$rel_path" 2>/dev/null)
  if [[ ! "$file_size" =~ ^[0-9]+$ ]]; then
    echo "Warning: cannot determine size of $rel_path. Skipping." >&2
    return 0
  fi

  if (( total_size + file_size > max_total_size )); then
    echo "Warning: Reached maximum total output size limit. Stopping file processing." >&2
    return 1
  fi

  {
    echo "### Contents of: $rel_path ###"
    safe_cat "$rel_path"
    echo ""
    echo "### End of: $rel_path ###"
    echo ""
  } >> "$output_file"

  total_size=$((total_size + file_size))
  file_count=$((file_count + 1))
  return 0
}
declare -a files_to_process=()
declare -a readme_files=()
declare -a all_files_found=()

# Split the exclude list: entries without a '*' are directory names that find
# can prune; everything else is a glob checked per file by
# should_include_file. file_patterns is not read by the code below.
directory_patterns=()
file_patterns=()
for pattern in "${exclude_patterns[@]}"; do
  if [[ "$pattern" != *"*"* && "$pattern" != .*\.* ]]; then
    directory_patterns+=("$pattern")
  else
    file_patterns+=("$pattern")
  fi
done

#######################################
# Build the find expression that lists all regular files while pruning the
# excluded directories, so find never descends into them (a "-not -path"
# filter would still walk e.g. node_modules before discarding the results).
# Globals:   directory_patterns (read), find_args (written)
#######################################
build_find_args() {
  local dir
  local need_or=false

  find_args=(".")
  if (( ${#directory_patterns[@]} > 0 )); then
    find_args+=("-type" "d" "(")
    for dir in "${directory_patterns[@]}"; do
      if [[ "$need_or" == "true" ]]; then
        find_args+=("-o")
      fi
      # "*/name" matches the directory at any depth, including "./name".
      find_args+=("-path" "*/${dir}")
      need_or=true
    done
    find_args+=(")" "-prune" "-o")
  fi
  find_args+=("-type" "f" "-print0")
}

build_find_args
while IFS= read -r -d '' filepath; do
  rel_path="${filepath#./}"
  all_files_found+=("$rel_path")
done < <(find "${find_args[@]}")

if [[ "$debug_mode" == "true" ]]; then
  echo "=== DEBUG: All files found (${#all_files_found[@]} total) ===" >&2
  for file in "${all_files_found[@]}"; do
    echo "Found: $file" >&2
  done
  echo "=== DEBUG: Checking inclusion criteria ===" >&2
fi

# Sort the discovered files into README files and everything else.
for rel_path in "${all_files_found[@]}"; do
  if should_include_file "$rel_path"; then
    if [[ "$debug_mode" == "true" ]]; then
      echo "INCLUDE: $rel_path" >&2
    fi
    # Case-insensitive README detection without spawning basename/tr per file.
    base_name="${rel_path##*/}"
    if [[ "$prioritize_readme" == "true" && "${base_name,,}" =~ ^readme(\.(md|txt|rst))?$ ]]; then
      readme_files+=("$rel_path")
    else
      files_to_process+=("$rel_path")
    fi
  else
    if [[ "$debug_mode" == "true" ]]; then
      echo "EXCLUDE: $rel_path" >&2
    fi
  fi
done

if [[ "$debug_mode" == "true" ]]; then
  echo "=== DEBUG: Final file counts ===" >&2
  echo "README files: ${#readme_files[@]}" >&2
  echo "Other files: ${#files_to_process[@]}" >&2
  echo "Total to process: $((${#readme_files[@]} + ${#files_to_process[@]}))" >&2
  echo "=== DEBUG: Starting file processing ===" >&2
fi

file_count=0
# Set once process_file reports that the total size limit was reached, so
# that no further files are appended after the "Stopping" warning.
size_limit_reached=false

if [[ "$prioritize_readme" == "true" ]]; then
  for readme_file in "${readme_files[@]}"; do
    if ! process_file "$readme_file"; then
      size_limit_reached=true
      break
    fi
  done
fi

if [[ "$size_limit_reached" != "true" ]]; then
  for file_path in "${files_to_process[@]}"; do
    if ! process_file "$file_path"; then
      size_limit_reached=true
      break
    fi
  done
fi

echo "### Summary ###"
echo "Total files processed: $file_count"
echo "Total output size: $total_size bytes"
echo "Generated on: $(date)"
echo "Code listing generated successfully: $output_file"
echo "Files processed: $file_count"
echo "Total size: $total_size bytes"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment