Created
June 3, 2025 16:00
-
-
Save andrewginns/01018427242575b07c8e0968b1a8f19b to your computer and use it in GitHub Desktop.
Convert Codebase to LLM prompt
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Convert a codebase into a single text listing that can be pasted into an
# LLM prompt. Run it from the root of the project you want to export.

############################## SETTINGS ####################################
# In some situations you may want to skip inclusion of markdown files as they
# might be out of date.
include_md_files=true
# Put the contents of the README file at the top of the output.
prioritize_readme=true
# Skip files larger than this size (in bytes). 1048576 bytes = 1 MiB.
max_file_size=1048576
# Maximum total size of the output file (in bytes). 52428800 bytes = 50 MiB.
max_total_size=52428800
# Switches for the whitelist mechanisms (exact names, extensions, patterns).
enable_whitelist_files=true
enable_whitelist_extensions=true
enable_whitelist_patterns=true
# Set to true to trace, on stderr, which files are found/included/excluded.
debug_mode=false
######################## EXCLUSION CONFIG ##################################
# Files and directories to exclude from processing.
#
# NOTE: Entries without a '*' wildcard (like 'venv', '.venv') are treated as
# directory names and are filtered out of the `find` results, while wildcard
# patterns (like '*.pyc', '*.log') are matched against each file path after
# the files have been found.
#
exclude_patterns=(
  # Large directories
  'venv' '.venv' '.git' '__pycache__' 'node_modules'
  # File types to skip
  '*.csv' '*.pyc' '*.lock' '*.egg-info' '*.log'
  # Hidden files and environment files
  '.*' '.env'
)
######################################################################
# Exact file names that qualify a file for inclusion.
whitelist_files=("Dockerfile" "Makefile" "gcp-config.json" "package.json" "pyproject.toml" "requirements.txt")
# File extensions (without the leading dot) that qualify a file for inclusion.
whitelist_extensions=("py" "txt" "toml" "yaml" "yml" "json" "sh")
# Glob patterns, matched against the relative path, that qualify a file.
whitelist_patterns=(".github/workflows/*.yaml" ".github/workflows/*.yml" "*.cfg" "*.ini")

# Lookup tables keyed by extension / file name so membership can be tested
# with a single subscript lookup. Associative arrays require bash 4 or newer.
declare -A extension_lookup
for ext in "${whitelist_extensions[@]}"; do
  extension_lookup["$ext"]=1
done

declare -A filename_lookup
for file in "${whitelist_files[@]}"; do
  filename_lookup["$file"]=1
done
#######################################
# Decide whether a file is small enough to be included.
# Globals:   max_file_size (read), OSTYPE (read)
# Arguments: $1 - path of the file to measure
# Outputs:   a "Skipping large file" notice on stderr when the file is too big
# Returns:   0 if the size is at most max_file_size;
#            1 if it is larger or the size cannot be determined
#######################################
check_file_size() {
  local filepath="$1"
  local file_size

  # BSD stat (macOS) and GNU stat use different format flags. The `--` keeps
  # a path that starts with '-' from being parsed as an option.
  if [[ "$OSTYPE" == "darwin"* ]]; then
    file_size=$(stat -f%z -- "$filepath" 2>/dev/null) || return 1
  else
    file_size=$(stat -c%s -- "$filepath" 2>/dev/null) || return 1
  fi

  # Never feed empty or non-numeric output into the arithmetic comparison.
  [[ "$file_size" =~ ^[0-9]+$ ]] || return 1

  if (( file_size > max_file_size )); then
    echo "Skipping large file: $filepath (${file_size} bytes)" >&2
    return 1
  fi
  return 0
}
#######################################
# Tell whether a file holds text (as opposed to binary data).
#
# The check uses the character encoding reported by `file` rather than the
# MIME type: whitelisted formats such as JSON are reported with an
# "application/..." MIME type and would be rejected by a "text/" prefix test.
# When `file` is not installed, `grep -I` (which treats binary files as
# non-matching) is used instead.
# Arguments: $1 - path of the file to inspect
# Returns:   0 if the file looks like text; 1 if it is binary, empty,
#            missing or unreadable
#######################################
is_text_file() {
  local filepath="$1"
  local encoding

  # `file` prints its "cannot open" message on stdout, so rule out missing or
  # unreadable paths before looking at its output.
  [[ -f "$filepath" && -r "$filepath" ]] || return 1

  if command -v file >/dev/null 2>&1; then
    encoding=$(file -b --mime-encoding -- "$filepath" 2>/dev/null) || return 1
    [[ -n "$encoding" && "$encoding" != "binary" ]]
  else
    LC_ALL=C grep -Iq '' -- "$filepath"
  fi
}
#######################################
# Print a file's contents, or a placeholder line when it is not text.
# Arguments: $1 - path of the file to print
# Outputs:   the file contents, or "[Binary file - content not displayed]",
#            on stdout
# Returns:   the exit status of the last command run (cat or echo)
#######################################
safe_cat() {
  local filepath="$1"

  if ! is_text_file "$filepath"; then
    echo "[Binary file - content not displayed]"
    return
  fi

  # `--` so that a file whose name starts with '-' is not read as an option.
  cat -- "$filepath"
}
# The listing is named after the directory the script is run from.
current_folder_name="${PWD##*/}"
# At the filesystem root the directory name is empty; use a placeholder so the
# output path stays relative to the current directory.
if [[ -z "$current_folder_name" ]]; then
  current_folder_name="root"
fi
output_file="${current_folder_name}_code_listing.txt"

# Never overwrite an earlier listing: fall back to a timestamped file name.
if [[ -f "$output_file" ]]; then
  echo "Warning: $output_file already exists. Creating new file with datestamp."
  timestamp=$(date +"%Y%m%d_%H%M%S")
  output_file="${current_folder_name}_code_listing_${timestamp}.txt"
  echo "New output file: $output_file"
fi
#######################################
# Fallback folder listing used when `tree` is not installed.
# Lists every regular file below the current directory whose name does not
# match an exclude pattern and that does not live under a directory whose
# name matches one. The patterns are shell globs, so they are handed to
# `find` (-name / -path) instead of being used as a regular expression.
# Globals:   exclude_patterns (read)
# Outputs:   sorted "./relative/path" lines on stdout
#######################################
list_files_without_tree() {
  local -a fallback_args=("." "-type" "f")
  local pattern

  for pattern in "${exclude_patterns[@]}"; do
    fallback_args+=("-not" "-path" "*/${pattern}/*" "-not" "-name" "$pattern")
  done

  find "${fallback_args[@]}" | sort
}

current_dir=$(pwd)
{
  echo "### Folder structure of $current_dir ###"
  echo "Generated on: $(date)"
  echo ""
} > "$output_file"

# `tree -I` takes all exclude patterns as one '|'-separated string.
exclude_string=$(IFS='|'; printf '%s' "${exclude_patterns[*]}")

if command -v tree >/dev/null 2>&1; then
  tree -I "$exclude_string" >> "$output_file"
else
  echo "Warning: tree command not found. Using find as fallback." >&2
  list_files_without_tree >> "$output_file"
fi

{
  echo ""
  echo "### File Contents ###"
  echo ""
} >> "$output_file"

# Running total, in bytes, of the source files written to the listing.
total_size=0
#######################################
# Decide whether a file belongs in the listing.
#
# A file is rejected when it is a directory, the listing itself (or an older
# listing), empty, hidden, matched by an exclude pattern, or too large. It is
# accepted when it then matches an enabled whitelist (exact name, extension,
# markdown switch, or glob pattern).
#
# A whitelist pattern that itself starts with '.' (for example
# ".github/workflows/*.yml") names a hidden path on purpose, so a match on it
# bypasses the hidden-file rule and the exclude patterns; otherwise such a
# pattern could never match anything.
#
# Globals:   output_file, exclude_patterns, whitelist_patterns,
#            filename_lookup, extension_lookup, include_md_files,
#            enable_whitelist_files, enable_whitelist_extensions,
#            enable_whitelist_patterns (all read)
# Arguments: $1 - path relative to the current directory (no leading "./")
# Returns:   0 to include the file, 1 to skip it
#######################################
should_include_file() {
  local filepath="$1"
  local filename="${filepath##*/}"
  local extension=""
  local pattern

  # Only a name containing a dot has an extension; taking the suffix of the
  # whole path would turn a file called "py" into extension "py".
  if [[ "$filename" == *.* ]]; then
    extension="${filename##*.}"
  fi

  [[ -d "$filepath" ]] && return 1
  [[ "$filepath" == "$output_file" ]] && return 1
  [[ "$filepath" == *"_code_listing"*.txt ]] && return 1
  [[ ! -s "$filepath" ]] && return 1

  # Explicitly whitelisted hidden paths: only the size limit still applies.
  if [[ "$enable_whitelist_patterns" == "true" ]]; then
    for pattern in "${whitelist_patterns[@]}"; do
      if [[ "$pattern" == .* && "$filepath" == $pattern ]]; then
        check_file_size "$filepath"
        return
      fi
    done
  fi

  # Hidden files and anything below a hidden directory.
  [[ "$filepath" == .* || "$filepath" == */\.* ]] && return 1
  [[ "$filepath" == .env* ]] && return 1

  for pattern in "${exclude_patterns[@]}"; do
    if [[ "$pattern" != *"*"* && "$pattern" != .*\.* ]]; then
      # Plain name: treat it as a directory anywhere in the path.
      [[ "$filepath" == *"/${pattern}/"* ]] && return 1
      [[ "$filepath" == ${pattern}/* ]] && return 1
    else
      # Wildcard pattern: glob-match it against the whole relative path.
      case "$filepath" in
        $pattern) return 1 ;;
      esac
    fi
  done

  check_file_size "$filepath" || return 1

  if [[ "$enable_whitelist_files" == "true" \
        && -n "${filename_lookup[$filename]}" ]]; then
    return 0
  fi

  # The empty-extension guard also avoids an empty associative-array key.
  if [[ "$enable_whitelist_extensions" == "true" && -n "$extension" \
        && -n "${extension_lookup[$extension]}" ]]; then
    return 0
  fi

  [[ "$include_md_files" == "true" && "$extension" == "md" ]] && return 0

  if [[ "$enable_whitelist_patterns" == "true" ]]; then
    for pattern in "${whitelist_patterns[@]}"; do
      [[ "$filepath" == $pattern ]] && return 0
    done
  fi

  return 1
}
#######################################
# Append one file, wrapped in header/footer markers, to the listing.
# Globals:   output_file, max_total_size (read);
#            total_size, file_count (updated)
# Arguments: $1 - relative path of the file to append
# Outputs:   appends to $output_file; a warning on stderr when the total
#            size limit would be exceeded
# Returns:   0 when the file was appended, 1 when the limit was reached
#            (nothing is written in that case)
#######################################
process_file() {
  local rel_path="$1"
  local file_size

  # Try BSD stat first, then GNU stat. Declaration and assignment are split so
  # `local` does not mask a failure; `--` protects names that start with '-'.
  file_size=$(stat -f%z -- "$rel_path" 2>/dev/null \
    || stat -c%s -- "$rel_path" 2>/dev/null)
  # An unknown size counts as 0 instead of breaking the arithmetic below.
  [[ "$file_size" =~ ^[0-9]+$ ]] || file_size=0

  if (( total_size + file_size > max_total_size )); then
    echo "Warning: Reached maximum total output size limit. Stopping file processing." >&2
    return 1
  fi

  {
    echo "### Contents of: $rel_path ###"
    safe_cat "$rel_path"
    echo ""
    echo "### End of: $rel_path ###"
    echo ""
  } >> "$output_file"

  total_size=$((total_size + file_size))
  file_count=$((file_count + 1))
  return 0
}
# Work lists filled in by the discovery and classification steps.
declare -a files_to_process=()
declare -a readme_files=()
declare -a all_files_found=()
directory_patterns=()
file_patterns=()
find_args=()

#######################################
# Collect every candidate file below the current directory.
# Exclude patterns without a '*' are treated as directory names and pruned,
# so `find` never descends into them; the remaining patterns are only
# recorded in file_patterns.
# Globals:   exclude_patterns (read); directory_patterns, file_patterns,
#            find_args, all_files_found (rebuilt)
# Outputs:   none; all_files_found receives paths without the leading "./"
#######################################
discover_files() {
  local pattern dir filepath
  local joiner=""

  all_files_found=()
  directory_patterns=()
  file_patterns=()

  for pattern in "${exclude_patterns[@]}"; do
    if [[ "$pattern" != *"*"* && "$pattern" != .*\.* ]]; then
      directory_patterns+=("$pattern")
    else
      file_patterns+=("$pattern")
    fi
  done

  # Builds: find . ( -type d ( -name A -o -name B ... ) -prune ) -o -type f -print0
  find_args=(".")
  if (( ${#directory_patterns[@]} > 0 )); then
    find_args+=("(" "-type" "d" "(")
    for dir in "${directory_patterns[@]}"; do
      if [[ -n "$joiner" ]]; then
        find_args+=("$joiner")
      fi
      find_args+=("-name" "$dir")
      joiner="-o"
    done
    find_args+=(")" "-prune" ")" "-o")
  fi
  find_args+=("-type" "f" "-print0")

  # NUL-delimited so that any file name (spaces, newlines) survives intact.
  while IFS= read -r -d '' filepath; do
    all_files_found+=("${filepath#./}")
  done < <(find "${find_args[@]}")
}

discover_files
#######################################
# Tell whether a path names a README file, ignoring case: "readme" with an
# optional .md, .txt or .rst extension, in any directory.
# Uses only shell builtins, so no processes are spawned per file and names
# starting with '-' are handled.
# Arguments: $1 - file path
# Returns:   0 for a README file, 1 otherwise
#######################################
is_readme_file() {
  local name="${1##*/}"
  [[ "${name,,}" =~ ^readme(\.(md|txt|rst))?$ ]]
}

if [[ "$debug_mode" == "true" ]]; then
  echo "=== DEBUG: All files found (${#all_files_found[@]} total) ===" >&2
  for file in "${all_files_found[@]}"; do
    echo "Found: $file" >&2
  done
  echo "=== DEBUG: Checking inclusion criteria ===" >&2
fi

# Sort each accepted file into the README list (written first) or the
# general list.
for rel_path in "${all_files_found[@]}"; do
  if ! should_include_file "$rel_path"; then
    if [[ "$debug_mode" == "true" ]]; then
      echo "EXCLUDE: $rel_path" >&2
    fi
    continue
  fi

  if [[ "$debug_mode" == "true" ]]; then
    echo "INCLUDE: $rel_path" >&2
  fi

  if [[ "$prioritize_readme" == "true" ]] && is_readme_file "$rel_path"; then
    readme_files+=("$rel_path")
  else
    files_to_process+=("$rel_path")
  fi
done

if [[ "$debug_mode" == "true" ]]; then
  echo "=== DEBUG: Final file counts ===" >&2
  echo "README files: ${#readme_files[@]}" >&2
  echo "Other files: ${#files_to_process[@]}" >&2
  echo "Total to process: $((${#readme_files[@]} + ${#files_to_process[@]}))" >&2
  echo "=== DEBUG: Starting file processing ===" >&2
fi
# Number of files written to the listing so far.
file_count=0

#######################################
# Write all selected files to the listing, README files first.
# Processing stops for good at the first file that process_file refuses, so
# hitting the size limit among the README files no longer lets the general
# list carry on (and repeat the warning).
# Globals:   prioritize_readme, readme_files, files_to_process (read)
#######################################
process_all_files() {
  local -a queue=()
  local queued_file

  if [[ "$prioritize_readme" == "true" ]]; then
    queue+=("${readme_files[@]}")
  fi
  queue+=("${files_to_process[@]}")

  for queued_file in "${queue[@]}"; do
    if ! process_file "$queued_file"; then
      break
    fi
  done
}

process_all_files
#######################################
# Print the end-of-run report.
# Globals:   file_count, total_size, output_file (read)
# Outputs:   seven summary lines on stdout
#######################################
print_summary() {
  echo "### Summary ###"
  echo "Total files processed: $file_count"
  echo "Total output size: $total_size bytes"
  echo "Generated on: $(date)"
  echo "Code listing generated successfully: $output_file"
  echo "Files processed: $file_count"
  echo "Total size: $total_size bytes"
}

print_summary
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment