Codebase Chat CLI - A command-line interface for interacting with codebases using local LLMs via Ollama.
#!/usr/bin/env python3
"""
Codebase Chat CLI - GPU Accelerated

A command-line interface for interacting with codebases using local LLMs via Ollama.
Supports GPU acceleration for improved performance and ChromaDB for vector indexing.

Features:
- Vector index creation of source code files with ChromaDB and Ollama embeddings
- .codechatignore support for excluding files/folders
- Interactive querying of indexed codebases
- GPU and Apple Silicon acceleration (CUDA/MPS) for embeddings and chat
- Project management capabilities (indexing, analysis, listing)
- Multi-language support (Java, Kotlin, Python, JS, TS, Go, Rust, C++, etc.)
- Dry-run mode for previewing indexing operations

Environment Variables:
- OLLAMA_MODEL: Default chat model (e.g., "phi4:14b")
- OLLAMA_EMBED_MODEL: Embedding model (e.g., "nomic-embed-text")
- OLLAMA_URL: Ollama API endpoint (default: http://localhost:11434)
- INDEX_ROOT: Root directory for storing vector indexes
"""

import os
import sys
import argparse
import shutil
import time
import re

import chromadb
import torch
from pathlib import Path
from typing import Optional, List, Dict, Any
from dotenv import load_dotenv
from pathspec import PathSpec
from packaging import version

# LlamaIndex imports (Settings-based API; the unused ServiceContext import was
# dropped because it is no longer needed and has been removed from llama_index.core)
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext
from llama_index.core.settings import Settings
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.prompts import PromptTemplate
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.ollama import Ollama

# --- Configuration ---
load_dotenv()

DEFAULT_MODEL = os.getenv("OLLAMA_MODEL", "phi4:14b")
# DEFAULT_EMBED_MODEL = os.getenv("OLLAMA_EMBED_MODEL", "nomic-embed-text")
DEFAULT_EMBED_MODEL = os.getenv("OLLAMA_EMBED_MODEL", "all-minilm")
DEFAULT_OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")
INDEX_ROOT = os.getenv("INDEX_ROOT", os.path.expanduser("~/.codechat/indexes"))
ALLOWED_EXTENSIONS = {".java", ".kt", ".py", ".js", ".ts", ".go", ".rs", ".cpp", ".h", ".xml", ".properties", ".yml",
                      ".md"}

# Quality Improvement Defaults
DEFAULT_CHUNK_SIZE = 512
DEFAULT_CHUNK_OVERLAP = 128
DEFAULT_SIMILARITY_TOP_K = 3

# Enhanced QA Prompt
CODE_QA_PROMPT = PromptTemplate("""
You are a senior developer analyzing this codebase. Provide:
1. Concise technical explanation
2. Relevant code snippets with source file references
3. Usage examples when appropriate
4. Any potential issues or caveats
Format your response in markdown with proper code blocks.
Context: {context_str}
Question: {query_str}
Answer:
""")

# Timeout settings (seconds)
DEFAULT_TIMEOUT = 60
MAX_RETRIES = 2

# Minimum required versions
MIN_CHROMADB_VERSION = "0.4.0"
MIN_TORCH_VERSION = "1.10.0"
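
# The settings above are read from the environment (optionally via a .env file
# loaded by load_dotenv()). A minimal illustrative .env follows; the values are
# examples that mirror this script's own defaults, and the INDEX_ROOT path is a
# placeholder, not a requirement:
#
#   OLLAMA_MODEL=phi4:14b
#   OLLAMA_EMBED_MODEL=all-minilm
#   OLLAMA_URL=http://localhost:11434
#   OLLAMA_TEMPERATURE=0.0
#   OLLAMA_NUM_CTX=8192
#   INDEX_ROOT=/home/user/.codechat/indexes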


def validate_project_name(name: str) -> bool:
    """
    Validates a project name to ensure it is safe for use as a filesystem directory name.

    Args:
        name (str): The project name to validate.

    Returns:
        bool: True if the name is valid (contains only letters, numbers, underscores, or hyphens), False otherwise.
    """
    if not name:
        return False
    return bool(re.match(r'^[a-zA-Z0-9_-]+$', name))
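
# Quick illustration of the rule enforced above (not executed at import time):
#   validate_project_name("my-project_2")  -> True
#   validate_project_name("my project")    -> False  (whitespace rejected)
#   validate_project_name("../etc")        -> False  (path separators rejected)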


def check_dependencies() -> None:
    """
    Checks the versions of required dependencies and prints warnings if they
    do not meet the minimum required versions.
    """
    try:
        chroma_version = version.parse(chromadb.__version__)
        if chroma_version < version.parse(MIN_CHROMADB_VERSION):
            print(f"⚠️ ChromaDB version {chromadb.__version__} is below minimum required {MIN_CHROMADB_VERSION}")
        torch_version = version.parse(torch.__version__)
        if torch_version < version.parse(MIN_TORCH_VERSION):
            print(f"⚠️ PyTorch version {torch.__version__} is below minimum required {MIN_TORCH_VERSION}")
    except Exception as e:
        print(f"⚠️ Could not verify dependency versions: {str(e)}")


def get_device(force_cpu: bool = False) -> str:
    """
    Determines the most suitable compute device for processing.

    Args:
        force_cpu (bool): If True, always return 'cpu' regardless of available hardware.

    Returns:
        str: The device to use ('cuda', 'mps', or 'cpu').
    """
    if not force_cpu and torch.cuda.is_available():
        return "cuda"
    elif not force_cpu and torch.backends.mps.is_available():
        return "mps"  # Apple Silicon
    return "cpu"


def should_index_file(path: Path) -> bool:
    """
    Checks whether a given file should be indexed based on its file extension.

    Args:
        path (Path): The file path to check.

    Returns:
        bool: True if the file extension is supported; False otherwise.
    """
    return path.suffix.lower() in ALLOWED_EXTENSIONS


def gather_files(
        codebase_path: Path,
        verbose: bool = False,
        ignore_file_path: Optional[Path] = None
) -> List[str]:
    """
    Recursively collects file paths from a codebase directory, applying .codechatignore patterns if present.

    Args:
        codebase_path (Path): Root directory of the codebase.
        verbose (bool, optional): Enables detailed output during file collection. Defaults to False.
        ignore_file_path (Optional[Path], optional): Custom path to a .codechatignore file.
            If None, looks for .codechatignore in default locations. Defaults to None.

    Returns:
        List[str]: A list of string paths to source files eligible for indexing.
    """
    # Look for ignore files in priority order
    possible_ignore_files = []
    if ignore_file_path:
        possible_ignore_files.append(ignore_file_path)
    possible_ignore_files.extend([
        Path.cwd() / ".codechatignore",
        codebase_path / ".codechatignore"
    ])
    spec = None
    for ignore_file in possible_ignore_files:
        if ignore_file.exists():
            if verbose:
                print(f"🔍 Found .codechatignore at {ignore_file}")
            with ignore_file.open("r", encoding="utf-8") as f:
                patterns = [line.strip() for line in f if line.strip() and not line.startswith("#")]
            if verbose and patterns:
                print(f"📜 Ignore patterns: {patterns}")
            spec = PathSpec.from_lines("gitwildmatch", patterns)
            break
    files = []
    for p in codebase_path.rglob("*"):
        if not p.is_file():
            continue
        if not should_index_file(p):
            if verbose:
                print(f"➖ Skipping (extension): {p}")
            continue
        try:
            rel_path = p.relative_to(codebase_path).as_posix()
            if verbose:
                print(f"🔄 Testing path: {rel_path}")
        except ValueError:
            if verbose:
                print(f"⚠️ Path error: {p}")
            continue
        if spec and spec.match_file(rel_path):
            if verbose:
                print(f"🚫 Excluded by pattern: {rel_path}")
            continue
        files.append(str(p))
        if verbose:
            print(f"✅ Added: {p}")
    return files
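
# Example .codechatignore (gitwildmatch patterns, as interpreted by PathSpec above).
# The entries are purely illustrative and not shipped with this script:
#
#   # build output and dependency folders
#   target/
#   build/
#   node_modules/
#   *.min.js
#   **/generated/**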


def verify_metadata(index: VectorStoreIndex) -> bool:
    """
    Verifies that metadata is present for each node in the index.

    Args:
        index (VectorStoreIndex): VectorStoreIndex instance to verify.

    Returns:
        bool: True if all nodes contain source metadata; False otherwise.
    """
    for node_id, node in index.docstore.docs.items():
        if not node.metadata.get('source_file'):
            print(f"⚠️ Missing source_file in node {node_id}")
            return False
    return True


def build_index(
        project: str,
        codebase_path: Path,
        embed_model: str,
        device: str,
        clean: bool = False,
        dry_run: bool = False,
        verbose: bool = False,
        ignore_file_path: Optional[Path] = None,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
) -> None:
    """
    Builds a ChromaDB-based vector index for the specified project.

    Args:
        project (str): Project name for indexing.
        codebase_path (Path): Path to the source code directory.
        embed_model (str): The embedding model name for document vectorization.
        device (str): Compute device identifier (e.g., 'cuda', 'cpu', 'mps').
        clean (bool, optional): If True, deletes and rebuilds the index. Defaults to False.
        dry_run (bool, optional): If True, only simulates the indexing process. Defaults to False.
        verbose (bool, optional): Enables debug output. Defaults to False.
        ignore_file_path (Optional[Path], optional): Custom .codechatignore path. Defaults to None.
        chunk_size (int, optional): Maximum token chunk size for embedding. Defaults to DEFAULT_CHUNK_SIZE.
        chunk_overlap (int, optional): Overlap between chunks. Defaults to DEFAULT_CHUNK_OVERLAP.

    Raises:
        SystemExit: If no indexable files are found.
    """
    project_index_path = Path(INDEX_ROOT) / project
    if clean:
        shutil.rmtree(project_index_path, ignore_errors=True)
    os.makedirs(project_index_path, exist_ok=True)
    indexed_files = gather_files(codebase_path, verbose, ignore_file_path)
    if not indexed_files:
        print("❌ No indexable files found.")
        sys.exit(1)
    if dry_run:
        print(f"✅ Dry run complete (would index {len(indexed_files)} files)")
        return
    # Document processing
    node_parser = TokenTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separator="\n"
    )
    documents = SimpleDirectoryReader(
        input_files=indexed_files,
        file_metadata=lambda x: {
            'source_file': str(Path(x).absolute()),
            'file_path': str(Path(x).relative_to(codebase_path)),
            'file_name': Path(x).name,
            'timestamp': time.time()
        }
    ).load_data()
    # Configure Settings instead of ServiceContext
    Settings.llm = Ollama(model=DEFAULT_MODEL, base_url=DEFAULT_OLLAMA_URL)
    Settings.embed_model = OllamaEmbedding(model_name=embed_model, device=device)
    Settings.node_parser = node_parser
    Settings.chunk_size = chunk_size
    Settings.chunk_overlap = chunk_overlap
    # Create index
    chroma_client = chromadb.PersistentClient(path=str(project_index_path))
    vector_store = ChromaVectorStore(chroma_collection=chroma_client.get_or_create_collection(f"{project}_collection"))
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=StorageContext.from_defaults(vector_store=vector_store),
        show_progress=verbose
    )
    index.storage_context.persist()
    if not verify_metadata(index):
        print("❌ Metadata issues detected - some sources may show as Unknown")
    print(f"\n✅ Index built with {len(indexed_files)} files (chunk size: {chunk_size}, overlap: {chunk_overlap})")


def chat(
        project: str,
        model: str,
        embed_model: str,
        temperature: float,
        num_ctx: int,
        top_p: float,
        repeat_penalty: float,
        device: str,
        verbose: bool = False,
        similarity_top_k: int = DEFAULT_SIMILARITY_TOP_K,
        timeout: int = DEFAULT_TIMEOUT,
        max_retries: int = MAX_RETRIES,
        chunk_size: int = DEFAULT_CHUNK_SIZE,
        chunk_overlap: int = DEFAULT_CHUNK_OVERLAP
) -> None:
    """
    Start an interactive chat session with an indexed codebase.

    Args:
        project (str): Name of the project to chat with.
        model (str): Ollama model name to use for chat.
        embed_model (str): Ollama embedding model name.
        temperature (float): Controls randomness of responses (0.0-1.0).
        num_ctx (int): Context window size in tokens.
        top_p (float): Top-p sampling parameter.
        repeat_penalty (float): Penalty for repeated tokens.
        device (str): Compute device to use ('cuda', 'mps', or 'cpu').
        verbose (bool, optional): If True, prints detailed progress information. Defaults to False.
        similarity_top_k (int, optional): Number of similar chunks to retrieve. Defaults to DEFAULT_SIMILARITY_TOP_K.
        timeout (int, optional): Query timeout in seconds. Defaults to DEFAULT_TIMEOUT.
        max_retries (int, optional): Number of retry attempts on timeout. Defaults to MAX_RETRIES.
        chunk_size (int, optional): Text chunk size for processing. Defaults to DEFAULT_CHUNK_SIZE.
        chunk_overlap (int, optional): Context overlap between chunks. Defaults to DEFAULT_CHUNK_OVERLAP.

    Raises:
        SystemExit: If no index is found for the specified project.
    """
    project_index_path = Path(INDEX_ROOT) / project
    if not project_index_path.exists():
        print(f"❌ No index found for '{project}'. Run with --reindex first.")
        sys.exit(1)
    # Configure for quality responses
    Settings.embed_model = OllamaEmbedding(
        model_name=embed_model,
        base_url=DEFAULT_OLLAMA_URL,
        device=device
    )
    Settings.llm = Ollama(
        model=model,
        base_url=DEFAULT_OLLAMA_URL,
        temperature=temperature,
        num_ctx=num_ctx,
        top_p=top_p,
        repeat_penalty=repeat_penalty,
        device=device,
        request_timeout=timeout
    )
    # Quality-optimized query engine
    chroma_client = chromadb.PersistentClient(path=str(project_index_path))
    vector_store = ChromaVectorStore(chroma_collection=chroma_client.get_collection(f"{project}_collection"))
    index = VectorStoreIndex.from_vector_store(vector_store)
    query_engine = index.as_query_engine(
        similarity_top_k=similarity_top_k,
        include_metadata=True,
        metadata_fields=['source_file', 'file_name', 'file_path'],
        vector_store_query_mode="hybrid",
        response_mode="tree_summarize",
        text_qa_template=CODE_QA_PROMPT,
        verbose=verbose,
        timeout=timeout,
        retry_on_timeout=True,
        max_retries=max_retries
    )
    sample_embedding = Settings.embed_model.get_text_embedding("sample code class")
    print(f"Embedding dimension: {len(sample_embedding)}")

    def debug_index_metadata(index: VectorStoreIndex, verbose: bool = True) -> None:
        """Debug function to check what metadata exists in the index.

        Args:
            index (VectorStoreIndex): The index to debug.
            verbose (bool, optional): If True, prints detailed information. Defaults to True.
        """
        if not verbose:
            return
        print("\n🔍 Debugging index metadata:")
        try:
            collection = index._vector_store._collection
            print(f"Collection name: {collection.name}")
            print(f"Total vectors: {collection.count()}")
            # Get sample items with metadata
            items = collection.get(limit=3, include=["metadatas", "documents"])
            if items and "metadatas" in items:
                print("\nSample metadata found:")
                for i, (meta, doc) in enumerate(zip(items["metadatas"], items["documents"][:3])):
                    print(f" {i + 1}. Metadata: {meta}")
                    print(f" First 50 chars: {doc[:50]}...\n")
            else:
                print("⚠️ No metadata found in collection")
        except Exception as e:
            print(f"⚠️ Error checking metadata: {str(e)}")
            print("Trying alternative access method...")
            try:
                # Alternative way to check nodes
                nodes = index.docstore.docs
                print(f"\nFound {len(nodes)} nodes in docstore")
                for node_id, node in list(nodes.items())[:3]:
                    print(f"Node {node_id}:")
                    print(f" Metadata: {node.metadata}")
                    print(f" Text: {node.text[:50]}...\n")
            except Exception as e2:
                print(f"⚠️ Couldn't access docstore either: {str(e2)}")

    print("\n🔎 Verifying index structure...")
    debug_index_metadata(index, verbose=True)
    # Additional verification
    print("\n🔍 Index Verification:")
    try:
        print(f"- Vectors: {index._vector_store._collection.count()}")
        if hasattr(index, 'docstore'):
            print(f"- Documents: {len(index.docstore.docs)}")
        else:
            print("- Docstore: Not available (normal for ChromaDB)")
    except Exception as e:
        print(f"⚠️ Verification note: {str(e)}")

    # Response enhancement functions
    def enhance_query(query: str) -> str:
        """Add context based on query type to get better responses.

        Args:
            query (str): The original user query.

        Returns:
            str: The enhanced query with additional context.
        """
        query = query.strip()
        lower_query = query.lower()
        # Module/package queries (matched case-insensitively)
        if "module" in lower_query or "package" in lower_query:
            return ("List all Java modules/packages with their relative paths, "
                    "main classes, and 1-2 sentence descriptions. "
                    "Include the module's purpose and key features.")
        # Explanation queries
        elif any(q_word in lower_query for q_word in ["how", "why", "explain"]):
            return f"{query} (provide detailed explanation with code references)"
        # Example queries
        elif "example" in lower_query:
            return f"{query} (include practical usage examples)"
        # Default case - return original query
        return query
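
    # Illustrative rewrites produced by enhance_query() for assumed user inputs:
    #   "list all modules"               -> replaced by the canned module/package listing instruction
    #   "how does indexing work?"        -> original query + " (provide detailed explanation with code references)"
    #   "example usage of gather_files"  -> original query + " (include practical usage examples)"
    #   "where is main()?"               -> returned unchanged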

    def format_response(response: Any) -> str:
        """Formats the response with source references.

        Args:
            response (Any): The query response object.

        Returns:
            str: The formatted response text with sources.
        """
        text = response.response
        # Source nodes handling
        if hasattr(response, 'source_nodes') and response.source_nodes:
            sources = []
            for node in response.source_nodes[:3]:  # Show top 3 sources
                source = node.metadata.get('source_file') or node.metadata.get('file_path', 'Unknown')
                if source != 'Unknown':
                    try:
                        # First try making it relative to INDEX_ROOT
                        source = str(Path(source).relative_to(INDEX_ROOT))
                    except ValueError:
                        try:
                            # If that fails, just show the filename
                            source = Path(source).name
                        except Exception:
                            source = "Unknown path"
                sources.append(f"- {source} (score: {node.score:.2f})")
            text += "\n\n🔍 Sources:\n" + "\n".join(sources)
        return text

    # Interactive chat loop
    print(f"\n💬 Chatting with {project} (Enhanced Mode)")
    print("Type 'exit' or press Ctrl+C to quit\n")
    # Show optimization tips if settings might cause performance issues
    optimization_params = {
        'timeout': timeout,
        'chunk_size': chunk_size,
        'chunk_overlap': chunk_overlap,
        'similarity_top_k': similarity_top_k,
        'model': model
    }
    tips = get_optimization_tips(optimization_params)
    if tips:
        print("\n💡 Performance Tips:")
        for tip in tips:
            print(f" - {tip}")
        print()
    while True:
        try:
            question = input("🤖 > ").strip()
            if question.lower() in {"exit", "quit"}:
                break
            start_time = time.time()
            try:
                response = query_engine.query(enhance_query(question))
                print(f"\n{format_response(response)}")
                # DEBUG: Show raw source nodes
                if hasattr(response, 'source_nodes'):
                    print("\n🔍 DEBUG - Source Nodes:")
                    for i, node in enumerate(response.source_nodes[:3]):
                        print(f"Node {i + 1}:")
                        print(f" Score: {node.score}")
                        try:
                            print(f" Path: {node.metadata.get('file_path')}")
                            print(f" Source: {node.metadata.get('source_file')}")
                        except Exception as e:
                            print(f" Metadata error: {str(e)}")
                        print(f" Text: {node.text[:100]}...")
            except Exception as e:
                if "timeout" in str(e).lower():
                    print("\n⏱️ The query timed out. Try:")
                    print("- Asking a more specific question")
                    print(f"- Increasing timeout (current: {timeout}s)")
                    print(f"- Reducing chunk size (current: {chunk_size})")
                else:
                    print(f"\n❌ Query Error: {str(e)}")
            print(f"\n⏱️ Response time: {time.time() - start_time:.2f}s")
        except KeyboardInterrupt:
            print("\n👋 Exiting...")
            break


def list_projects(verbose: bool = False) -> None:
    """
    Display all indexed projects with accurate status.

    Args:
        verbose (bool, optional): If True, shows additional details about each project. Defaults to False.
    """
    index_root_path = Path(INDEX_ROOT)
    if not index_root_path.exists():
        print("No projects indexed yet.")
        return
    print("📂 Indexed Projects:")
    for project_dir in sorted(index_root_path.iterdir()):
        if project_dir.is_dir():
            status = "❌"
            size_info = "unknown"
            try:
                client = chromadb.PersistentClient(path=str(project_dir))
                collections = client.list_collections()
                if collections:
                    # Find matching collection
                    for col in collections:
                        if col.name == project_dir.name or col.name == f"{project_dir.name}_collection":
                            count = col.count()
                            size_info = f"{count} vectors"
                            status = "✅"
                            break
            except Exception as e:
                if verbose:
                    print(f"⚠️ Error checking {project_dir.name}: {str(e)}")
            print(f" - {project_dir.name} {status} ({size_info})")


def show_config(args: argparse.Namespace) -> None:
    """
    Display the current configuration including hardware and model settings.

    Args:
        args (argparse.Namespace): Parsed command-line arguments.
    """
    device = get_device(force_cpu=args.cpu)
    gpu_type = "None"
    if device == "cuda":
        gpu_type = torch.cuda.get_device_name(0)
    elif device == "mps":
        gpu_type = "Apple Silicon (MPS)"
    print("⚙️ Current Configuration:")
    print(f" Project: {args.project if hasattr(args, 'project') else 'N/A'}")
    print(f" Model: {args.model}")
    print(f" Embed Model: {args.embed_model}")
    print(f" Device: {device.upper()} ({gpu_type})")
    print(f" Temperature: {args.temperature}")
    print(f" Context Window: {args.num_ctx} tokens")
    print("\n🛠️ Paths:")
    print(f" Index Root: {INDEX_ROOT}")
    print(f" Ollama URL: {DEFAULT_OLLAMA_URL}")
    # Show ignore file info if available
    ignore_locations = [
        Path(args.ignore_file) if hasattr(args, 'ignore_file') and args.ignore_file else None,
        Path.cwd() / ".codechatignore",
        Path(args.reindex) / ".codechatignore" if hasattr(args, 'reindex') and args.reindex else None
    ]
    found = False
    for loc in ignore_locations:
        if loc and loc.exists():
            print(f"\n🔍 Active .codechatignore at: {loc}")
            with open(loc, 'r') as f:
                print(" Ignore Patterns:")
                for line in f:
                    line = line.strip()
                    if line and not line.startswith("#"):
                        print(f" - {line}")
            found = True
            break
    if not found:
        print("\n⚠️ No .codechatignore file found")


def analyze_project(project: str, verbose: bool = False) -> None:
    """
    Display detailed analytics about an indexed project.

    Args:
        project (str): Name of the project to analyze.
        verbose (bool, optional): If True, shows additional storage details. Defaults to False.

    Raises:
        None: This function handles errors gracefully and prints messages instead of raising exceptions.
    """
    project_path = Path(INDEX_ROOT) / project
    if not project_path.exists():
        print(f"❌ Project '{project}' not found")
        return
    print(f"\n📊 Analysis for '{project}':")
    print("─" * 50)
    # 1. Enhanced ChromaDB Stats
    try:
        client = chromadb.PersistentClient(path=str(project_path))
        collection = client.get_collection(f"{project}_collection")
        # Count vectors and their distribution
        count = collection.count()
        metadata = collection.get(include=["metadatas"])
        file_types = {}
        file_sizes = {}
        if metadata and "metadatas" in metadata:
            for item in metadata["metadatas"]:
                if item and isinstance(item, dict) and "file_path" in item:
                    try:
                        ext = Path(item["file_path"]).suffix.lower()
                        file_types[ext] = file_types.get(ext, 0) + 1
                        # Get file size if available
                        if "file_size" in item:
                            file_sizes[ext] = file_sizes.get(ext, 0) + int(item["file_size"])
                    except (TypeError, AttributeError) as e:
                        if verbose:
                            print(f"⚠️ Could not process metadata item: {str(e)}")
                        continue
        print("\n📈 Embedding Statistics:")
        print(f" - Total vectors: {count}")
        if file_types:
            print(" - File type distribution:")
            for ext, num in sorted(file_types.items(), key=lambda x: x[1], reverse=True):
                size_info = ""
                if ext in file_sizes:
                    size_info = f" ({file_sizes[ext] / 1024:.1f} KB total)"
                print(f"   - {ext if ext else 'no-extension'}: {num} vectors{size_info}")
    except Exception as e:
        print(f"⚠️ Couldn't read ChromaDB collection: {str(e)}")
        if "truth value of an array" in str(e):
            print("💡 Try upgrading ChromaDB: pip install --upgrade chromadb numpy")
    # 2. Storage Analysis
    try:
        total_size = sum(f.stat().st_size for f in project_path.glob('**/*') if f.is_file())
        print("\n💾 Storage Usage:")
        print(f" - Index size: {total_size / 1024 / 1024:.2f} MB")
        print(f" - Files: {len(list(project_path.glob('**/*')))}")
        if verbose:
            print("\n🔍 Detailed Storage Breakdown:")
            for item in project_path.iterdir():
                if item.is_file():
                    print(f" - {item.name}: {item.stat().st_size / 1024:.1f} KB")
                elif item.is_dir():
                    dir_size = sum(f.stat().st_size for f in item.glob('**/*') if f.is_file())
                    print(f" - {item.name}/: {dir_size / 1024:.1f} KB")
    except Exception as e:
        print(f"⚠️ Couldn't analyze storage: {str(e)}")
    # 3. Health Check - Updated for ChromaDB v0.4+ format
    print("\n🩺 Health Check:")
    healthy = True
    # Required files for ChromaDB v0.4+
    required_files = {
        "chroma.sqlite3": "SQLite database",
    }
    # Optional files
    optional_files = {
        "chroma_settings.json": "Settings file",
        "chroma-embeddings.parquet": "Embeddings data (legacy)"
    }
    # Check required files
    for file, desc in required_files.items():
        if (project_path / file).exists():
            print(f" - ✅ {desc} present")
        else:
            print(f" - ❌ {desc} missing!")
            healthy = False
    # Check optional files
    for file, desc in optional_files.items():
        if (project_path / file).exists():
            print(f" - ☑️ {desc} present")
        else:
            print(f" - ⚠️ {desc} not found (optional)")
    # Check collection exists and is accessible
    try:
        client = chromadb.PersistentClient(path=str(project_path))
        collection = client.get_collection(f"{project}_collection")
        print(f" - ✅ Collection accessible ({collection.count()} vectors)")
    except Exception as e:
        print(f" - ❌ Collection error: {str(e)}")
        healthy = False
    print(f"\n{'✅ Index is healthy' if healthy else '❌ Index has issues!'}")
    print("─" * 50)


def repair_project(project: str, verbose: bool = False) -> None:
    """
    Attempt to repair a potentially corrupted index.

    Args:
        project (str): Name of the project to repair.
        verbose (bool, optional): If True, shows additional repair details. Defaults to False.
    """
    project_path = Path(INDEX_ROOT) / project
    if not project_path.exists():
        print(f"❌ Project directory '{project}' not found")
        return
    print(f"\n🔧 Repairing project '{project}'...")
    try:
        client = chromadb.PersistentClient(path=str(project_path))
        # ChromaDB uses different collection naming in newer versions
        collections = client.list_collections()
        if not collections:
            raise ValueError("No collections found in project directory")
        # Try both naming conventions
        collection_name = None
        for col in collections:
            if col.name == project or col.name == f"{project}_collection":
                collection_name = col.name
                break
        if not collection_name:
            raise ValueError(f"No matching collection found (tried: '{project}', '{project}_collection')")
        if verbose:
            print(f"🔄 Found collection: {collection_name}")
        collection = client.get_collection(collection_name)
        count = collection.count()
        print("\n✅ Repair successful - project is healthy")
        print(f" Collection: {collection_name}")
        print(f" Total vectors: {count}")
    except Exception as e:
        print(f"\n❌ Repair failed: {str(e)}")
        print("\nRecommended solutions:")
        print(f"1. Clean reindex: --project {project} --reindex /path/to/code --clean")
        print("2. Manual repair steps:")
        print(f" - Delete directory: {project_path}")
        print(f" - Check collection name in: {project_path}/chroma.sqlite3")


def get_optimization_tips(params: Dict[str, Any]) -> List[str]:
    """Generate performance optimization suggestions based on current parameters.

    Args:
        params (Dict[str, Any]): Dictionary of current configuration parameters.

    Returns:
        List[str]: List of optimization tips.
    """
    tips = []
    # Timeout-related tips
    if params['timeout'] < 30:
        tips.append(f"Increase timeout (current: {params['timeout']}s)")
    # Chunking-related tips
    if params['chunk_size'] > 768:
        tips.append(f"Reduce chunk size (current: {params['chunk_size']})")
    if params['chunk_overlap'] > 128:
        tips.append(f"Reduce chunk overlap (current: {params['chunk_overlap']})")
    # Retrieval-related tips
    if params['similarity_top_k'] > 3:
        tips.append(f"Reduce retrieved chunks (current: {params['similarity_top_k']})")
    # Model-related tips
    if "34b" in params['model'] or "70b" in params['model']:
        tips.append(f"Try smaller model (current: {params['model']})")
    return tips
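
# Illustration of the tips produced above for an assumed configuration
# (values chosen only so that every branch fires; not a recommendation):
#   get_optimization_tips({'timeout': 20, 'chunk_size': 1024, 'chunk_overlap': 256,
#                          'similarity_top_k': 5, 'model': 'llama3:70b'})
#   -> ['Increase timeout (current: 20s)',
#       'Reduce chunk size (current: 1024)',
#       'Reduce chunk overlap (current: 256)',
#       'Reduce retrieved chunks (current: 5)',
#       'Try smaller model (current: llama3:70b)']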


def main():
    """Entry point for the Codebase Chat CLI application.

    Handles command-line arguments and orchestrates the main application flow including:
    - Dependency checks
    - Project management (listing, analyzing, repairing)
    - Indexing operations
    - Chat functionality

    Command Line Arguments:
        --project PROJECT_NAME : Specifies project to operate on (for chat/reindex/repair)
        --list-projects : Lists all indexed projects
        --show-config : Displays current configuration
        --repair PROJECT : Attempts to repair a corrupted index
        --reindex PATH : Path to codebase to index
        --analyze : Shows detailed project analysis
        --model MODEL_NAME : Specifies Ollama model to use (default: DEFAULT_MODEL)
        --embed-model EMBED_MODEL : Specifies Ollama embedding model (default: DEFAULT_EMBED_MODEL)
        --cpu : Forces CPU mode
        --gpu : Forces GPU mode if available
        --temperature FLOAT : Sets model temperature (default: 0.0)
        --num-ctx INT : Sets context window size (default: 8192)
        --top-p FLOAT : Sets top-p sampling value (default: 1.0)
        --repeat-penalty FLOAT : Sets repetition penalty (default: 1.0)
        --clean : Deletes and recreates the index
        --dry-run : Only lists files to be indexed
        --verbose : Shows detailed debug output
        --ignore-file PATH : Path to custom .codechatignore file
        --chunk-size INT : Text chunk size for processing (default: DEFAULT_CHUNK_SIZE)
        --chunk-overlap INT : Context overlap between chunks (default: DEFAULT_CHUNK_OVERLAP)
        --similarity-top-k INT : Number of similar chunks to retrieve (default: DEFAULT_SIMILARITY_TOP_K)
        --timeout INT : Query timeout in seconds (default: DEFAULT_TIMEOUT)
        --max-retries INT : Number of retry attempts on timeout (default: MAX_RETRIES)
    """
    # Check dependencies first
    check_dependencies()
    parser = argparse.ArgumentParser(
        description="Quality-Enhanced Codebase Chat CLI",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    # Create mutually exclusive group for main actions
    action_group = parser.add_mutually_exclusive_group(required=True)
    action_group.add_argument("--project", help="Project name (for chat/reindex/repair)")
    action_group.add_argument("--list-projects", action="store_true",
                              help="List all indexed projects")
    action_group.add_argument("--show-config", action="store_true",
                              help="Show current configuration")
    action_group.add_argument("--repair", metavar="PROJECT",
                              help="Attempt to repair a corrupted index")
    # Project-specific arguments
    parser.add_argument("--reindex", metavar="PATH", help="Path to codebase to index")
    parser.add_argument("--analyze", action="store_true",
                        help="Show detailed project analysis")
    # Model settings
    parser.add_argument("--model", default=DEFAULT_MODEL, help="Ollama model name")
    parser.add_argument("--embed-model", default=DEFAULT_EMBED_MODEL,
                        help=f"Ollama embedding model (default: {DEFAULT_EMBED_MODEL})")
    # Hardware control
    parser.add_argument("--cpu", action="store_true", help="Force CPU mode")
    parser.add_argument("--gpu", action="store_true", help="Force GPU mode if available")
    # Performance tuning
    parser.add_argument("--temperature", type=float,
                        default=float(os.getenv("OLLAMA_TEMPERATURE", 0.0)),
                        help="Model temperature")
    parser.add_argument("--num-ctx", type=int,
                        default=int(os.getenv("OLLAMA_NUM_CTX", 8192)),
                        help="Context window size")
    parser.add_argument("--top-p", type=float,
                        default=float(os.getenv("OLLAMA_TOP_P", 1.0)),
                        help="Top-p sampling")
    parser.add_argument("--repeat-penalty", type=float,
                        default=float(os.getenv("OLLAMA_REPEAT_PENALTY", 1.0)),
                        help="Repetition penalty")
    # Utility flags
    parser.add_argument("--clean", action="store_true",
                        help="Delete and recreate the index")
    parser.add_argument("--dry-run", action="store_true",
                        help="Only list files to be indexed")
    parser.add_argument("--verbose", action="store_true",
                        help="Show detailed debug output")
    parser.add_argument("--ignore-file",
                        help="Path to custom .codechatignore file")
    # Add quality parameters
    parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE,
                        help="Text chunk size for processing")
    parser.add_argument("--chunk-overlap", type=int, default=DEFAULT_CHUNK_OVERLAP,
                        help="Context overlap between chunks")
    parser.add_argument("--similarity-top-k", type=int, default=DEFAULT_SIMILARITY_TOP_K,
                        help="Number of similar chunks to retrieve")
    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT,
                        help="Query timeout in seconds")
    parser.add_argument("--max-retries", type=int, default=MAX_RETRIES,
                        help="Number of retry attempts on timeout")
    args = parser.parse_args()
    # Handle global commands first
    if args.list_projects:
        list_projects(verbose=args.verbose)
        return
    if args.show_config:
        show_config(args)
        return
    if args.repair:
        repair_project(args.repair, verbose=args.verbose)
        return
    # Validate project-specific commands
    if not hasattr(args, 'project') or not args.project:
        print("❌ Project name is required for this action")
        parser.print_help()
        sys.exit(1)
    if not validate_project_name(args.project):
        print("❌ Invalid project name. Only alphanumeric, underscore and hyphen characters are allowed.")
        sys.exit(1)
    # Device selection
    if args.gpu and args.cpu:
        print("❌ Cannot force both GPU and CPU modes")
        sys.exit(1)
    device = get_device(force_cpu=args.cpu)
    if args.gpu and device != "cuda":
        print("⚠️ GPU requested but not available - falling back to CPU")
        device = "cpu"
    if args.verbose:
        print("\n⚙️ Configuration:")
        print(f" Device: {device.upper()} ({'✅ GPU' if device == 'cuda' else '⚠️ CPU'})")
        print(f" Model: {args.model}")
        print(f" Embed Model: {args.embed_model}")
        if hasattr(args, 'project'):
            print(f" Project: {args.project}")
            print(f" Index Location: {Path(INDEX_ROOT) / args.project}\n")
    # Handle project actions
    if args.analyze:
        analyze_project(args.project, args.verbose)
    elif args.reindex:
        build_index(
            project=args.project,
            codebase_path=Path(args.reindex),
            embed_model=args.embed_model,
            device=device,
            clean=args.clean,
            dry_run=args.dry_run,
            verbose=args.verbose,
            ignore_file_path=Path(args.ignore_file) if args.ignore_file else None,
            chunk_size=args.chunk_size,
            chunk_overlap=args.chunk_overlap
        )
    else:
        chat(
            project=args.project,
            model=args.model,
            embed_model=args.embed_model,
            temperature=args.temperature,
            num_ctx=args.num_ctx,
            top_p=args.top_p,
            repeat_penalty=args.repeat_penalty,
            device=device,
            verbose=args.verbose,
            similarity_top_k=args.similarity_top_k,
            timeout=args.timeout,
            max_retries=args.max_retries,
            chunk_size=args.chunk_size,
            chunk_overlap=args.chunk_overlap
        )


if __name__ == "__main__":
    main()
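
# Illustrative invocations (the script filename and all paths/project names below
# are placeholders, not part of the gist):
#
#   python codechat.py --project myapp --reindex /path/to/myapp --clean --verbose
#   python codechat.py --project myapp --reindex /path/to/myapp --dry-run
#   python codechat.py --project myapp                # interactive chat
#   python codechat.py --project myapp --analyze
#   python codechat.py --list-projects
#   python codechat.py --repair myapp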
somera commented on Apr 5, 2025: https://gist.github.com/somera/41a3fc3a7343fef27c48e882da7b328c