women-in-religion-llamaindex.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "private_outputs": true,
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/CliffordAnderson/fa737d6962aa3589089060dd41f97699/women-in-religion-llamaindex.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QKH3WMzX8MqI"
      },
      "source": [
        "## 1. Installation of Required Libraries\n",
        "\n",
        "First, we'll install the core LlamaIndex libraries needed for document indexing and querying."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RGB4dyj0JAXQ"
      },
      "outputs": [],
      "source": [
        "# Install core LlamaIndex libraries for document processing and indexing\n",
        "%pip install -U llama-index llama-index-core\n",
        "\n",
        "# Install OpenRouter integration for accessing various LLMs through a unified API\n",
        "%pip install llama-index-llms-openrouter"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "b8f31f44"
      },
      "source": [
        "# Install embedding libraries for converting text to vector representations\n",
        "%pip install sentence-transformers  # Required for local embedding models\n",
        "%pip install llama-index-embeddings-huggingface  # HuggingFace embedding integration"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "LAs7uflK8MqM"
      },
      "source": [
        "## 2. Google Drive Setup\n",
        "\n",
        "Mount Google Drive to access the document collection stored in the \"Women in Religion AI\" folder."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import drive\n",
        "\n",
        "# Mount Google Drive to access files stored in the cloud\n",
        "drive.mount(\"/content/drive/\")\n",
        "\n",
        "# Navigate to the specific folder containing our document collection\n",
        "%cd \"/content/drive/My Drive/Women in Religion AI\""
      ],
      "metadata": {
        "id": "-8nwoGx8Vf34"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "\n",
        "# Get current working directory and list available files\n",
        "cwd = os.getcwd()\n",
        "files = os.listdir(cwd)\n",
        "\n",
        "print(\"Available files in the Women in Religion AI folder:\")\n",
        "for file in files:\n",
        "    print(f\"  - {file}\")"
      ],
      "metadata": {
        "id": "k_UAfBDIXdFb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qwLvjrUj8MqN"
      },
      "source": [
        "## 3. API Configuration\n",
        "\n",
        "Set up the OpenRouter API key to access language models. OpenRouter provides a unified interface to multiple LLM providers."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "from google.colab import userdata\n",
        "\n",
        "# Retrieve the OpenRouter API key from Colab's secure storage\n",
        "# This key should be stored in Colab's \"Secrets\" section for security\n",
        "try:\n",
        "    api_key = userdata.get('OPEN_ROUTER_API')\n",
        "    if not api_key:\n",
        "        raise ValueError(\"API key is empty\")\n",
        "    os.environ['OPENROUTER_API_KEY'] = api_key\n",
        "    print(\"✓ OpenRouter API key configured successfully\")\n",
        "except Exception as e:\n",
        "    print(\"❌ Error: Could not retrieve OpenRouter API key\")\n",
        "    print(\"Please ensure you've added 'OPEN_ROUTER_API' to Colab's Secrets section\")\n",
        "    print(\"Go to the left sidebar → 🔑 Secrets → Add new secret\")\n",
        "    raise"
      ],
      "metadata": {
        "id": "CDsOgr2AJPTc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9ygGsY3y8MqO"
      },
      "source": [
        "## 4. Language Model Setup\n",
        "\n",
        "Configure the language model that will generate responses to our queries."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.llms.openrouter import OpenRouter\n",
        "\n",
        "# Initialize the language model with specific parameters\n",
        "try:\n",
        "    llm = OpenRouter(\n",
        "        api_key=os.environ['OPENROUTER_API_KEY'],\n",
        "        max_tokens=512,  # Cap on response length; raise for more detailed answers\n",
        "        context_window=4096,  # Context size LlamaIndex assumes when packing prompts (conservative; GPT-4o supports far more)\n",
        "        model=\"openai/gpt-4o\",  # Using GPT-4o via OpenRouter\n",
        "        temperature=0.1  # Low temperature for more consistent, factual responses\n",
        "    )\n",
        "\n",
        "    # Test the connection with a simple query\n",
        "    test_response = llm.complete(\"Hello\")\n",
        "    print(\"✓ Language model (GPT-4o) configured and tested successfully\")\n",
        "\n",
        "except Exception as e:\n",
        "    print(f\"❌ Error configuring language model: {e}\")\n",
        "    print(\"Check your API key and internet connection\")\n",
        "    raise"
      ],
      "metadata": {
        "id": "LUUtOFZH7gGT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iYsrnwQg8MqP"
      },
      "source": [
        "## 5. Embedding Model Configuration\n",
        "\n",
        "Set up the embedding model that will convert text into vector representations for semantic search."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
        "from llama_index.core import Settings\n",
        "\n",
        "# Configure a lightweight, efficient embedding model from HuggingFace\n",
        "# BGE-small is optimized for English text and provides good performance\n",
        "Settings.embed_model = HuggingFaceEmbedding(\n",
        "    model_name=\"BAAI/bge-small-en-v1.5\"\n",
        ")\n",
        "\n",
        "print(\"✓ Embedding model (BGE-small-en-v1.5) configured successfully\")"
      ],
      "metadata": {
        "id": "gPS7EZdCRqV_"
      },
      "execution_count": null,
      "outputs": []
    },
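    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "As an optional sanity check, we can embed a short string directly. This is a minimal sketch using LlamaIndex's standard `get_text_embedding` method; the sample text is arbitrary. BGE-small-en-v1.5 produces 384-dimensional vectors."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Optional sanity check: embed an arbitrary sample string and inspect the vector\n",
        "sample_vector = Settings.embed_model.get_text_embedding(\"women in religion\")\n",
        "\n",
        "print(f\"Vector dimensions: {len(sample_vector)}\")  # 384 for BGE-small-en-v1.5\n",
        "print(f\"First five values: {sample_vector[:5]}\")"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },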
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ol1seAmB8MqQ"
      },
      "source": [
        "## 6. Document Loading\n",
        "\n",
        "Load all documents from the current directory for processing and indexing."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0371970e-c11c-4534-aa22-7cfdfe411bb3"
      },
      "outputs": [],
      "source": [
        "from llama_index.core import SimpleDirectoryReader\n",
        "\n",
        "# Load all documents from the current directory\n",
        "# SimpleDirectoryReader automatically handles various file formats\n",
        "try:\n",
        "    documents = SimpleDirectoryReader(\n",
        "        input_dir=os.getcwd(),\n",
        "        exclude_hidden=True,  # Skip hidden files\n",
        "        recursive=True  # Include subdirectories\n",
        "    ).load_data()\n",
        "\n",
        "    if not documents:\n",
        "        print(\"⚠️ Warning: No documents found in the current directory\")\n",
        "        print(\"Please ensure your documents are in the correct folder\")\n",
        "    else:\n",
        "        print(f\"✓ Successfully loaded {len(documents)} documents\")\n",
        "\n",
        "        # Display document statistics\n",
        "        file_types = {}\n",
        "        total_chars = 0\n",
        "\n",
        "        for doc in documents:\n",
        "            file_name = doc.metadata.get('file_name', 'unknown')\n",
        "            file_ext = os.path.splitext(file_name)[1].lower() or 'no extension'\n",
        "            file_types[file_ext] = file_types.get(file_ext, 0) + 1\n",
        "            total_chars += len(doc.text)\n",
        "\n",
        "        print(f\"File types: {dict(file_types)}\")\n",
        "        print(f\"Total content: ~{total_chars:,} characters\")\n",
        "\n",
        "except Exception as e:\n",
        "    print(f\"❌ Error loading documents: {e}\")\n",
        "    print(\"Check that you're in the correct directory and have readable files\")\n",
        "    raise"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "k0nDfeF48MqR"
      },
      "source": [
        "## 7. Vector Index Creation\n",
        "\n",
        "Create a vector store index from the loaded documents. This process:\n",
        "1. Splits documents into chunks\n",
        "2. Generates embeddings for each chunk\n",
        "3. Stores vectors for fast similarity search"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.core import VectorStoreIndex\n",
        "from llama_index.core.node_parser import SentenceSplitter\n",
        "import time\n",
        "\n",
        "# Configure text splitting for better chunking\n",
        "text_splitter = SentenceSplitter(\n",
        "    chunk_size=512,  # Smaller chunks for better precision\n",
        "    chunk_overlap=50  # Overlap to preserve context\n",
        ")\n",
        "\n",
        "# Create a vector store index from the loaded documents\n",
        "print(\"Creating vector index... This may take a few minutes.\")\n",
        "start_time = time.time()\n",
        "\n",
        "try:\n",
        "    index = VectorStoreIndex.from_documents(\n",
        "        documents,\n",
        "        transformations=[text_splitter],  # Use custom text splitter\n",
        "        show_progress=True  # Show progress during indexing\n",
        "    )\n",
        "\n",
        "    elapsed_time = time.time() - start_time\n",
        "    print(f\"✓ Vector index created successfully in {elapsed_time:.1f} seconds\")\n",
        "    print(\"The system is now ready to answer questions about the documents!\")\n",
        "\n",
        "except Exception as e:\n",
        "    print(f\"❌ Error creating vector index: {e}\")\n",
        "    print(\"This might be due to memory limitations or embedding model issues\")\n",
        "    raise"
      ],
      "metadata": {
        "id": "v_k6Cf4xSKxu"
      },
      "execution_count": null,
      "outputs": []
    },
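    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "To make step 1 above concrete, we can preview how the splitter chunks a single document. This is a minimal sketch using the node parser's standard `get_nodes_from_documents` method; it assumes the loading cell in Section 6 found at least one document."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Optional: preview how the SentenceSplitter chunks the first loaded document\n",
        "if documents:\n",
        "    preview_nodes = text_splitter.get_nodes_from_documents(documents[:1])\n",
        "    print(f\"First document was split into {len(preview_nodes)} chunks\")\n",
        "    if preview_nodes:\n",
        "        print(f\"First chunk ({len(preview_nodes[0].text)} characters):\")\n",
        "        print(preview_nodes[0].text[:200])"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },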
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pbFG5UQX8MqR"
      },
      "source": [
        "## 8. Query Engine Setup and Testing\n",
        "\n",
        "Create a query engine that combines vector search with language generation to answer questions about the documents."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Create a query engine that combines retrieval and generation\n",
        "query_engine = index.as_query_engine(\n",
        "    llm=llm,  # Use our configured language model\n",
        "    similarity_top_k=5,  # Retrieve top 5 most relevant document chunks\n",
        "    response_mode=\"tree_summarize\",  # Better synthesis of multiple sources\n",
        "    verbose=True  # Show retrieval details\n",
        ")\n",
        "\n",
        "# Test the system with a sample query\n",
        "print(\"Testing the system with a sample query...\\n\")\n",
        "\n",
        "try:\n",
        "    response = query_engine.query(\"Who is Elizabeth Ursic?\")\n",
        "\n",
        "    print(\"RESPONSE:\")\n",
        "    print(\"=\" * 50)\n",
        "    print(response)\n",
        "    print(\"=\" * 50)\n",
        "\n",
        "    # Check response quality\n",
        "    if len(str(response).strip()) < 20:\n",
        "        print(\"⚠️ Warning: Response seems very short. This might indicate:\")\n",
        "        print(\"  - No relevant information found in documents\")\n",
        "        print(\"  - API issues or configuration problems\")\n",
        "\n",
        "except Exception as e:\n",
        "    print(f\"❌ Error during query: {e}\")\n",
        "    print(\"Check your API connection and try again\")\n",
        "    raise"
      ],
      "metadata": {
        "id": "ihPo2qMZKfqy"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "3QpXtnLW8MqS"
      },
      "source": [
        "## 9. Source Document Inspection\n",
        "\n",
        "Examine which specific document sections were used to generate the response."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Display the source nodes (document chunks) that were used to generate the response\n",
        "print(\"SOURCE DOCUMENTS USED:\")\n",
        "print(\"=\" * 50)\n",
        "\n",
        "for i, source_node in enumerate(response.source_nodes, 1):\n",
        "    # Some response modes return nodes without a similarity score, so guard against None\n",
        "    score = source_node.score if source_node.score is not None else 0.0\n",
        "    print(f\"\\nSource {i}:\")\n",
        "    print(f\"File: {source_node.metadata.get('file_name', 'Unknown')}\")\n",
        "    print(f\"Similarity Score: {score:.3f}\")\n",
        "    print(f\"Content Preview: {source_node.text[:200]}...\")\n",
        "    print(\"-\" * 30)"
      ],
      "metadata": {
        "id": "V0ZT7mVfairV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "r6Iqf7tj8MqT"
      },
      "source": [
        "## 10. Index Persistence (Optional)\n",
        "\n",
        "Save the vector index to avoid rebuilding it every time you run the notebook."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Optional: Save the index to disk for faster loading next time\n",
        "PERSIST_DIR = \"./storage\"\n",
        "\n",
        "def save_index():\n",
        "    \"\"\"Save the current index to disk.\"\"\"\n",
        "    try:\n",
        "        index.storage_context.persist(persist_dir=PERSIST_DIR)\n",
        "        print(f\"✓ Index saved to {PERSIST_DIR}\")\n",
        "    except Exception as e:\n",
        "        print(f\"❌ Error saving index: {e}\")\n",
        "\n",
        "def load_index():\n",
        "    \"\"\"Load a previously saved index from disk and return it.\"\"\"\n",
        "    try:\n",
        "        from llama_index.core import StorageContext, load_index_from_storage\n",
        "        storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)\n",
        "        loaded_index = load_index_from_storage(storage_context)\n",
        "        print(f\"✓ Index loaded from {PERSIST_DIR}\")\n",
        "        return loaded_index\n",
        "    except Exception as e:\n",
        "        print(f\"❌ Could not load saved index: {e}\")\n",
        "        return None\n",
        "\n",
        "# Uncomment the line below to save the current index\n",
        "# save_index()\n",
        "\n",
        "print(\"Index persistence functions defined.\")\n",
        "print(\"Use save_index() to save and index = load_index() to reload.\")"
      ],
      "metadata": {
        "id": "RYcEqobn8MqT"
      },
      "execution_count": null,
      "outputs": []
    },
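    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "One way to combine the two helpers on later runs is a build-or-load pattern: reuse a saved index when one exists, otherwise persist the freshly built one. This is a minimal sketch, assuming the `PERSIST_DIR` layout created by `save_index()` above; on a fresh session you still need to run Sections 1-5 first so the same embedding model is configured before loading."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "\n",
        "# Build-or-load: prefer a previously saved index, otherwise persist the current one\n",
        "if os.path.exists(PERSIST_DIR):\n",
        "    saved = load_index()\n",
        "    if saved is not None:\n",
        "        index = saved\n",
        "else:\n",
        "    save_index()\n",
        "\n",
        "print(\"Index ready (loaded from disk or freshly persisted).\")"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },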
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jpPX3S-Z8MqT"
      },
      "source": [
        "## 11. Performance Monitoring\n",
        "\n",
        "Monitor system performance and query statistics."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import time\n",
        "from collections import defaultdict\n",
        "\n",
        "# Performance tracking\n",
        "query_stats = defaultdict(list)\n",
        "\n",
        "def enhanced_query(question, show_sources=True, show_timing=True):\n",
        "    \"\"\"Enhanced query function with performance monitoring.\"\"\"\n",
        "    start_time = time.time()\n",
        "\n",
        "    print(f\"QUESTION: {question}\")\n",
        "    print(\"=\" * 70)\n",
        "\n",
        "    try:\n",
        "        response = query_engine.query(question)\n",
        "        query_time = time.time() - start_time\n",
        "\n",
        "        # Track performance\n",
        "        query_stats['response_times'].append(query_time)\n",
        "        query_stats['questions'].append(question)\n",
        "\n",
        "        print(\"ANSWER:\")\n",
        "        print(response)\n",
        "\n",
        "        if show_timing:\n",
        "            print(f\"\\n⏱️ Response time: {query_time:.2f} seconds\")\n",
        "\n",
        "        if show_sources and hasattr(response, 'source_nodes'):\n",
        "            print(\"\\nSOURCES:\")\n",
        "            for i, node in enumerate(response.source_nodes, 1):\n",
        "                file_name = node.metadata.get('file_name', 'Unknown')\n",
        "                # node.score can be None, which would break the :.3f format below\n",
        "                score = node.score if node.score is not None else 0.0\n",
        "                print(f\"  {i}. {file_name} (relevance: {score:.3f})\")\n",
        "\n",
        "        print(\"=\" * 70)\n",
        "        return response\n",
        "\n",
        "    except Exception as e:\n",
        "        print(f\"❌ Error during query: {e}\")\n",
        "        return None\n",
        "\n",
        "def show_performance_stats():\n",
        "    \"\"\"Display performance statistics.\"\"\"\n",
        "    if not query_stats['response_times']:\n",
        "        print(\"No queries performed yet.\")\n",
        "        return\n",
        "\n",
        "    times = query_stats['response_times']\n",
        "    print(f\"📊 PERFORMANCE STATS (based on {len(times)} queries):\")\n",
        "    print(f\"  Average response time: {sum(times)/len(times):.2f}s\")\n",
        "    print(f\"  Fastest response: {min(times):.2f}s\")\n",
        "    print(f\"  Slowest response: {max(times):.2f}s\")\n",
        "\n",
        "print(\"✓ Enhanced query functions ready!\")\n",
        "print(\"Use enhanced_query('your question') for better monitoring.\")"
      ],
      "metadata": {
        "id": "74SToiDP8MqT"
      },
      "execution_count": null,
      "outputs": []
    },
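    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "As a usage sketch, the two helpers compose like this. The calls are commented out, following the notebook's convention, so a run-all doesn't trigger extra API requests; the questions are just examples."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Example usage of the monitoring helpers; uncomment to run\n",
        "# enhanced_query(\"What topics do these documents cover?\")\n",
        "# enhanced_query(\"Who is Elizabeth Ursic?\", show_sources=False)\n",
        "# show_performance_stats()  # Summarize timing across all queries so far"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },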
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "U7ilYtCi8MqU"
      },
      "source": [
        "## 12. Sample Queries and Best Practices\n",
        "\n",
        "Try these example queries and learn best practices for effective document search."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Sample queries demonstrating different types of questions\n",
        "sample_queries = [\n",
        "    \"What are the main themes in women's religious studies?\",\n",
        "    \"Who are the key scholars mentioned in these documents?\",\n",
        "    \"What methodologies are discussed for studying women in religion?\",\n",
        "    \"What historical periods are covered in the research?\",\n",
        "    \"What are the primary sources used in this research?\"\n",
        "]\n",
        "\n",
        "def run_sample_queries():\n",
        "    \"\"\"Run a selection of sample queries to demonstrate the system.\"\"\"\n",
        "    print(\"Running sample queries...\\n\")\n",
        "\n",
        "    for i, query in enumerate(sample_queries[:2], 1):  # Run first 2 by default\n",
        "        print(f\"\\n--- Sample Query {i} ---\")\n",
        "        enhanced_query(query, show_sources=False)\n",
        "        time.sleep(1)  # Brief pause between queries\n",
        "\n",
        "    print(\"\\n💡 TIP: Modify the queries above or create your own!\")\n",
        "\n",
        "def query_best_practices():\n",
        "    \"\"\"Display best practices for querying the system.\"\"\"\n",
        "    practices = [\n",
        "        \"🎯 Be specific: 'What does Smith say about feminist theology?' vs 'Tell me about feminism'\",\n",
        "        \"📚 Ask about concepts: 'What is religious authority?' rather than just names\",\n",
        "        \"🔍 Use comparative questions: 'How do different scholars approach this topic?'\",\n",
        "        \"📊 Request analysis: 'What are the main arguments presented?'\",\n",
        "        \"🏛️ Ask about methodology: 'What research methods are discussed?'\",\n",
        "        \"📖 Reference specific documents: 'What does the Johnson paper conclude?'\"\n",
        "    ]\n",
        "\n",
        "    print(\"📋 BEST PRACTICES FOR QUERIES:\")\n",
        "    for practice in practices:\n",
        "        print(f\"  {practice}\")\n",
        "\n",
        "# Uncomment to run sample queries:\n",
        "# run_sample_queries()\n",
        "\n",
        "query_best_practices()\n",
        "print(\"\\n✓ Ready for your questions! Use enhanced_query('your question here')\")"
      ],
      "metadata": {
        "id": "C4PwlICz8MqU"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}