women-in-religion-llamaindex.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "private_outputs": true,
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/CliffordAnderson/fa737d6962aa3589089060dd41f97699/women-in-religion-llamaindex.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QKH3WMzX8MqI"
      },
      "source": [
        "## 1. Installation of Required Libraries\n",
        "\n",
        "First, we'll install the core LlamaIndex libraries needed for document indexing and querying."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RGB4dyj0JAXQ"
      },
      "outputs": [],
      "source": [
        "# Install core LlamaIndex libraries for document processing and indexing\n",
        "%pip install -U llama-index llama-index-core\n",
        "\n",
        "# Install OpenRouter integration for accessing various LLMs through a unified API\n",
        "%pip install llama-index-llms-openrouter"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "b8f31f44"
      },
      "source": [
        "# Install embedding libraries for converting text to vector representations\n",
        "%pip install sentence-transformers  # Required for local embedding models\n",
        "%pip install llama-index-embeddings-huggingface  # HuggingFace embedding integration"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "LAs7uflK8MqM"
      },
      "source": [
        "## 2. Google Drive Setup\n",
        "\n",
        "Mount Google Drive to access the document collection stored in the \"Women in Religion AI\" folder."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import drive\n",
        "\n",
        "# Mount Google Drive to access files stored in the cloud\n",
        "drive.mount(\"/content/drive/\")\n",
        "\n",
        "# Navigate to the specific folder containing our document collection\n",
        "%cd \"/content/drive/My Drive/Women in Religion AI\""
      ],
      "metadata": {
        "id": "-8nwoGx8Vf34"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "\n",
        "# Get current working directory and list available files\n",
        "cwd = os.getcwd()\n",
        "files = os.listdir(cwd)\n",
        "\n",
        "print(\"Available files in the Women in Religion AI folder:\")\n",
        "for file in files:\n",
        "    print(f\"  - {file}\")"
      ],
      "metadata": {
        "id": "k_UAfBDIXdFb"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qwLvjrUj8MqN"
      },
      "source": [
        "## 3. API Configuration\n",
        "\n",
        "Set up the OpenRouter API key to access language models. OpenRouter provides a unified interface to multiple LLM providers."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "from google.colab import userdata\n",
        "\n",
        "# Retrieve the OpenRouter API key from Colab's secure storage\n",
        "# This key should be stored in Colab's \"Secrets\" section for security\n",
        "try:\n",
        "    api_key = userdata.get('OPEN_ROUTER_API')\n",
        "    if not api_key:\n",
        "        raise ValueError(\"API key is empty\")\n",
        "    os.environ['OPENROUTER_API_KEY'] = api_key\n",
        "    print(\"✓ OpenRouter API key configured successfully\")\n",
        "except Exception as e:\n",
        "    print(\"❌ Error: Could not retrieve OpenRouter API key\")\n",
        "    print(\"Please ensure you've added 'OPEN_ROUTER_API' to Colab's Secrets section\")\n",
        "    print(\"Go to the left sidebar → 🔑 Secrets → Add new secret\")\n",
        "    raise"
      ],
      "metadata": {
        "id": "CDsOgr2AJPTc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9ygGsY3y8MqO"
      },
      "source": [
        "## 4. Language Model Setup\n",
        "\n",
        "Configure the language model that will generate responses to our queries."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.llms.openrouter import OpenRouter\n",
        "\n",
        "# Initialize the language model with specific parameters\n",
        "try:\n",
        "    llm = OpenRouter(\n",
        "        api_key=os.environ['OPENROUTER_API_KEY'],\n",
        "        max_tokens=512,  # Cap on response length; raise for more detailed answers\n",
        "        context_window=4096,  # Context size LlamaIndex assumes when packing prompts (conservative; GPT-4o supports far more)\n",
        "        model=\"openai/gpt-4o\",  # Using GPT-4o via OpenRouter\n",
        "        temperature=0.1  # Low temperature for more consistent, factual responses\n",
        "    )\n",
        "\n",
        "    # Test the connection with a simple query\n",
        "    test_response = llm.complete(\"Hello\")\n",
        "    print(\"✓ Language model (GPT-4o) configured and tested successfully\")\n",
        "\n",
        "except Exception as e:\n",
        "    print(f\"❌ Error configuring language model: {e}\")\n",
        "    print(\"Check your API key and internet connection\")\n",
        "    raise"
      ],
      "metadata": {
        "id": "LUUtOFZH7gGT"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iYsrnwQg8MqP"
      },
      "source": [
        "## 5. Embedding Model Configuration\n",
        "\n",
        "Set up the embedding model that will convert text into vector representations for semantic search."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.embeddings.huggingface import HuggingFaceEmbedding\n",
        "from llama_index.core import Settings\n",
        "\n",
        "# Configure a lightweight, efficient embedding model from HuggingFace\n",
        "# BGE-small is optimized for English text and provides good performance\n",
        "Settings.embed_model = HuggingFaceEmbedding(\n",
        "    model_name=\"BAAI/bge-small-en-v1.5\"\n",
        ")\n",
        "\n",
        "print(\"✓ Embedding model (BGE-small-en-v1.5) configured successfully\")"
      ],
      "metadata": {
        "id": "gPS7EZdCRqV_"
      },
      "execution_count": null,
      "outputs": []
    },
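    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "As an optional sanity check, we can embed a short string directly. This is a minimal sketch using LlamaIndex's standard `get_text_embedding` method; the sample text is arbitrary. BGE-small-en-v1.5 produces 384-dimensional vectors."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Optional sanity check: embed an arbitrary sample string and inspect the vector\n",
        "sample_vector = Settings.embed_model.get_text_embedding(\"women in religion\")\n",
        "\n",
        "print(f\"Vector dimensions: {len(sample_vector)}\")  # 384 for BGE-small-en-v1.5\n",
        "print(f\"First five values: {sample_vector[:5]}\")"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },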
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ol1seAmB8MqQ"
      },
      "source": [
        "## 6. Document Loading\n",
        "\n",
        "Load all documents from the current directory for processing and indexing."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0371970e-c11c-4534-aa22-7cfdfe411bb3"
      },
      "outputs": [],
      "source": [
        "from llama_index.core import SimpleDirectoryReader\n",
        "\n",
        "# Load all documents from the current directory\n",
        "# SimpleDirectoryReader automatically handles various file formats\n",
        "try:\n",
        "    documents = SimpleDirectoryReader(\n",
        "        input_dir=os.getcwd(),\n",
        "        exclude_hidden=True,  # Skip hidden files\n",
        "        recursive=True  # Include subdirectories\n",
        "    ).load_data()\n",
        "\n",
        "    if not documents:\n",
        "        print(\"⚠️ Warning: No documents found in the current directory\")\n",
        "        print(\"Please ensure your documents are in the correct folder\")\n",
        "    else:\n",
        "        print(f\"✓ Successfully loaded {len(documents)} documents\")\n",
        "\n",
        "        # Display document statistics\n",
        "        file_types = {}\n",
        "        total_chars = 0\n",
        "\n",
        "        for doc in documents:\n",
        "            file_name = doc.metadata.get('file_name', 'unknown')\n",
        "            file_ext = os.path.splitext(file_name)[1].lower() or 'no extension'\n",
        "            file_types[file_ext] = file_types.get(file_ext, 0) + 1\n",
        "            total_chars += len(doc.text)\n",
        "\n",
        "        print(f\"File types: {dict(file_types)}\")\n",
        "        print(f\"Total content: ~{total_chars:,} characters\")\n",
        "\n",
        "except Exception as e:\n",
        "    print(f\"❌ Error loading documents: {e}\")\n",
        "    print(\"Check that you're in the correct directory and have readable files\")\n",
        "    raise"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "k0nDfeF48MqR"
      },
      "source": [
        "## 7. Vector Index Creation\n",
        "\n",
        "Create a vector store index from the loaded documents. This process:\n",
        "1. Splits documents into chunks\n",
        "2. Generates embeddings for each chunk\n",
        "3. Stores vectors for fast similarity search"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from llama_index.core import VectorStoreIndex\n",
        "from llama_index.core.node_parser import SentenceSplitter\n",
        "import time\n",
        "\n",
        "# Configure text splitting for better chunking\n",
        "text_splitter = SentenceSplitter(\n",
        "    chunk_size=512,  # Smaller chunks for better precision\n",
        "    chunk_overlap=50  # Overlap to preserve context\n",
        ")\n",
        "\n",
        "# Create a vector store index from the loaded documents\n",
        "print(\"Creating vector index... This may take a few minutes.\")\n",
        "start_time = time.time()\n",
        "\n",
        "try:\n",
        "    index = VectorStoreIndex.from_documents(\n",
        "        documents,\n",
        "        transformations=[text_splitter],  # Use custom text splitter\n",
        "        show_progress=True  # Show progress during indexing\n",
        "    )\n",
        "\n",
        "    elapsed_time = time.time() - start_time\n",
        "    print(f\"✓ Vector index created successfully in {elapsed_time:.1f} seconds\")\n",
        "    print(\"The system is now ready to answer questions about the documents!\")\n",
        "\n",
        "except Exception as e:\n",
        "    print(f\"❌ Error creating vector index: {e}\")\n",
        "    print(\"This might be due to memory limitations or embedding model issues\")\n",
        "    raise"
      ],
      "metadata": {
        "id": "v_k6Cf4xSKxu"
      },
      "execution_count": null,
      "outputs": []
    },
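    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "To make step 1 above concrete, we can preview how the splitter chunks a single document. This is a minimal sketch using the node parser's standard `get_nodes_from_documents` method; it assumes the loading cell in Section 6 found at least one document."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Optional: preview how the SentenceSplitter chunks the first loaded document\n",
        "if documents:\n",
        "    preview_nodes = text_splitter.get_nodes_from_documents(documents[:1])\n",
        "    print(f\"First document was split into {len(preview_nodes)} chunks\")\n",
        "    if preview_nodes:\n",
        "        print(f\"First chunk ({len(preview_nodes[0].text)} characters):\")\n",
        "        print(preview_nodes[0].text[:200])"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },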
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pbFG5UQX8MqR"
      },
      "source": [
        "## 8. Query Engine Setup and Testing\n",
        "\n",
        "Create a query engine that combines vector search with language generation to answer questions about the documents."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Create a query engine that combines retrieval and generation\n",
        "query_engine = index.as_query_engine(\n",
        "    llm=llm,  # Use our configured language model\n",
        "    similarity_top_k=5,  # Retrieve top 5 most relevant document chunks\n",
        "    response_mode=\"tree_summarize\",  # Better synthesis of multiple sources\n",
        "    verbose=True  # Show retrieval details\n",
        ")\n",
        "\n",
        "# Test the system with a sample query\n",
        "print(\"Testing the system with a sample query...\\n\")\n",
        "\n",
        "try:\n",
        "    response = query_engine.query(\"Who is Elizabeth Ursic?\")\n",
        "\n",
        "    print(\"RESPONSE:\")\n",
        "    print(\"=\" * 50)\n",
        "    print(response)\n",
        "    print(\"=\" * 50)\n",
        "\n",
        "    # Check response quality\n",
        "    if len(str(response).strip()) < 20:\n",
        "        print(\"⚠️ Warning: Response seems very short. This might indicate:\")\n",
        "        print(\"  - No relevant information found in documents\")\n",
        "        print(\"  - API issues or configuration problems\")\n",
        "\n",
        "except Exception as e:\n",
        "    print(f\"❌ Error during query: {e}\")\n",
        "    print(\"Check your API connection and try again\")\n",
        "    raise"
      ],
      "metadata": {
        "id": "ihPo2qMZKfqy"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "3QpXtnLW8MqS"
      },
      "source": [
        "## 9. Source Document Inspection\n",
        "\n",
        "Examine which specific document sections were used to generate the response."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Display the source nodes (document chunks) that were used to generate the response\n",
        "print(\"SOURCE DOCUMENTS USED:\")\n",
        "print(\"=\" * 50)\n",
        "\n",
        "for i, source_node in enumerate(response.source_nodes, 1):\n",
        "    # Some response modes return nodes without a similarity score, so guard against None\n",
        "    score = source_node.score if source_node.score is not None else 0.0\n",
        "    print(f\"\\nSource {i}:\")\n",
        "    print(f\"File: {source_node.metadata.get('file_name', 'Unknown')}\")\n",
        "    print(f\"Similarity Score: {score:.3f}\")\n",
        "    print(f\"Content Preview: {source_node.text[:200]}...\")\n",
        "    print(\"-\" * 30)"
      ],
      "metadata": {
        "id": "V0ZT7mVfairV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "r6Iqf7tj8MqT"
      },
      "source": [
        "## 10. Index Persistence (Optional)\n",
        "\n",
        "Save the vector index to avoid rebuilding it every time you run the notebook."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Optional: Save the index to disk for faster loading next time\n",
        "PERSIST_DIR = \"./storage\"\n",
        "\n",
        "def save_index():\n",
        "    \"\"\"Save the current index to disk.\"\"\"\n",
        "    try:\n",
        "        index.storage_context.persist(persist_dir=PERSIST_DIR)\n",
        "        print(f\"✓ Index saved to {PERSIST_DIR}\")\n",
        "    except Exception as e:\n",
        "        print(f\"❌ Error saving index: {e}\")\n",
        "\n",
        "def load_index():\n",
        "    \"\"\"Load a previously saved index from disk and return it.\"\"\"\n",
        "    try:\n",
        "        from llama_index.core import StorageContext, load_index_from_storage\n",
        "        storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)\n",
        "        loaded_index = load_index_from_storage(storage_context)\n",
        "        print(f\"✓ Index loaded from {PERSIST_DIR}\")\n",
        "        return loaded_index\n",
        "    except Exception as e:\n",
        "        print(f\"❌ Could not load saved index: {e}\")\n",
        "        return None\n",
        "\n",
        "# Uncomment the line below to save the current index\n",
        "# save_index()\n",
        "\n",
        "print(\"Index persistence functions defined.\")\n",
        "print(\"Use save_index() to save and index = load_index() to reload.\")"
      ],
      "metadata": {
        "id": "RYcEqobn8MqT"
      },
      "execution_count": null,
      "outputs": []
    },
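    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "One way to combine the two helpers on later runs is a build-or-load pattern: reuse a saved index when one exists, otherwise persist the freshly built one. This is a minimal sketch, assuming the `PERSIST_DIR` layout created by `save_index()` above; on a fresh session you still need to run Sections 1-5 first so the same embedding model is configured before loading."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "\n",
        "# Build-or-load: prefer a previously saved index, otherwise persist the current one\n",
        "if os.path.exists(PERSIST_DIR):\n",
        "    saved = load_index()\n",
        "    if saved is not None:\n",
        "        index = saved\n",
        "else:\n",
        "    save_index()\n",
        "\n",
        "print(\"Index ready (loaded from disk or freshly persisted).\")"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },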
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jpPX3S-Z8MqT"
      },
      "source": [
        "## 11. Performance Monitoring\n",
        "\n",
        "Monitor system performance and query statistics."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import time\n",
        "from collections import defaultdict\n",
        "\n",
        "# Performance tracking\n",
        "query_stats = defaultdict(list)\n",
        "\n",
        "def enhanced_query(question, show_sources=True, show_timing=True):\n",
        "    \"\"\"Enhanced query function with performance monitoring.\"\"\"\n",
        "    start_time = time.time()\n",
        "\n",
        "    print(f\"QUESTION: {question}\")\n",
        "    print(\"=\" * 70)\n",
        "\n",
        "    try:\n",
        "        response = query_engine.query(question)\n",
        "        query_time = time.time() - start_time\n",
        "\n",
        "        # Track performance\n",
        "        query_stats['response_times'].append(query_time)\n",
        "        query_stats['questions'].append(question)\n",
        "\n",
        "        print(\"ANSWER:\")\n",
        "        print(response)\n",
        "\n",
        "        if show_timing:\n",
        "            print(f\"\\n⏱️ Response time: {query_time:.2f} seconds\")\n",
        "\n",
        "        if show_sources and hasattr(response, 'source_nodes'):\n",
        "            print(\"\\nSOURCES:\")\n",
        "            for i, node in enumerate(response.source_nodes, 1):\n",
        "                file_name = node.metadata.get('file_name', 'Unknown')\n",
        "                # node.score can be None, which would break the :.3f format below\n",
        "                score = node.score if node.score is not None else 0.0\n",
        "                print(f\"  {i}. {file_name} (relevance: {score:.3f})\")\n",
        "\n",
        "        print(\"=\" * 70)\n",
        "        return response\n",
        "\n",
        "    except Exception as e:\n",
        "        print(f\"❌ Error during query: {e}\")\n",
        "        return None\n",
        "\n",
        "def show_performance_stats():\n",
        "    \"\"\"Display performance statistics.\"\"\"\n",
        "    if not query_stats['response_times']:\n",
        "        print(\"No queries performed yet.\")\n",
        "        return\n",
        "\n",
        "    times = query_stats['response_times']\n",
        "    print(f\"📊 PERFORMANCE STATS (based on {len(times)} queries):\")\n",
        "    print(f\"  Average response time: {sum(times)/len(times):.2f}s\")\n",
        "    print(f\"  Fastest response: {min(times):.2f}s\")\n",
        "    print(f\"  Slowest response: {max(times):.2f}s\")\n",
        "\n",
        "print(\"✓ Enhanced query functions ready!\")\n",
        "print(\"Use enhanced_query('your question') for better monitoring.\")"
      ],
      "metadata": {
        "id": "74SToiDP8MqT"
      },
      "execution_count": null,
      "outputs": []
    },
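    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "As a usage sketch, the two helpers compose like this. The calls are commented out, following the notebook's convention, so a run-all doesn't trigger extra API requests; the questions are just examples."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Example usage of the monitoring helpers; uncomment to run\n",
        "# enhanced_query(\"What topics do these documents cover?\")\n",
        "# enhanced_query(\"Who is Elizabeth Ursic?\", show_sources=False)\n",
        "# show_performance_stats()  # Summarize timing across all queries so far"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },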
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "U7ilYtCi8MqU"
      },
      "source": [
        "## 12. Sample Queries and Best Practices\n",
        "\n",
        "Try these example queries and learn best practices for effective document search."
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Sample queries demonstrating different types of questions\n",
        "sample_queries = [\n",
        "    \"What are the main themes in women's religious studies?\",\n",
        "    \"Who are the key scholars mentioned in these documents?\",\n",
        "    \"What methodologies are discussed for studying women in religion?\",\n",
        "    \"What historical periods are covered in the research?\",\n",
        "    \"What are the primary sources used in this research?\"\n",
        "]\n",
        "\n",
        "def run_sample_queries():\n",
        "    \"\"\"Run a selection of sample queries to demonstrate the system.\"\"\"\n",
        "    print(\"Running sample queries...\\n\")\n",
        "\n",
        "    for i, query in enumerate(sample_queries[:2], 1):  # Run first 2 by default\n",
        "        print(f\"\\n--- Sample Query {i} ---\")\n",
        "        enhanced_query(query, show_sources=False)\n",
        "        time.sleep(1)  # Brief pause between queries\n",
        "\n",
        "    print(\"\\n💡 TIP: Modify the queries above or create your own!\")\n",
        "\n",
        "def query_best_practices():\n",
        "    \"\"\"Display best practices for querying the system.\"\"\"\n",
        "    practices = [\n",
        "        \"🎯 Be specific: 'What does Smith say about feminist theology?' vs 'Tell me about feminism'\",\n",
        "        \"📚 Ask about concepts: 'What is religious authority?' rather than just names\",\n",
        "        \"🔍 Use comparative questions: 'How do different scholars approach this topic?'\",\n",
        "        \"📊 Request analysis: 'What are the main arguments presented?'\",\n",
        "        \"🏛️ Ask about methodology: 'What research methods are discussed?'\",\n",
        "        \"📖 Reference specific documents: 'What does the Johnson paper conclude?'\"\n",
        "    ]\n",
        "\n",
        "    print(\"📋 BEST PRACTICES FOR QUERIES:\")\n",
        "    for practice in practices:\n",
        "        print(f\"  {practice}\")\n",
        "\n",
        "# Uncomment to run sample queries:\n",
        "# run_sample_queries()\n",
        "\n",
        "query_best_practices()\n",
        "print(\"\\n✓ Ready for your questions! Use enhanced_query('your question here')\")"
      ],
      "metadata": {
        "id": "C4PwlICz8MqU"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}