bm25search-database

Looking at your schema, I can see the actual structure is:

knowledgebase_entries (tenant_id, nucleus_id) → kb_content → kb_blocks (content)

Here are the only changes needed for your Prisma schema to support full-text search:

Prisma Schema Changes

model kb_blocks {
  id             String     @id @db.VarChar(255)
  blockNumber    Int        @default(autoincrement())
  kb_content_id  String
  block_type     String     @default("text")
  content        String?
  content_search Unsupported("tsvector")? // Add this line; the underlying column type is tsvector
  sequence       Int
  word_count     Int?
  char_count     Int?
  created_at     DateTime   @default(now()) @db.Timestamptz(6)
  updated_at     DateTime   @default(now()) @db.Timestamptz(6)
  prev_block_id  String?
  next_block_id  String?
  images         images[]
  kb_content     kb_content @relation(fields: [kb_content_id], references: [id], onDelete: Cascade)

  @@index([block_type])
  @@index([kb_content_id, sequence])
  @@index([content_search], map: "idx_kb_blocks_content_search", type: Gin)  // Add this index (GIN, matching the SQL migration below)
}

SQL Migration to Run

Create this as a custom migration file:

-- Add tsvector column
ALTER TABLE kb_blocks 
ADD COLUMN IF NOT EXISTS content_search tsvector;

-- Create GIN index for fast full-text searching
CREATE INDEX IF NOT EXISTS idx_kb_blocks_content_search 
ON kb_blocks USING GIN(content_search);

-- Function to update search vector
CREATE OR REPLACE FUNCTION update_kb_content_search()
RETURNS TRIGGER AS $$
BEGIN
    IF NEW.content IS NOT NULL THEN
        NEW.content_search = to_tsvector('english', NEW.content);
    ELSE
        NEW.content_search = NULL;
    END IF;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Trigger to auto-update search vector on insert/update
DROP TRIGGER IF EXISTS trigger_update_kb_content_search ON kb_blocks;
CREATE TRIGGER trigger_update_kb_content_search
    BEFORE INSERT OR UPDATE ON kb_blocks
    FOR EACH ROW EXECUTE FUNCTION update_kb_content_search();

-- Update existing records
UPDATE kb_blocks 
SET content_search = to_tsvector('english', content)
WHERE content IS NOT NULL AND content_search IS NULL;

TypeScript Search Service

// services/knowledgeBaseSearch.ts
import { Prisma, PrismaClient } from '@prisma/client'; // Prisma is needed for Prisma.raw / Prisma.empty below

export interface SearchResult {
  blockId: string;
  content: string;
  rank: number;
  kbContentId: string;
  knowledgebaseId: string;
  knowledgebaseName: string;
}

export interface SearchFilters {
  tenantId: string;
  nucleusId: string;
  knowledgebaseIds?: string[]; // Optional additional filtering
  contentTypes?: string[]; // Filter by content type
  tags?: string[]; // Filter by tags
}

export class KnowledgeBaseSearchService {
  constructor(private prisma: PrismaClient) {}

  async search(
    query: string,
    filters: SearchFilters,
    options: { limit?: number; offset?: number } = {}
  ): Promise<SearchResult[]> {
    const { limit = 10, offset = 0 } = options;

    // Build dynamic WHERE conditions for additional filters.
    // NOTE: these fragments interpolate values straight into SQL, so validate/escape the
    // incoming IDs, types, and tags (or see the Prisma.sql/Prisma.join sketch after this class).
    const additionalConditions: string[] = [];
    
    if (filters.knowledgebaseIds?.length) {
      additionalConditions.push(`AND ke.id = ANY(ARRAY[${filters.knowledgebaseIds.map(id => `'${id}'`).join(',')}])`);
    }
    
    if (filters.contentTypes?.length) {
      additionalConditions.push(`AND ke."contentType" = ANY(ARRAY[${filters.contentTypes.map(type => `'${type}'`).join(',')}])`);
    }

    if (filters.tags?.length) {
      additionalConditions.push(`
        AND EXISTS (
          SELECT 1 FROM tags_on_knowledgebase_entries tke 
          JOIN tags t ON tke."tagId" = t.id 
          WHERE tke."knowledgebaseEntryId" = ke.id 
          AND t.name = ANY(ARRAY[${filters.tags.map(tag => `'${tag}'`).join(',')}])
        )
      `);
    }

    const results = await this.prisma.$queryRaw<SearchResult[]>`
      SELECT 
        kb.id as "blockId",
        kb.content,
        ts_rank(kb.content_search, plainto_tsquery('english', ${query})) as rank,
        kc.id as "kbContentId",
        ke.id as "knowledgebaseId",
        ke.knowledgebase_name as "knowledgebaseName"
      FROM kb_blocks kb
      JOIN kb_content kc ON kb.kb_content_id = kc.id
      JOIN knowledgebase_entries ke ON kc.knowledgebase_id = ke.id
      WHERE 
        ke.tenant_id = ${filters.tenantId}
        AND ke.nucleus_id = ${filters.nucleusId}
        AND kb.content_search @@ plainto_tsquery('english', ${query})
        AND kb.content IS NOT NULL
        ${additionalConditions.length > 0 ? 
          Prisma.raw(additionalConditions.join(' ')) : 
          Prisma.empty
        }
      ORDER BY rank DESC
      LIMIT ${limit}
      OFFSET ${offset}
    `;

    return results;
  }

  // Get available knowledge bases for filtering
  async getKnowledgeBases(tenantId: string, nucleusId: string) {
    return await this.prisma.knowledgebase_entries.findMany({
      where: {
        tenant_id: tenantId,
        nucleus_id: nucleusId,
        status: 'published', // or whatever status indicates active
      },
      select: {
        id: true,
        knowledgebase_name: true,
        contentType: true,
        _count: {
          select: {
            kb_content: {
              where: {
                kb_blocks: {
                  some: {
                    content: {
                      not: null
                    }
                  }
                }
              }
            }
          }
        }
      }
    });
  }

  // Get available tags for filtering
  async getTags(tenantId: string, nucleusId: string) {
    return await this.prisma.tags.findMany({
      where: {
        nucleus_id: nucleusId,
        nucleuses: {
          tenant_id: tenantId
        }
      },
      select: {
        id: true,
        name: true,
        priority: true
      },
      orderBy: {
        priority: 'desc'
      }
    });
  }
}
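
If you want to avoid hand-quoting the filter values, the same fragments can be built with Prisma.sql and Prisma.join so every value stays a bound parameter. A minimal sketch, assuming the SearchFilters interface above (the file and helper names are illustrative):

// services/knowledgeBaseSearchFilters.ts (illustrative helper, not part of the service above)
import { Prisma } from '@prisma/client';
import type { SearchFilters } from './knowledgeBaseSearch';

// Builds the optional filter fragments with bound parameters instead of manual string quoting.
export function buildAdditionalConditions(filters: SearchFilters): Prisma.Sql {
  const conditions: Prisma.Sql[] = [];

  if (filters.knowledgebaseIds?.length) {
    conditions.push(Prisma.sql`AND ke.id IN (${Prisma.join(filters.knowledgebaseIds)})`);
  }

  if (filters.contentTypes?.length) {
    conditions.push(Prisma.sql`AND ke."contentType" IN (${Prisma.join(filters.contentTypes)})`);
  }

  if (filters.tags?.length) {
    conditions.push(Prisma.sql`
      AND EXISTS (
        SELECT 1 FROM tags_on_knowledgebase_entries tke
        JOIN tags t ON tke."tagId" = t.id
        WHERE tke."knowledgebaseEntryId" = ke.id
        AND t.name IN (${Prisma.join(filters.tags)})
      )
    `);
  }

  // Prisma.join stitches the fragments together; Prisma.empty keeps the template valid when there are none.
  return conditions.length ? Prisma.join(conditions, ' ') : Prisma.empty;
}

The returned Prisma.Sql fragment can then be interpolated into the $queryRaw template above in place of the Prisma.raw(additionalConditions.join(' ')) expression.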

Usage Example

// Example usage
import { PrismaClient } from '@prisma/client';
import { KnowledgeBaseSearchService } from './services/knowledgeBaseSearch';

const prisma = new PrismaClient();
const searchService = new KnowledgeBaseSearchService(prisma);

async function searchExample() {
  const results = await searchService.search(
    'machine learning algorithms',
    {
      tenantId: 'tenant123',
      nucleusId: 'nucleus456',
      contentTypes: ['text', 'document'], // Optional
      tags: ['AI', 'Technology'] // Optional
    },
    { limit: 20, offset: 0 }
  );

  console.log('Search results:', results);
}

// Get available filters
async function getFilters() {
  const knowledgeBases = await searchService.getKnowledgeBases('tenant123', 'nucleus456');
  const tags = await searchService.getTags('tenant123', 'nucleus456');
  
  return { knowledgeBases, tags };
}

That's it! These minimal changes will give you:

  • Full-text search with PostgreSQL's native capabilities
  • Proper filtering by tenant_id and nucleus_id
  • Additional filters for knowledge bases, content types, and tags
  • Automatic search index updates via database triggers
  • Type-safe Prisma integration

The search will be fast and scalable, leveraging PostgreSQL's optimized full-text search engine while working seamlessly with your existing Prisma setup.

Absolutely! Here's how to run custom SQL migrations with Prisma:

Method 1: Create Empty Migration + Add SQL (Recommended)

Step 1: Create an empty migration

npx prisma migrate dev --create-only --name add_fulltext_search

This creates a new migration file in prisma/migrations/ with a timestamp and name like:

prisma/migrations/20250903123456_add_fulltext_search/migration.sql

Step 2: Edit the generated migration file

Open the generated migration.sql file and replace its contents with:

-- Add tsvector column
ALTER TABLE kb_blocks 
ADD COLUMN IF NOT EXISTS content_search tsvector;

-- Create GIN index for fast full-text searching
CREATE INDEX IF NOT EXISTS idx_kb_blocks_content_search 
ON kb_blocks USING GIN(content_search);

-- Function to update search vector
CREATE OR REPLACE FUNCTION update_kb_content_search()
RETURNS TRIGGER AS $$
BEGIN
    IF NEW.content IS NOT NULL THEN
        NEW.content_search = to_tsvector('english', NEW.content);
    ELSE
        NEW.content_search = NULL;
    END IF;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- Trigger to auto-update search vector on insert/update
DROP TRIGGER IF EXISTS trigger_update_kb_content_search ON kb_blocks;
CREATE TRIGGER trigger_update_kb_content_search
    BEFORE INSERT OR UPDATE ON kb_blocks
    FOR EACH ROW EXECUTE FUNCTION update_kb_content_search();

-- Update existing records
UPDATE kb_blocks 
SET content_search = to_tsvector('english', content)
WHERE content IS NOT NULL AND content_search IS NULL;

Step 3: Apply the migration

npx prisma migrate dev

Step 4: Update your Prisma schema

Add the content_search field to your kb_blocks model as shown in my previous response (or run npx prisma db pull to introspect the new column into your schema), then regenerate the client:

npx prisma generate

Method 2: Direct SQL Execution (Alternative)

If you prefer to run SQL directly first, then sync the schema:

Step 1: Run SQL directly

# Connect to your database and run the SQL
psql $DATABASE_URL -f fulltext_search.sql

# Or using Prisma's db execute
npx prisma db execute --file fulltext_search.sql --schema prisma/schema.prisma

Step 2: Pull changes into schema

npx prisma db pull

Step 3: Create migration from the changes

npx prisma migrate dev --name add_fulltext_search

Method 3: Using prisma db execute (For one-time scripts)

Create a file scripts/setup-fulltext.sql with the SQL content, then:

npx prisma db execute --file scripts/setup-fulltext.sql

Complete Workflow (Recommended Approach)

Here's the complete step-by-step process I recommend:

# 1. Create empty migration
npx prisma migrate dev --create-only --name add_fulltext_search

# 2. Edit the generated migration file (add the SQL content)
# 3. Update your schema.prisma (add content_search field and index)

# 4. Apply migration
npx prisma migrate dev

# 5. Generate Prisma client with new schema
npx prisma generate

For Production Deployment

When deploying to production:

# Apply pending migrations
npx prisma migrate deploy

Verification

After running the migration, verify it worked:

-- Check if column exists
SELECT column_name, data_type 
FROM information_schema.columns 
WHERE table_name = 'kb_blocks' AND column_name = 'content_search';

-- Check if index exists
SELECT indexname FROM pg_indexes 
WHERE tablename = 'kb_blocks' AND indexname = 'idx_kb_blocks_content_search';

-- Check if trigger exists
SELECT trigger_name FROM information_schema.triggers 
WHERE event_object_table = 'kb_blocks';

-- Test the search functionality
SELECT content, content_search 
FROM kb_blocks 
WHERE content_search @@ plainto_tsquery('english', 'test') 
LIMIT 5;
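
If you want to run the same checks from code (for example as a post-deploy smoke test), here's a minimal sketch using Prisma raw queries against the system catalogs (the script path is illustrative):

// scripts/verify-fulltext-setup.ts (illustrative post-migration check)
import { PrismaClient } from '@prisma/client';

const prisma = new PrismaClient();

async function verifyFulltextSetup() {
  // Column, index, and trigger checks mirror the SQL verification queries above.
  const [column] = await prisma.$queryRaw<{ data_type: string }[]>`
    SELECT data_type FROM information_schema.columns
    WHERE table_name = 'kb_blocks' AND column_name = 'content_search'
  `;
  const [index] = await prisma.$queryRaw<{ indexname: string }[]>`
    SELECT indexname FROM pg_indexes
    WHERE tablename = 'kb_blocks' AND indexname = 'idx_kb_blocks_content_search'
  `;
  const [trigger] = await prisma.$queryRaw<{ trigger_name: string }[]>`
    SELECT trigger_name FROM information_schema.triggers
    WHERE event_object_table = 'kb_blocks'
      AND trigger_name = 'trigger_update_kb_content_search'
  `;

  console.log({
    column: column?.data_type ?? 'missing',   // expect 'tsvector'
    index: index?.indexname ?? 'missing',
    trigger: trigger?.trigger_name ?? 'missing',
  });
}

verifyFulltextSetup().finally(() => prisma.$disconnect());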

Troubleshooting Tips

  1. If migration fails: Check PostgreSQL logs for specific errors

  2. If schema drift occurs: Run npx prisma db pull to sync

  3. If you need to roll back: Prisma has no automatic down migrations; in development you can reset the database instead (this drops all data and re-applies every migration):

    npx prisma migrate reset
  4. For existing data: The migration includes updating existing records, but for large datasets you might want to do this in batches (run the SQL below repeatedly, or use the application-side sketch after this list):

    -- Update in batches of 1000; repeat until no rows are updated
    UPDATE kb_blocks 
    SET content_search = to_tsvector('english', content)
    WHERE content IS NOT NULL 
    AND content_search IS NULL 
    AND id IN (
      SELECT id FROM kb_blocks 
      WHERE content_search IS NULL 
      LIMIT 1000
    );
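
If you'd rather drive the batching from application code so it keeps running until every row is backfilled, here's a minimal sketch using $executeRaw, which returns the number of affected rows (the script path is illustrative):

// scripts/backfill-content-search.ts (illustrative one-off script)
import { PrismaClient } from '@prisma/client';

const prisma = new PrismaClient();

async function backfillContentSearch(batchSize = 1000) {
  // Update one batch at a time until no rows are left without a search vector.
  let updated: number;
  do {
    updated = await prisma.$executeRaw`
      UPDATE kb_blocks
      SET content_search = to_tsvector('english', content)
      WHERE id IN (
        SELECT id FROM kb_blocks
        WHERE content IS NOT NULL AND content_search IS NULL
        LIMIT ${batchSize}
      )
    `;
    console.log(`Backfilled ${updated} rows`);
  } while (updated > 0);
}

backfillContentSearch().finally(() => prisma.$disconnect());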

The Method 1 approach is recommended because it keeps everything in Prisma's migration history, making it easier to track changes and deploy consistently across environments.

Yes, absolutely! You can add similarity/rank thresholds with PostgreSQL's full-text search. Here are several approaches:

1. Simple Rank Threshold

Add a minimum-rank condition to the WHERE clause (a HAVING clause would fail here without a GROUP BY):

SELECT 
    kb.id as "blockId",
    kb.content,
    ts_rank(kb.content_search, plainto_tsquery('english', ${query})) as rank,
    kc.id as "kbContentId",
    ke.id as "knowledgebaseId",
    ke.knowledgebase_name as "knowledgebaseName"
FROM kb_blocks kb
JOIN kb_content kc ON kb.kb_content_id = kc.id
JOIN knowledgebase_entries ke ON kc.knowledgebase_id = ke.id
WHERE 
    ke.tenant_id = ${filters.tenantId}
    AND ke.nucleus_id = ${filters.nucleusId}
    AND kb.content_search @@ plainto_tsquery('english', ${query})
    AND kb.content IS NOT NULL
    ${additionalConditions.length > 0 ? 
      Prisma.raw(additionalConditions.join(' ')) : 
      Prisma.empty
    }
    AND ts_rank(kb.content_search, plainto_tsquery('english', ${query})) > 0.1
ORDER BY rank DESC
LIMIT ${limit}
OFFSET ${offset}

2. Normalized Rank with Threshold

Use ts_rank_cd with normalization for better threshold control:

SELECT 
    kb.id as "blockId",
    kb.content,
    ts_rank_cd(kb.content_search, plainto_tsquery('english', ${query}), 1) as rank,
    -- ... other fields
FROM kb_blocks kb
-- ... joins
WHERE 
    -- ... existing conditions
    AND ts_rank_cd(kb.content_search, plainto_tsquery('english', ${query}), 1) > 0.05
ORDER BY rank DESC
LIMIT ${limit}
OFFSET ${offset}

3. Length-Normalized Scoring

For more BM25-like behavior, normalize the score by document length:

SELECT 
    kb.id as "blockId",
    kb.content,
    ts_rank_cd(
        kb.content_search, 
        plainto_tsquery('english', ${query}),
        2  -- normalization flag for document length
    ) as rank,
    -- ... other fields
FROM kb_blocks kb
-- ... joins  
WHERE 
    -- ... existing conditions
    AND ts_rank_cd(kb.content_search, plainto_tsquery('english', ${query}), 2) > ${minSimilarity}
ORDER BY rank DESC
LIMIT ${limit}
OFFSET ${offset}

4. Dynamic Threshold Based on Query

You can make the threshold dynamic:

WITH ranked_results AS (
    SELECT 
        kb.id as "blockId",
        kb.content,
        ts_rank_cd(kb.content_search, plainto_tsquery('english', ${query}), 1) as rank,
        kc.id as "kbContentId",
        ke.id as "knowledgebaseId",
        ke.knowledgebase_name as "knowledgebaseName"
    FROM kb_blocks kb
    JOIN kb_content kc ON kb.kb_content_id = kc.id
    JOIN knowledgebase_entries ke ON kc.knowledgebase_id = ke.id
    WHERE 
        ke.tenant_id = ${filters.tenantId}
        AND ke.nucleus_id = ${filters.nucleusId}
        AND kb.content_search @@ plainto_tsquery('english', ${query})
        AND kb.content IS NOT NULL
)
SELECT * FROM ranked_results 
WHERE rank > (SELECT AVG(rank) * 0.5 FROM ranked_results)  -- 50% of average
ORDER BY rank DESC
LIMIT ${limit}
OFFSET ${offset}

5. Recommended Approach for Your Use Case

SELECT 
    kb.id as "blockId",
    kb.content,
    ts_rank_cd(
        kb.content_search, 
        plainto_tsquery('english', ${query}),
        1 | 4  -- 1: divide by 1 + log(document length); 4: divide by mean harmonic distance between matches
    ) as rank,
    kc.id as "kbContentId",
    ke.id as "knowledgebaseId",
    ke.knowledgebase_name as "knowledgebaseName"
FROM kb_blocks kb
JOIN kb_content kc ON kb.kb_content_id = kc.id
JOIN knowledgebase_entries ke ON kc.knowledgebase_id = ke.id
WHERE 
    ke.tenant_id = ${filters.tenantId}
    AND ke.nucleus_id = ${filters.nucleusId}
    AND kb.content_search @@ plainto_tsquery('english', ${query})
    AND kb.content IS NOT NULL
    AND ts_rank_cd(kb.content_search, plainto_tsquery('english', ${query}), 1 | 4) > ${minSimilarity || 0.01}
    ${additionalConditions.length > 0 ? 
      Prisma.raw(additionalConditions.join(' ')) : 
      Prisma.empty
    }
ORDER BY rank DESC
LIMIT ${limit}
OFFSET ${offset}

Threshold Guidelines:

  • 0.001-0.01: Very loose matching
  • 0.01-0.05: Moderate relevance
  • 0.05-0.1: Good relevance
  • 0.1+: High relevance

The exact threshold depends on your content and requirements. I'd recommend starting with 0.01 and adjusting based on the quality of your results; the sketch below shows one way to inspect the rank distribution for a sample query. 🎯
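
One way to pick that threshold empirically is to dump the rank distribution for a representative query and see where relevant results taper off. A minimal sketch against the kb_blocks table above (the script path is illustrative):

// scripts/inspect-rank-distribution.ts (illustrative tuning aid)
import { PrismaClient } from '@prisma/client';

const prisma = new PrismaClient();

// Prints max/median/min ranks for a sample query so you can choose a sensible cut-off.
async function inspectRanks(query: string) {
  const rows = await prisma.$queryRaw<{ rank: number }[]>`
    SELECT ts_rank_cd(content_search, plainto_tsquery('english', ${query}), 1 | 4) AS rank
    FROM kb_blocks
    WHERE content_search @@ plainto_tsquery('english', ${query})
    ORDER BY rank DESC
  `;
  const ranks = rows.map(r => Number(r.rank));
  console.log({
    results: ranks.length,
    max: ranks[0],
    median: ranks[Math.floor(ranks.length / 2)],
    min: ranks[ranks.length - 1],
  });
}

inspectRanks('machine learning algorithms').finally(() => prisma.$disconnect());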
