Skip to content

Instantly share code, notes, and snippets.

@liling
Last active May 5, 2026 15:42
Show Gist options
  • Select an option

  • Save liling/21b3cbe7ee51182f5e0aa18fa8fbc6d0 to your computer and use it in GitHub Desktop.

Select an option

Save liling/21b3cbe7ee51182f5e0aa18fa8fbc6d0 to your computer and use it in GitHub Desktop.
Switch HindSight 0.5.6 text search from native (tsvector) to vchord (bm25)
-- Switch text search from native (tsvector) to vchord (bm25)
--
-- Prerequisites:
-- 1. Add to postgresql.conf:
-- shared_preload_libraries = 'vchord,vchord_bm25,pg_tokenizer'
-- 2. Restart PostgreSQL
-- 3. Run this script: psql -d hindsight_dev -f switch_native_to_vchord.sql
--
-- This script is idempotent — safe to run multiple times.
\echo 'Switching text search: native → vchord'
-- Step 1: Install extensions
-- CASCADE also installs any extensions these declare as dependencies.
-- NOTE(review): these CREATEs presume shared_preload_libraries already lists
-- vchord, vchord_bm25 and pg_tokenizer (prerequisite 1 above) and that the
-- server was restarted — confirm before running, preload-required extensions
-- fail to create otherwise.
CREATE EXTENSION IF NOT EXISTS vchord CASCADE;
CREATE EXTENSION IF NOT EXISTS pg_tokenizer CASCADE;
CREATE EXTENSION IF NOT EXISTS vchord_bm25 CASCADE;
-- Step 2: Register the tokenizer that the INSERT/backfill queries rely on.
-- NOTE: the second argument is TOML — values must be quoted: key = "value".
-- Guarded by a catalog lookup so re-running the script is a no-op here.
DO $$
BEGIN
    PERFORM 1
    FROM tokenizer_catalog.tokenizer
    WHERE name = 'llmlingua2';

    IF NOT FOUND THEN
        PERFORM tokenizer_catalog.create_tokenizer('llmlingua2', 'model = "llmlingua2"');
        RAISE NOTICE 'Created llmlingua2 tokenizer';
    END IF;
END $$;
-- Step 3: Set search_path so bare tokenize() / to_bm25query() calls resolve correctly.
-- Application code (ops_postgresql.py, consolidation, migrations) uses unqualified
-- tokenize() which requires bm25_catalog and tokenizer_catalog on search_path.
SELECT current_database() AS db \gset
-- :"db" (not :db) interpolates the psql variable as a QUOTED identifier, so
-- database names containing uppercase letters, hyphens, etc. don't break the
-- statement or get case-folded to the wrong name.
ALTER DATABASE :"db" SET search_path TO bm25_catalog, tokenizer_catalog, "$user", public;
-- NOTE: a database-level SET only takes effect for NEW connections; the current
-- session keeps its old search_path, which is why the steps below fully
-- schema-qualify tokenizer_catalog.* and bm25_catalog.* names.
-- Step 4: Rebuild search_vector on memory_units — replace the native tsvector
-- column + GIN index with a bm25vector column + BM25 index. The whole DO block
-- runs in one transaction, so readers never see a half-migrated table.
DO $$
BEGIN
-- Drop existing GIN index (IF EXISTS keeps re-runs safe)
DROP INDEX IF EXISTS idx_memory_units_text_search;
-- Drop existing tsvector column (GENERATED columns can't be altered, must drop)
-- On a re-run this also drops the bm25vector column created below, so the
-- backfill re-tokenizes every row: idempotent in outcome, but not free.
ALTER TABLE memory_units DROP COLUMN IF EXISTS search_vector;
-- Add bm25vector column
ALTER TABLE memory_units ADD COLUMN search_vector bm25_catalog.bm25vector;
-- Create BM25 index
CREATE INDEX idx_memory_units_text_search
ON memory_units USING bm25 (search_vector bm25_catalog.bm25_ops);
-- Backfill existing rows: tokenize text + context (NULL-safe via COALESCE)
-- with the llmlingua2 tokenizer registered in Step 2. Single UPDATE — holds
-- row locks for the whole transaction; batch it if the table is very large.
UPDATE memory_units
SET search_vector = tokenizer_catalog.tokenize(
COALESCE(text, '') || ' ' || COALESCE(context, ''),
'llmlingua2'
)::bm25_catalog.bm25vector
WHERE search_vector IS NULL;
RAISE NOTICE 'memory_units: done';
END $$;
-- Step 5: Rebuild search_vector on reflections (if table exists).
-- Same drop / re-add / backfill pattern as memory_units, guarded because the
-- reflections table may be absent in some deployments.
DO $$
BEGIN
-- NOTE(review): the information_schema lookup is not schema-qualified — a
-- 'reflections' table in ANY schema makes this branch run, while the
-- unqualified DDL below targets whatever schema is first on search_path.
-- Confirm single-schema deployment or add table_schema = 'public'.
IF EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'reflections') THEN
DROP INDEX IF EXISTS idx_reflections_text_search;
ALTER TABLE reflections DROP COLUMN IF EXISTS search_vector;
ALTER TABLE reflections ADD COLUMN search_vector bm25_catalog.bm25vector;
CREATE INDEX idx_reflections_text_search
ON reflections USING bm25 (search_vector bm25_catalog.bm25_ops);
-- Backfill: tokenize name + content (NULL-safe) with the llmlingua2 tokenizer.
UPDATE reflections
SET search_vector = tokenizer_catalog.tokenize(
COALESCE(name, '') || ' ' || COALESCE(content, ''),
'llmlingua2'
)::bm25_catalog.bm25vector
WHERE search_vector IS NULL;
RAISE NOTICE 'reflections: done';
ELSE
RAISE NOTICE 'reflections: table not found, skipping';
END IF;
END $$;
\echo 'Done. Update HINDSIGHT_API_TEXT_SEARCH_EXTENSION=vchord in .env and restart the app.'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment