Last active
May 5, 2026 15:42
-
-
Save liling/21b3cbe7ee51182f5e0aa18fa8fbc6d0 to your computer and use it in GitHub Desktop.
Switch HindSight 0.5.6 text search from native (tsvector) to vchord (bm25)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| -- Switch text search from native (tsvector) to vchord (bm25) | |
| -- | |
| -- Prerequisites: | |
| -- 1. Add to postgresql.conf: | |
| -- shared_preload_libraries = 'vchord,vchord_bm25,pg_tokenizer' | |
| -- 2. Restart PostgreSQL | |
| -- 3. Run this script: psql -d hindsight_dev -f switch_native_to_vchord.sql | |
| -- | |
| -- This script is idempotent — safe to run multiple times. | |
| \echo 'Switching text search: native → vchord' | |
| -- Step 1: Install extensions | |
| CREATE EXTENSION IF NOT EXISTS vchord CASCADE; | |
| CREATE EXTENSION IF NOT EXISTS pg_tokenizer CASCADE; | |
| CREATE EXTENSION IF NOT EXISTS vchord_bm25 CASCADE; | |
| -- Step 2: Create tokenizer (needed by INSERT queries) | |
| -- NOTE: config is TOML — values must be quoted: key = "value" | |
| DO $$ | |
| BEGIN | |
| IF NOT EXISTS (SELECT 1 FROM tokenizer_catalog.tokenizer WHERE name = 'llmlingua2') THEN | |
| PERFORM tokenizer_catalog.create_tokenizer('llmlingua2', 'model = "llmlingua2"'); | |
| RAISE NOTICE 'Created llmlingua2 tokenizer'; | |
| END IF; | |
| END $$; | |
| -- Step 3: Set search_path so bare tokenize() / to_bm25query() calls resolve correctly. | |
| -- Application code (ops_postgresql.py, consolidation, migrations) uses unqualified | |
| -- tokenize() which requires bm25_catalog and tokenizer_catalog on search_path. | |
| SELECT current_database() AS db \gset | |
| ALTER DATABASE :db SET search_path TO bm25_catalog, tokenizer_catalog, "$user", public; | |
| -- Step 4: Rebuild search_vector on memory_units | |
| DO $$ | |
| BEGIN | |
| -- Drop existing GIN index | |
| DROP INDEX IF EXISTS idx_memory_units_text_search; | |
| -- Drop existing tsvector column (GENERATED columns can't be altered, must drop) | |
| ALTER TABLE memory_units DROP COLUMN IF EXISTS search_vector; | |
| -- Add bm25vector column | |
| ALTER TABLE memory_units ADD COLUMN search_vector bm25_catalog.bm25vector; | |
| -- Create BM25 index | |
| CREATE INDEX idx_memory_units_text_search | |
| ON memory_units USING bm25 (search_vector bm25_catalog.bm25_ops); | |
| -- Backfill existing rows | |
| UPDATE memory_units | |
| SET search_vector = tokenizer_catalog.tokenize( | |
| COALESCE(text, '') || ' ' || COALESCE(context, ''), | |
| 'llmlingua2' | |
| )::bm25_catalog.bm25vector | |
| WHERE search_vector IS NULL; | |
| RAISE NOTICE 'memory_units: done'; | |
| END $$; | |
| -- Step 4: Rebuild search_vector on reflections (if table exists) | |
| DO $$ | |
| BEGIN | |
| IF EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'reflections') THEN | |
| DROP INDEX IF EXISTS idx_reflections_text_search; | |
| ALTER TABLE reflections DROP COLUMN IF EXISTS search_vector; | |
| ALTER TABLE reflections ADD COLUMN search_vector bm25_catalog.bm25vector; | |
| CREATE INDEX idx_reflections_text_search | |
| ON reflections USING bm25 (search_vector bm25_catalog.bm25_ops); | |
| UPDATE reflections | |
| SET search_vector = tokenizer_catalog.tokenize( | |
| COALESCE(name, '') || ' ' || COALESCE(content, ''), | |
| 'llmlingua2' | |
| )::bm25_catalog.bm25vector | |
| WHERE search_vector IS NULL; | |
| RAISE NOTICE 'reflections: done'; | |
| ELSE | |
| RAISE NOTICE 'reflections: table not found, skipping'; | |
| END IF; | |
| END $$; | |
| \echo 'Done. Update HINDSIGHT_API_TEXT_SEARCH_EXTENSION=vchord in .env and restart the app.' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment