Skip to content

Instantly share code, notes, and snippets.

@louismullie
Last active November 29, 2023 23:53

Revisions

  1. louismullie revised this gist Nov 29, 2023. 1 changed file with 8 additions and 5 deletions.
    13 changes: 8 additions & 5 deletions search.sql
    Original file line number Diff line number Diff line change
    @@ -39,6 +39,7 @@ BEGIN
    RETURN
    END

    -- Create tables
    CREATE TABLE synonyms (
    term VARCHAR(255),
    longest_form VARCHAR(255)
    @@ -49,6 +50,9 @@ CREATE TABLE abbreviations (
    full_form VARCHAR(255)
    );

    CREATE VIRTUAL TABLE documents USING fts5(title, body);

    -- Example data
    INSERT INTO synonyms (term, longest_form) VALUES
    ('anca', 'antineutrophil cytoplasmic antibodies'),
    ('anca associated vasculitis', 'antineutrophil cytoplasmic antibodies associated vasculitis');
    @@ -57,7 +61,9 @@ INSERT INTO abbreviations (abbreviation, full_form) VALUES
    ('anca', 'antineutrophil cytoplasmic antibodies'),
    ('aav', 'anca associated vasculitis');

    -- Assuming you have an input query
    INSERT INTO documents (title, body) VALUES ('Example Title', 'Example body text');

    -- Declare input query
    DECLARE @input_query VARCHAR(255);
    SET @input_query = 'your input query here';

    @@ -108,9 +114,6 @@ SELECT * FROM contracted_query
    UNION
    SELECT * FROM nested_contracted_query) as search_queries;

    -- Step 7: Query Documents
    CREATE VIRTUAL TABLE documents USING fts5(title, body);
    INSERT INTO documents (title, body) VALUES ('Example Title', 'Example body text');

    -- Step 7: Query Documents using FTS5
    SELECT * FROM documents
    WHERE documents MATCH (group_concat(search_queries, ' OR '));
  2. louismullie created this gist Nov 29, 2023.
    116 changes: 116 additions & 0 deletions search.sql
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,116 @@
    -- Function to split the input query and generate all contiguous combinations
    CREATE FUNCTION split_to_table(@input_query VARCHAR(MAX))
    RETURNS @result TABLE (combination VARCHAR(MAX))
    AS
    BEGIN
    DECLARE @words TABLE (id INT IDENTITY(1,1), word VARCHAR(255))
    DECLARE @total_words INT, @i INT, @j INT, @current_combination VARCHAR(MAX)

    -- Step 1: Splitting the input string into words
    INSERT INTO @words (word)
    SELECT value
    FROM STRING_SPLIT(@input_query, ' ') -- This function varies based on SQL dialect

    SELECT @total_words = COUNT(*) FROM @words

    -- Step 2: Generating all contiguous combinations
    SET @i = 1
    WHILE @i <= @total_words
    BEGIN
    SET @j = @i
    SET @current_combination = ''

    WHILE @j <= @total_words
    BEGIN
    SELECT @current_combination = @current_combination + ' ' + word
    FROM @words
    WHERE id = @j

    -- Insert the current combination into the result table
    INSERT INTO @result (combination)
    VALUES (LTRIM(@current_combination))

    SET @j = @j + 1
    END

    SET @i = @i + 1
    END

    RETURN
    END

    CREATE TABLE synonyms (
    term VARCHAR(255),
    longest_form VARCHAR(255)
    );

    CREATE TABLE abbreviations (
    abbreviation VARCHAR(255),
    full_form VARCHAR(255)
    );

    INSERT INTO synonyms (term, longest_form) VALUES
    ('anca', 'antineutrophil cytoplasmic antibodies'),
    ('anca associated vasculitis', 'antineutrophil cytoplasmic antibodies associated vasculitis');

    INSERT INTO abbreviations (abbreviation, full_form) VALUES
    ('anca', 'antineutrophil cytoplasmic antibodies'),
    ('aav', 'anca associated vasculitis');

    -- Assuming you have an input query
    DECLARE @input_query VARCHAR(255);
    SET @input_query = 'your input query here';

    -- Step 1: Breaking down the input query
    WITH split_terms AS (
    SELECT term
    FROM split_to_table(@input_query, ' ')
    )

    -- Step 2: Matching terms against synonyms
    , expanded_terms AS (
    SELECT s.longest_form
    FROM split_terms st
    LEFT JOIN synonyms s ON st.term = s.term
    )

    -- Step 3: Apply expansion
    , expanded_query AS (
    SELECT DISTINCT longest_form
    FROM expanded_terms
    ORDER BY LENGTH(longest_form) DESC
    )

    -- Step 4: Apply contraction
    , contracted_query AS (
    SELECT DISTINCT a.abbreviation
    FROM abbreviations a
    WHERE a.full_form IN (SELECT longest_form FROM expanded_terms)
    ORDER BY LENGTH(a.full_form) DESC
    )

    -- Step 5: Nested contraction
    , nested_contracted_query AS (
    SELECT DISTINCT a.abbreviation
    FROM abbreviations a
    WHERE a.full_form IN (
    SELECT abbreviation FROM abbreviations WHERE full_form IN (
    SELECT longest_form FROM expanded_terms
    )
    )
    ORDER BY LENGTH(a.full_form) DESC
    )

    -- Step 6: Final Combined Query
    WITH (SELECT * FROM expanded_query
    UNION
    SELECT * FROM contracted_query
    UNION
    SELECT * FROM nested_contracted_query) as search_queries;

    -- Step 7: Query Documents
    CREATE VIRTUAL TABLE documents USING fts5(title, body);
    INSERT INTO documents (title, body) VALUES ('Example Title', 'Example body text');

    SELECT * FROM documents
    WHERE documents MATCH (group_concat(search_queries, ' OR '));