adriangb · September 28, 2024 15:49
diff --git a/has_overlap.sql b/has_overlap.sql
 CREATE OR REPLACE FUNCTION has_overlap(arr1 text[], arr2 text[])
 RETURNS boolean AS $$
 DECLARE
    i integer := 1;
    j integer := 1;
    n1 integer;
    n2 integer;
    x text;
 BEGIN
    -- Ensure arr1 is the shorter array
    IF array_length(arr1, 1) > array_length(arr2, 1) THEN
        SELECT arr2, arr1 INTO arr1, arr2;
    END IF;

    n1 := array_length(arr1, 1);
    n2 := array_length(arr2, 1);

    IF n1 IS NULL OR n2 IS NULL THEN
        RETURN FALSE;
    END IF;

    WHILE i <= n1 AND j <= n2 LOOP
        x := arr1[i];
        
        -- Skip elements in arr2 that are less than x
        WHILE j <= n2 AND arr2[j] < x LOOP
            j := j + 1;
        END LOOP;
        
        -- Check if we found a match
        IF j <= n2 AND arr2[j] = x THEN
            RETURN TRUE;
        END IF;
        
        i := i + 1;
    END LOOP;

    RETURN FALSE;
 END;
 $$ LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;

 -- Create a temporary table to store our test cases
 CREATE TEMPORARY TABLE overlap_test_cases (
    id SERIAL PRIMARY KEY,
    description TEXT,
    arr1 TEXT[],
    arr2 TEXT[],
    expected_result BOOLEAN
 );

 -- Insert test cases
 INSERT INTO overlap_test_cases (description, arr1, arr2, expected_result) VALUES
    ('Basic overlap at end', ARRAY['a', 'b', 'c'], ARRAY['c', 'd', 'e'], TRUE),
    ('No overlap', ARRAY['a', 'b', 'c'], ARRAY['d', 'e', 'f'], FALSE),
    ('Overlap in middle', ARRAY['a', 'b', 'c'], ARRAY['b', 'c', 'd'], TRUE),
    ('Empty first array', ARRAY[]::TEXT[], ARRAY['a', 'b', 'c'], FALSE),
    ('Empty second array', ARRAY['a', 'b', 'c'], ARRAY[]::TEXT[], FALSE),
    ('Both arrays empty', ARRAY[]::TEXT[], ARRAY[]::TEXT[], FALSE),
    ('Identical arrays', ARRAY['a', 'b', 'c'], ARRAY['a', 'b', 'c'], TRUE),
    ('Overlap at start', ARRAY['a', 'b', 'c'], ARRAY['a', 'd', 'e'], TRUE),
    ('Single element arrays with overlap', ARRAY['a'], ARRAY['a'], TRUE),
    ('Single element arrays without overlap', ARRAY['a'], ARRAY['b'], FALSE),
    ('First array is prefix of second', ARRAY['a', 'b'], ARRAY['a', 'b', 'c', 'd'], TRUE),
    ('Second array is prefix of first', ARRAY['a', 'b', 'c', 'd'], ARRAY['a', 'b'], TRUE),
    ('Overlap with repeated elements', ARRAY['a', 'b', 'b', 'c'], ARRAY['b', 'b', 'c', 'd'], TRUE),
    ('No overlap with repeated elements', ARRAY['a', 'a', 'b', 'b'], ARRAY['c', 'c', 'd', 'd'], FALSE),
    ('Unicode character overlap', ARRAY['α', 'β', 'γ'], ARRAY['γ', 'δ', 'ε'], TRUE),
    ('Case sensitive no overlap', ARRAY['A', 'B', 'C'], ARRAY['a', 'b', 'c'], FALSE),
    ('Overlap with spaces', ARRAY['a b', 'c d'], ARRAY['c d', 'e f'], TRUE),
    ('Very large arrays with overlap at end', 
     ARRAY(SELECT chr((i % 26) + 65) FROM generate_series(1, 1000) i),
     ARRAY(SELECT chr((i % 26) + 65) FROM generate_series(1000, 2000) i),
     TRUE);

 -- Run test cases
 SELECT 
    tc.id,
    tc.description,
    tc.expected_result,
    has_overlap(tc.arr1, tc.arr2) AS actual_result,
    CASE 
        WHEN tc.expected_result = has_overlap(tc.arr1, tc.arr2) THEN 'PASS'
        ELSE 'FAIL'
    END AS test_result
 FROM overlap_test_cases tc
 ORDER BY tc.id;

 -- Clean up
 DROP TABLE overlap_test_cases;

 DROP SCHEMA IF EXISTS test CASCADE;

 CREATE SCHEMA test;

 CREATE TABLE test.test (
    x text[]
 );

 -- insert 100k rows with 1k random elements each array
 -- do it in a loop in a function to avoid running out of memory
 DO $$
 DECLARE
    i int;
 BEGIN
    FOR i IN 1..100000 LOOP
        INSERT INTO test.test (x)
        SELECT array_sort(array_agg(md5(random()::text)))
        FROM generate_series(1, 1000) array_gen;
        COMMIT;
    END LOOP;
 END $$;

 -- now there are 100M unique trace IDs in the table (100k rows * 1k elements each)

 -- insert a single known row
 INSERT INTO test.test (x)
 VALUES (ARRAY['2094799883a44c31f62965e248f61681']);

 -- https://app.pgmustard.com/#/explore/c0549e1b-ea38-43f6-8e52-77b2f7f57ed2
 explain (analyze, buffers, verbose, settings, format json)
 SELECT count(*)
 FROM test.test
 WHERE x && ARRAY['2094799883a44c31f62965e248f61681'];

 -- https://app.pgmustard.com/#/explore/75d07543-e90f-4bd2-ba7a-3042e4425acc
 explain (analyze, buffers, verbose, settings, format json)
 SELECT count(*)
 FROM test.test
 WHERE has_overlap(x, ARRAY['2094799883a44c31f62965e248f61681']);

 -- https://app.pgmustard.com/#/explore/6a90a30a-2e75-48fc-b9fb-530fa0b93f4d
 explain (analyze, buffers, verbose, settings, format json)
 SELECT count(*)
 FROM test.test
 WHERE x && (SELECT x[500:600] FROM test.test LIMIT 1);

 -- https://app.pgmustard.com/#/explore/d4c355dd-34e7-4d1f-a11f-cdd9b123186c
 explain (analyze, buffers, verbose, settings, format json)
 SELECT count(*)
 FROM test.test
 WHERE has_overlap(x, (SELECT x[500:600] FROM test.test LIMIT 1));
	CREATE OR REPLACE FUNCTION has_overlap(arr1 text[], arr2 text[])
	RETURNS boolean AS $$
	DECLARE
	i integer := 1;
	j integer := 1;
	n1 integer;
	n2 integer;
	x text;
	BEGIN
	-- Ensure arr1 is the shorter array
	IF array_length(arr1, 1) > array_length(arr2, 1) THEN
	SELECT arr2, arr1 INTO arr1, arr2;
	END IF;

	n1 := array_length(arr1, 1);
	n2 := array_length(arr2, 1);

	IF n1 IS NULL OR n2 IS NULL THEN
	RETURN FALSE;
	END IF;

	WHILE i <= n1 AND j <= n2 LOOP
	x := arr1[i];

	-- Skip elements in arr2 that are less than x
	WHILE j <= n2 AND arr2[j] < x LOOP
	j := j + 1;
	END LOOP;

	-- Check if we found a match
	IF j <= n2 AND arr2[j] = x THEN
	RETURN TRUE;
	END IF;

	i := i + 1;
	END LOOP;

	RETURN FALSE;
	END;
	$$ LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;

	-- Create a temporary table to store our test cases
	CREATE TEMPORARY TABLE overlap_test_cases (
	id SERIAL PRIMARY KEY,
	description TEXT,
	arr1 TEXT[],
	arr2 TEXT[],
	expected_result BOOLEAN
	);

	-- Insert test cases
	INSERT INTO overlap_test_cases (description, arr1, arr2, expected_result) VALUES
	('Basic overlap at end', ARRAY['a', 'b', 'c'], ARRAY['c', 'd', 'e'], TRUE),
	('No overlap', ARRAY['a', 'b', 'c'], ARRAY['d', 'e', 'f'], FALSE),
	('Overlap in middle', ARRAY['a', 'b', 'c'], ARRAY['b', 'c', 'd'], TRUE),
	('Empty first array', ARRAY[]::TEXT[], ARRAY['a', 'b', 'c'], FALSE),
	('Empty second array', ARRAY['a', 'b', 'c'], ARRAY[]::TEXT[], FALSE),
	('Both arrays empty', ARRAY[]::TEXT[], ARRAY[]::TEXT[], FALSE),
	('Identical arrays', ARRAY['a', 'b', 'c'], ARRAY['a', 'b', 'c'], TRUE),
	('Overlap at start', ARRAY['a', 'b', 'c'], ARRAY['a', 'd', 'e'], TRUE),
	('Single element arrays with overlap', ARRAY['a'], ARRAY['a'], TRUE),
	('Single element arrays without overlap', ARRAY['a'], ARRAY['b'], FALSE),
	('First array is prefix of second', ARRAY['a', 'b'], ARRAY['a', 'b', 'c', 'd'], TRUE),
	('Second array is prefix of first', ARRAY['a', 'b', 'c', 'd'], ARRAY['a', 'b'], TRUE),
	('Overlap with repeated elements', ARRAY['a', 'b', 'b', 'c'], ARRAY['b', 'b', 'c', 'd'], TRUE),
	('No overlap with repeated elements', ARRAY['a', 'a', 'b', 'b'], ARRAY['c', 'c', 'd', 'd'], FALSE),
	('Unicode character overlap', ARRAY['α', 'β', 'γ'], ARRAY['γ', 'δ', 'ε'], TRUE),
	('Case sensitive no overlap', ARRAY['A', 'B', 'C'], ARRAY['a', 'b', 'c'], FALSE),
	('Overlap with spaces', ARRAY['a b', 'c d'], ARRAY['c d', 'e f'], TRUE),
	('Very large arrays with overlap at end',
	ARRAY(SELECT chr((i % 26) + 65) FROM generate_series(1, 1000) i),
	ARRAY(SELECT chr((i % 26) + 65) FROM generate_series(1000, 2000) i),
	TRUE);

	-- Run test cases
	SELECT
	tc.id,
	tc.description,
	tc.expected_result,
	has_overlap(tc.arr1, tc.arr2) AS actual_result,
	CASE
	WHEN tc.expected_result = has_overlap(tc.arr1, tc.arr2) THEN 'PASS'
	ELSE 'FAIL'
	END AS test_result
	FROM overlap_test_cases tc
	ORDER BY tc.id;

	-- Clean up
	DROP TABLE overlap_test_cases;

	DROP SCHEMA IF EXISTS test CASCADE;

	CREATE SCHEMA test;

	CREATE TABLE test.test (
	x text[]
	);

	-- insert 100k rows with 1k random elements each array
	-- do it in a loop in a function to avoid running out of memory
	DO $$
	DECLARE
	i int;
	BEGIN
	FOR i IN 1..100000 LOOP
	INSERT INTO test.test (x)
	SELECT array_sort(array_agg(md5(random()::text)))
	FROM generate_series(1, 1000) array_gen;
	COMMIT;
	END LOOP;
	END $$;

	-- now there are 100M unique trace IDs in the table (100k rows * 1k elements each)

	-- insert a single known row
	INSERT INTO test.test (x)
	VALUES (ARRAY['2094799883a44c31f62965e248f61681']);

	-- https://app.pgmustard.com/#/explore/c0549e1b-ea38-43f6-8e52-77b2f7f57ed2
	explain (analyze, buffers, verbose, settings, format json)
	SELECT count(*)
	FROM test.test
	WHERE x && ARRAY['2094799883a44c31f62965e248f61681'];

	-- https://app.pgmustard.com/#/explore/75d07543-e90f-4bd2-ba7a-3042e4425acc
	explain (analyze, buffers, verbose, settings, format json)
	SELECT count(*)
	FROM test.test
	WHERE has_overlap(x, ARRAY['2094799883a44c31f62965e248f61681']);

	-- https://app.pgmustard.com/#/explore/6a90a30a-2e75-48fc-b9fb-530fa0b93f4d
	explain (analyze, buffers, verbose, settings, format json)
	SELECT count(*)
	FROM test.test
	WHERE x && (SELECT x[500:600] FROM test.test LIMIT 1);

	-- https://app.pgmustard.com/#/explore/d4c355dd-34e7-4d1f-a11f-cdd9b123186c
	explain (analyze, buffers, verbose, settings, format json)
	SELECT count(*)
	FROM test.test
	WHERE has_overlap(x, (SELECT x[500:600] FROM test.test LIMIT 1));