Skip to content

Instantly share code, notes, and snippets.

@adriangb
Last active September 28, 2024 15:49
Show Gist options
  • Save adriangb/1b68bb2e408423ddcb90fb0136a00ba8 to your computer and use it in GitHub Desktop.
Save adriangb/1b68bb2e408423ddcb90fb0136a00ba8 to your computer and use it in GitHub Desktop.
Faster array intersections in Postgres using exponential search
CREATE OR REPLACE FUNCTION has_overlap(arr1 text[], arr2 text[])
RETURNS boolean AS $$
DECLARE
i integer := 1;
j integer := 1;
n1 integer;
n2 integer;
x text;
BEGIN
-- Ensure arr1 is the shorter array
IF array_length(arr1, 1) > array_length(arr2, 1) THEN
SELECT arr2, arr1 INTO arr1, arr2;
END IF;
n1 := array_length(arr1, 1);
n2 := array_length(arr2, 1);
IF n1 IS NULL OR n2 IS NULL THEN
RETURN FALSE;
END IF;
WHILE i <= n1 AND j <= n2 LOOP
x := arr1[i];
-- Skip elements in arr2 that are less than x
WHILE j <= n2 AND arr2[j] < x LOOP
j := j + 1;
END LOOP;
-- Check if we found a match
IF j <= n2 AND arr2[j] = x THEN
RETURN TRUE;
END IF;
i := i + 1;
END LOOP;
RETURN FALSE;
END;
$$ LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE;
-- Create a temporary table to store our test cases
CREATE TEMPORARY TABLE overlap_test_cases (
id SERIAL PRIMARY KEY,
description TEXT,
arr1 TEXT[],
arr2 TEXT[],
expected_result BOOLEAN
);
-- Insert test cases
INSERT INTO overlap_test_cases (description, arr1, arr2, expected_result) VALUES
('Basic overlap at end', ARRAY['a', 'b', 'c'], ARRAY['c', 'd', 'e'], TRUE),
('No overlap', ARRAY['a', 'b', 'c'], ARRAY['d', 'e', 'f'], FALSE),
('Overlap in middle', ARRAY['a', 'b', 'c'], ARRAY['b', 'c', 'd'], TRUE),
('Empty first array', ARRAY[]::TEXT[], ARRAY['a', 'b', 'c'], FALSE),
('Empty second array', ARRAY['a', 'b', 'c'], ARRAY[]::TEXT[], FALSE),
('Both arrays empty', ARRAY[]::TEXT[], ARRAY[]::TEXT[], FALSE),
('Identical arrays', ARRAY['a', 'b', 'c'], ARRAY['a', 'b', 'c'], TRUE),
('Overlap at start', ARRAY['a', 'b', 'c'], ARRAY['a', 'd', 'e'], TRUE),
('Single element arrays with overlap', ARRAY['a'], ARRAY['a'], TRUE),
('Single element arrays without overlap', ARRAY['a'], ARRAY['b'], FALSE),
('First array is prefix of second', ARRAY['a', 'b'], ARRAY['a', 'b', 'c', 'd'], TRUE),
('Second array is prefix of first', ARRAY['a', 'b', 'c', 'd'], ARRAY['a', 'b'], TRUE),
('Overlap with repeated elements', ARRAY['a', 'b', 'b', 'c'], ARRAY['b', 'b', 'c', 'd'], TRUE),
('No overlap with repeated elements', ARRAY['a', 'a', 'b', 'b'], ARRAY['c', 'c', 'd', 'd'], FALSE),
('Unicode character overlap', ARRAY['α', 'β', 'γ'], ARRAY['γ', 'δ', 'ε'], TRUE),
('Case sensitive no overlap', ARRAY['A', 'B', 'C'], ARRAY['a', 'b', 'c'], FALSE),
('Overlap with spaces', ARRAY['a b', 'c d'], ARRAY['c d', 'e f'], TRUE),
('Very large arrays with overlap at end',
ARRAY(SELECT chr((i % 26) + 65) FROM generate_series(1, 1000) i),
ARRAY(SELECT chr((i % 26) + 65) FROM generate_series(1000, 2000) i),
TRUE);
-- Run test cases
SELECT
tc.id,
tc.description,
tc.expected_result,
has_overlap(tc.arr1, tc.arr2) AS actual_result,
CASE
WHEN tc.expected_result = has_overlap(tc.arr1, tc.arr2) THEN 'PASS'
ELSE 'FAIL'
END AS test_result
FROM overlap_test_cases tc
ORDER BY tc.id;
-- Clean up
DROP TABLE overlap_test_cases;
DROP SCHEMA IF EXISTS test CASCADE;
CREATE SCHEMA test;
CREATE TABLE test.test (
x text[]
);
-- insert 100k rows with 1k random elements each array
-- do it in a loop in a function to avoid running out of memory
DO $$
DECLARE
i int;
BEGIN
FOR i IN 1..100000 LOOP
INSERT INTO test.test (x)
SELECT array_sort(array_agg(md5(random()::text)))
FROM generate_series(1, 1000) array_gen;
COMMIT;
END LOOP;
END $$;
-- now there are 100M unique trace IDs in the table (100k rows * 1k elements each)
-- insert a single known row
INSERT INTO test.test (x)
VALUES (ARRAY['2094799883a44c31f62965e248f61681']);
-- https://app.pgmustard.com/#/explore/c0549e1b-ea38-43f6-8e52-77b2f7f57ed2
explain (analyze, buffers, verbose, settings, format json)
SELECT count(*)
FROM test.test
WHERE x && ARRAY['2094799883a44c31f62965e248f61681'];
-- https://app.pgmustard.com/#/explore/75d07543-e90f-4bd2-ba7a-3042e4425acc
explain (analyze, buffers, verbose, settings, format json)
SELECT count(*)
FROM test.test
WHERE has_overlap(x, ARRAY['2094799883a44c31f62965e248f61681']);
-- https://app.pgmustard.com/#/explore/6a90a30a-2e75-48fc-b9fb-530fa0b93f4d
explain (analyze, buffers, verbose, settings, format json)
SELECT count(*)
FROM test.test
WHERE x && (SELECT x[500:600] FROM test.test LIMIT 1);
-- https://app.pgmustard.com/#/explore/d4c355dd-34e7-4d1f-a11f-cdd9b123186c
explain (analyze, buffers, verbose, settings, format json)
SELECT count(*)
FROM test.test
WHERE has_overlap(x, (SELECT x[500:600] FROM test.test LIMIT 1));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment