Last active
September 28, 2024 15:49
-
-
Save adriangb/1b68bb2e408423ddcb90fb0136a00ba8 to your computer and use it in GitHub Desktop.
Faster array intersections in Postgres using exponential search
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE OR REPLACE FUNCTION has_overlap(arr1 text[], arr2 text[]) | |
RETURNS boolean AS $$ | |
DECLARE | |
i integer := 1; | |
j integer := 1; | |
n1 integer; | |
n2 integer; | |
x text; | |
BEGIN | |
-- Ensure arr1 is the shorter array | |
IF array_length(arr1, 1) > array_length(arr2, 1) THEN | |
SELECT arr2, arr1 INTO arr1, arr2; | |
END IF; | |
n1 := array_length(arr1, 1); | |
n2 := array_length(arr2, 1); | |
IF n1 IS NULL OR n2 IS NULL THEN | |
RETURN FALSE; | |
END IF; | |
WHILE i <= n1 AND j <= n2 LOOP | |
x := arr1[i]; | |
-- Skip elements in arr2 that are less than x | |
WHILE j <= n2 AND arr2[j] < x LOOP | |
j := j + 1; | |
END LOOP; | |
-- Check if we found a match | |
IF j <= n2 AND arr2[j] = x THEN | |
RETURN TRUE; | |
END IF; | |
i := i + 1; | |
END LOOP; | |
RETURN FALSE; | |
END; | |
$$ LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE; | |
-- Create a temporary table to store our test cases | |
CREATE TEMPORARY TABLE overlap_test_cases ( | |
id SERIAL PRIMARY KEY, | |
description TEXT, | |
arr1 TEXT[], | |
arr2 TEXT[], | |
expected_result BOOLEAN | |
); | |
-- Insert test cases | |
INSERT INTO overlap_test_cases (description, arr1, arr2, expected_result) VALUES | |
('Basic overlap at end', ARRAY['a', 'b', 'c'], ARRAY['c', 'd', 'e'], TRUE), | |
('No overlap', ARRAY['a', 'b', 'c'], ARRAY['d', 'e', 'f'], FALSE), | |
('Overlap in middle', ARRAY['a', 'b', 'c'], ARRAY['b', 'c', 'd'], TRUE), | |
('Empty first array', ARRAY[]::TEXT[], ARRAY['a', 'b', 'c'], FALSE), | |
('Empty second array', ARRAY['a', 'b', 'c'], ARRAY[]::TEXT[], FALSE), | |
('Both arrays empty', ARRAY[]::TEXT[], ARRAY[]::TEXT[], FALSE), | |
('Identical arrays', ARRAY['a', 'b', 'c'], ARRAY['a', 'b', 'c'], TRUE), | |
('Overlap at start', ARRAY['a', 'b', 'c'], ARRAY['a', 'd', 'e'], TRUE), | |
('Single element arrays with overlap', ARRAY['a'], ARRAY['a'], TRUE), | |
('Single element arrays without overlap', ARRAY['a'], ARRAY['b'], FALSE), | |
('First array is prefix of second', ARRAY['a', 'b'], ARRAY['a', 'b', 'c', 'd'], TRUE), | |
('Second array is prefix of first', ARRAY['a', 'b', 'c', 'd'], ARRAY['a', 'b'], TRUE), | |
('Overlap with repeated elements', ARRAY['a', 'b', 'b', 'c'], ARRAY['b', 'b', 'c', 'd'], TRUE), | |
('No overlap with repeated elements', ARRAY['a', 'a', 'b', 'b'], ARRAY['c', 'c', 'd', 'd'], FALSE), | |
('Unicode character overlap', ARRAY['α', 'β', 'γ'], ARRAY['γ', 'δ', 'ε'], TRUE), | |
('Case sensitive no overlap', ARRAY['A', 'B', 'C'], ARRAY['a', 'b', 'c'], FALSE), | |
('Overlap with spaces', ARRAY['a b', 'c d'], ARRAY['c d', 'e f'], TRUE), | |
('Very large arrays with overlap at end', | |
ARRAY(SELECT chr((i % 26) + 65) FROM generate_series(1, 1000) i), | |
ARRAY(SELECT chr((i % 26) + 65) FROM generate_series(1000, 2000) i), | |
TRUE); | |
-- Run test cases | |
SELECT | |
tc.id, | |
tc.description, | |
tc.expected_result, | |
has_overlap(tc.arr1, tc.arr2) AS actual_result, | |
CASE | |
WHEN tc.expected_result = has_overlap(tc.arr1, tc.arr2) THEN 'PASS' | |
ELSE 'FAIL' | |
END AS test_result | |
FROM overlap_test_cases tc | |
ORDER BY tc.id; | |
-- Clean up | |
DROP TABLE overlap_test_cases; | |
DROP SCHEMA IF EXISTS test CASCADE; | |
CREATE SCHEMA test; | |
CREATE TABLE test.test ( | |
x text[] | |
); | |
-- insert 100k rows with 1k random elements each array | |
-- do it in a loop in a function to avoid running out of memory | |
DO $$ | |
DECLARE | |
i int; | |
BEGIN | |
FOR i IN 1..100000 LOOP | |
INSERT INTO test.test (x) | |
SELECT array_sort(array_agg(md5(random()::text))) | |
FROM generate_series(1, 1000) array_gen; | |
COMMIT; | |
END LOOP; | |
END $$; | |
-- now there are 100M unique trace IDs in the table (100k rows * 1k elements each) | |
-- insert a single known row | |
INSERT INTO test.test (x) | |
VALUES (ARRAY['2094799883a44c31f62965e248f61681']); | |
-- https://app.pgmustard.com/#/explore/c0549e1b-ea38-43f6-8e52-77b2f7f57ed2 | |
explain (analyze, buffers, verbose, settings, format json) | |
SELECT count(*) | |
FROM test.test | |
WHERE x && ARRAY['2094799883a44c31f62965e248f61681']; | |
-- https://app.pgmustard.com/#/explore/75d07543-e90f-4bd2-ba7a-3042e4425acc | |
explain (analyze, buffers, verbose, settings, format json) | |
SELECT count(*) | |
FROM test.test | |
WHERE has_overlap(x, ARRAY['2094799883a44c31f62965e248f61681']); | |
-- https://app.pgmustard.com/#/explore/6a90a30a-2e75-48fc-b9fb-530fa0b93f4d | |
explain (analyze, buffers, verbose, settings, format json) | |
SELECT count(*) | |
FROM test.test | |
WHERE x && (SELECT x[500:600] FROM test.test LIMIT 1); | |
-- https://app.pgmustard.com/#/explore/d4c355dd-34e7-4d1f-a11f-cdd9b123186c | |
explain (analyze, buffers, verbose, settings, format json) | |
SELECT count(*) | |
FROM test.test | |
WHERE has_overlap(x, (SELECT x[500:600] FROM test.test LIMIT 1)); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment