Skip to content

Instantly share code, notes, and snippets.

@matheusoliveira
Created February 21, 2015 16:32
Show Gist options
  • Save matheusoliveira/3c6f26626ea1eedb5c2a to your computer and use it in GitHub Desktop.
Save matheusoliveira/3c6f26626ea1eedb5c2a to your computer and use it in GitHub Desktop.
A very optimized way of getting random rows of a table on PostgreSQL (gets by "id" in a fast and non-biased way, if with gaps)
/* One random row from table "tbl" */
WITH RECURSIVE r AS (
SELECT NULL::int AS id, min(id) AS min_id, max(id) AS max_id, 0 AS cnt
FROM tbl
UNION ALL
SELECT tbl.id, r.min_id, r.max_id, r.cnt + 1
FROM r LEFT JOIN tbl
ON tbl.id = (SELECT floor(random() * (r.max_id - r.min_id + 1))::int)
WHERE r.id IS NULL
)
SELECT r.id, r.cnt FROM r WHERE r.id IS NOT NULL;
/* 10 (easily adjustable) random rows from table "tbl" (doesn't handle duplication, a simple exercise for the reader [tip: arrays and ANY operator]) */
WITH RECURSIVE r AS (
SELECT NULL::int AS id, min(id) AS min_id, max(id) AS max_id, 0 AS cnt, 0 AS taken
FROM tbl2
UNION ALL
SELECT tbl2.id, r.min_id, r.max_id, r.cnt + 1, r.taken + CASE WHEN tbl2.id IS NULL THEN 0 ELSE 1 END
FROM r LEFT JOIN tbl2
ON tbl2.id = (SELECT floor(random() * (r.max_id - r.min_id + 1))::int)
WHERE r.taken < 10
)
SELECT r.id, r.cnt FROM r WHERE r.id IS NOT NULL;
/* OBS: On both "cnt" result is just to show how many iterations it took to return */
CREATE TABLE tbl AS SELECT id FROM generate_series(1, 10000000, 50 /* some gaps */) id;
CREATE INDEX ON tbl (id);
VACUUM ANALYZE tbl;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment