Last active
April 19, 2023 20:28
-
-
Save wolever/9164392 to your computer and use it in GitHub Desktop.
Functions to create and draw histograms with PostgreSQL.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Functions to create and draw histograms with PostgreSQL. | |
-- | |
-- psql# WITH email_lengths AS ( | |
-- -# SELECT length(email) AS length | |
-- -# FROM auth_user | |
-- -# LIMIT 100 | |
-- -# ) | |
-- -# SELECT * FROM show_histogram((SELECT histogram(length, 0, 32, 6) FROM email_lengths)) | |
-- bucket | range | count | bar | cumbar | cumsum | cumpct | |
-- --------+-------------------------------------+-------+--------------------------------+--------------------------------+--------+------------------------ | |
-- 0 | [0,5.33333333333333) | 1 | | | 1 | 0.00273224043715846995 | |
-- 1 | [5.33333333333333,10.6666666666667) | 5 | = | | 6 | 0.01639344262295081967 | |
-- 2 | [10.6666666666667,16) | 149 | ============================== | ============= | 155 | 0.42349726775956284153 | |
-- 3 | [16,21.3333333333333) | 145 | ============================= | ========================= | 300 | 0.81967213114754098361 | |
-- 4 | [21.3333333333333,26.6666666666667) | 49 | ========== | ============================= | 349 | 0.95355191256830601093 | |
-- 5 | [26.6666666666667,32) | 17 | === | ============================== | 366 | 1.00000000000000000000 | |
-- (6 rows) | |
-- psql# | |
DROP TYPE IF EXISTS floatrange CASCADE; | |
CREATE TYPE floatrange AS RANGE ( | |
subtype = float8, | |
subtype_diff = float8mi | |
); | |
DROP TYPE IF EXISTS histogram_result CASCADE; | |
CREATE TYPE histogram_result AS ( | |
count INTEGER, | |
bucket INTEGER, | |
range floatrange | |
); | |
CREATE OR REPLACE FUNCTION hist_sfunc(state histogram_result[], val float8, min float8, max float8, nbuckets INTEGER) RETURNS histogram_result[] AS $$ | |
DECLARE | |
bucket INTEGER; | |
width float8; | |
i INTEGER; | |
BEGIN | |
-- width_bucket uses nbuckets + 1 (!) and starts at 1. | |
bucket := width_bucket(val, min, max, nbuckets - 1) - 1; | |
-- Init the array with the correct number of 0's so the caller doesn't see NULLs | |
IF state[0] IS NULL THEN | |
width := (max - min) / nbuckets; | |
FOR i IN SELECT * FROM generate_series(0, nbuckets - 1) LOOP | |
state[i] := (0, i, floatrange(i * width, (i + 1) * width)); | |
END LOOP; | |
END IF; | |
state[bucket] = (state[bucket].count + 1, state[bucket].bucket, state[bucket].range); | |
RETURN state; | |
END; | |
$$ LANGUAGE plpgsql IMMUTABLE; | |
CREATE AGGREGATE histogram(float8, float8, float8, INTEGER) ( | |
SFUNC = hist_sfunc, | |
STYPE = histogram_result[] | |
); | |
CREATE OR REPLACE FUNCTION histobar(v float8, tick_size float8) | |
RETURNS TEXT AS $$ | |
SELECT repeat('=', (v * tick_size)::integer); | |
$$ LANGUAGE SQL; | |
CREATE OR REPLACE FUNCTION show_histogram(h histogram_result[]) | |
RETURNS TABLE(bucket INTEGER, range floatrange, count INTEGER, bar TEXT, cumbar TEXT, cumsum INTEGER, cumpct NUMERIC) AS $$ | |
DECLARE | |
r histogram_result; | |
min_count integer := (select min(x.count) from unnest(h) as x); | |
max_count integer := (select max(x.count) from unnest(h) as x); | |
total_count integer := (select sum(x.count) from unnest(h) as x); | |
bar_max_width integer := 30; | |
bar_tick_size float8 := bar_max_width / (max_count - min_count)::float8; | |
bar text; | |
cumsum integer := 0; | |
cumpct numeric; | |
BEGIN | |
FOREACH r IN ARRAY h LOOP | |
IF r.bucket IS NULL THEN | |
CONTINUE; | |
END IF; | |
cumsum := cumsum + r.count; | |
cumpct := (cumsum::numeric / total_count); | |
bar := histobar(r.count, bar_tick_size); | |
RETURN QUERY VALUES ( | |
r.bucket, | |
r.range, | |
r.count, | |
bar, | |
histobar(cumpct, bar_max_width), | |
cumsum, | |
cumpct | |
); | |
END loop; | |
END; | |
$$ LANGUAGE plpgsql; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@wolever thanks for this. Would you consider supporting negative range values? E.g.
SELECT * FROM show_histogram((SELECT histogram(returns, -100, 100, 10) FROM data));
Results in
Expected: bucket 0 should be
[-100, -80]
.