agsha · March 30, 2020 13:16
diff --git a/postgres bulk percentiles calculation b/postgres bulk percentiles calculation

 This will return the percentiles in order.

 -- #1
 select unnest(
 percentile_cont(
 	(select array_agg(s) from generate_series(0, 1, 0.2) as s)
 ) WITHIN GROUP (ORDER BY SIZE))
 from mytable


 Made easier using LATERAL, and this one gives the (fractional) percentile too: 

 -- #2
 select percentile, value
 from generate_series(0, 1, 0.2) as percentile,
 lateral (select percentile_cont(percentile) WITHIN GROUP (ORDER BY SIZE) as value from mytable) values

 The lateral approach is a lot slower, though. Calculation time scales linearly based on the number of percentiles you want to calculate - percentile_cont() is being called for each item in generate_series, while the version above just runs it once. 


 This is the best way I've found to use the first method while returning the input percentile - though this is pretty hacky, as it more or less makes up a column to join on. 

 -- #3
 select pct, value
 from (select row_number() OVER () as rownum, value
 	from (select unnest(
 		percentile_cont(
 			(select array_agg(s) from generate_series(0, 1, 0.2) as s)
 		) WITHIN GROUP (ORDER BY SIZE)) as value
 		from mytable) s2
 	) s3
 join (select row_number() OVER () as rownum, pct
 	from (select generate_series(0, 1, 0.2) as pct) s) p
 	on p.rownum = s3.rownum


 ntile() looks a bit better but getting lower/upper bounds is a bit clumsy, and values here are not quite the same as percentile_cont. (Might be the same as percentile_disc().)
 This is about half as fast as query #3 on my dataset. 

 -- #4
 select ((nt-1)/5::float) as pctile,  min(size)
 from (select size, ntile(6) over (order by size) as nt
   from mytable
 ) as dt
 group by nt
 order by nt asc


 This is the approach I selected. A lot cleaner than #3 above.

 SELECT unnest(
  (select array_agg(fraction) from generate_series(0, 1, 0.2) AS fraction)
 ) as percentile,
 unnest(
  (select percentile_cont((select array_agg(s) from generate_series(0, 1, 0.2) as s)) WITHIN GROUP (ORDER BY SIZE) FROM mytable)
 ) as value;


 -- Or with manually defined percentiles (tested on pg 9.6)
 select unnest(
 	(ARRAY[0.5, 0.75, 0.9, 0.95, 0.97, 0.98, 0.99, 1])
 ) as percentile,
 unnest(
 	(select percentile_cont(ARRAY[0.5, 0.75, 0.9, 0.95, 0.97, 0.98, 0.99, 1]) 
 		WITHIN GROUP (ORDER BY value) FROM mytable
 	)
 ) as value

	This will return the percentiles in order.

	-- #1
	select unnest(
	percentile_cont(
	(select array_agg(s) from generate_series(0, 1, 0.2) as s)
	) WITHIN GROUP (ORDER BY SIZE))
	from mytable


	Made easier using LATERAL, and this one gives the (fractional) percentile too:

	-- #2
	select percentile, value
	from generate_series(0, 1, 0.2) as percentile,
	lateral (select percentile_cont(percentile) WITHIN GROUP (ORDER BY SIZE) as value from mytable) values

	The lateral approach is a lot slower, though. Calculation time scales linearly based on the number of percentiles you want to calculate - percentile_cont() is being called for each item in generate_series, while the version above just runs it once.


	This is the best way I've found to use the first method while returning the input percentile - though this is pretty hacky, as it more or less makes up a column to join on.

	-- #3
	select pct, value
	from (select row_number() OVER () as rownum, value
	from (select unnest(
	percentile_cont(
	(select array_agg(s) from generate_series(0, 1, 0.2) as s)
	) WITHIN GROUP (ORDER BY SIZE)) as value
	from mytable) s2
	) s3
	join (select row_number() OVER () as rownum, pct
	from (select generate_series(0, 1, 0.2) as pct) s) p
	on p.rownum = s3.rownum


	ntile() looks a bit better but getting lower/upper bounds is a bit clumsy, and values here are not quite the same as percentile_cont. (Might be the same as percentile_disc().)
	This is about half as fast as query #3 on my dataset.

	-- #4
	select ((nt-1)/5::float) as pctile, min(size)
	from (select size, ntile(6) over (order by size) as nt
	from mytable
	) as dt
	group by nt
	order by nt asc


	This is the approach I selected. A lot cleaner than #3 above.

	SELECT unnest(
	(select array_agg(fraction) from generate_series(0, 1, 0.2) AS fraction)
	) as percentile,
	unnest(
	(select percentile_cont((select array_agg(s) from generate_series(0, 1, 0.2) as s)) WITHIN GROUP (ORDER BY SIZE) FROM mytable)
	) as value;


	-- Or with manually defined percentiles (tested on pg 9.6)
	select unnest(
	(ARRAY[0.5, 0.75, 0.9, 0.95, 0.97, 0.98, 0.99, 1])
	) as percentile,
	unnest(
	(select percentile_cont(ARRAY[0.5, 0.75, 0.9, 0.95, 0.97, 0.98, 0.99, 1])
	WITHIN GROUP (ORDER BY value) FROM mytable
	)
	) as value