@pramsey
Created October 16, 2015 14:18
--------------------------------------------------------------------------------
-- start with a 14M record table
--
taxi=# select count(*) from taxis ;
  count
----------
 14863748
(1 row)
Time: 2730.094 ms
--------------------------------------------------------------------------------
-- make a 1% sample
--
taxi=# create table taxis_small as select * from taxis where random() < 0.01;
SELECT 147980
Time: 3306.531 ms
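--------------------------------------------------------------------------------
-- aside (not in the original session): on PostgreSQL 9.5+, the SQL-standard
-- TABLESAMPLE clause builds an equivalent 1% sample; BERNOULLI keeps each
-- row with the given probability (like random() < 0.01), while SYSTEM
-- samples whole pages (faster, but clumpier)
--
create table taxis_small as
  select * from taxis tablesample bernoulli (1);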
--------------------------------------------------------------------------------
-- look, summary stats are just as good on the sample
--
taxi=# select avg(trip_time_in_seconds) from taxis;
       avg
------------------
 10.7779481042417
(1 row)
Time: 5426.898 ms
taxi=# select avg(trip_time_in_seconds) from taxis_small;
       avg
------------------
 10.7622901744179
(1 row)
Time: 51.605 ms
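--------------------------------------------------------------------------------
-- aside (not in the original session): the sample can also report its own
-- precision; a rough 95% confidence interval for the mean is
-- avg +/- 1.96 * stddev / sqrt(n)
--
select avg(trip_time_in_seconds) as mean,
       1.96 * stddev(trip_time_in_seconds) / sqrt(count(*)) as margin_95
from taxis_small;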
--------------------------------------------------------------------------------
-- so are roll-ups and histograms
--
taxi=# SELECT vendor_id, 100.0*count(*)/sum(count(*)) OVER () AS share
FROM taxis group by vendor_id;
 vendor_id |        share
-----------+---------------------
 VTS       | 50.1149373630392550
 DDS       |  5.6757622639996319
 CMT       | 44.2093003729611132
(3 rows)
Time: 7227.063 ms
taxi=# SELECT vendor_id, 100.0*count(*)/sum(count(*)) OVER () AS share
FROM taxis_small group by vendor_id;
 vendor_id |        share
-----------+---------------------
 VTS       | 50.4466819840518989
 DDS       |  5.6358967428030815
 CMT       | 43.9174212731450196
(3 rows)
Time: 68.610 ms
--------------------------------------------------------------------------------
-- but, what about the really infrequent results, so infrequent they
-- don't show up in the sample? like those taxi trips with 208
-- passengers?
--
taxi=# SELECT passenger_count, 100.0*count(*)/sum(count(*)) OVER () AS share
FROM taxis group by passenger_count order by share desc;
 passenger_count |           share
-----------------+----------------------------
               1 |        68.0126304617112723
               2 |        16.2959369332687825
               5 |         8.6556365191336667
               3 |         4.4585793569697226
               4 |         2.1332910111231703
               6 |     0.44384498445479565450
             208 | 0.000067277782158308927197
               0 | 0.000013455556431661785439
(8 rows)
Time: 7768.345 ms
taxi=# SELECT passenger_count, 100.0*count(*)/sum(count(*)) OVER () AS share
FROM taxis_small group by passenger_count order by share desc;
 passenger_count |         share
-----------------+------------------------
               1 |    67.9172861197459116
               2 |    16.2494931747533450
               5 |     8.7187457764562779
               3 |     4.4938505203405866
               4 |     2.1888092985538586
               6 | 0.43181511015002027301
(6 rows)
--------------------------------------------------------------------------------
-- well, what if we have an index, and we analyze it
--
taxi=# create index passenger_count_x on taxis (passenger_count);
taxi=# analyze taxis;
--------------------------------------------------------------------------------
-- the index is still fast for selective things, look! 1.3ms on the main table
--
taxi=# SELECT passenger_count, 100.0*count(*)/sum(count(*)) OVER () AS share
taxi-# FROM taxis where passenger_count > 6 group by passenger_count order by share desc;
 passenger_count |        share
-----------------+----------------------
             208 | 100.0000000000000000
(1 row)
Time: 1.302 ms
--------------------------------------------------------------------------------
-- although, it can get slow if we accidentally do non-selective things...
--
taxi=# SELECT passenger_count, 100.0*count(*)/sum(count(*)) OVER () AS share
FROM taxis where passenger_count > 5 group by passenger_count order by share desc;
 passenger_count |         share
-----------------+------------------------
               6 |    99.9848443514898002
             208 | 0.01515564851019975145
(2 rows)
Time: 1091.904 ms
--------------------------------------------------------------------------------
-- but the planner knows the difference! it takes just 5ms to figure out
-- if a query on the main table is going to hose us. so we can check and
-- only run big table queries where the "cost" is less than some threshold number
-- and delegate upwards to sample tables for other queries
--
taxi=# explain SELECT passenger_count, 100.0*count(*)/sum(count(*)) OVER () AS share
FROM taxis where passenger_count > 5 group by passenger_count order by share desc;
                                                QUERY PLAN
---------------------------------------------------------------------------------------------------------
 Sort  (cost=142670.45..142670.45 rows=1 width=4)
   Sort Key: (((100.0 * ((count(*)))::numeric) / sum((count(*))) OVER (?)))
   ->  WindowAgg  (cost=142670.40..142670.44 rows=1 width=4)
         ->  HashAggregate  (cost=142670.40..142670.42 rows=1 width=4)
               ->  Bitmap Heap Scan on taxis  (cost=1246.97..142172.47 rows=66391 width=4)
                     Recheck Cond: (passenger_count > 5)
                     ->  Bitmap Index Scan on passenger_count_x  (cost=0.00..1230.37 rows=66391 width=0)
                           Index Cond: (passenger_count > 5)
(8 rows)
Time: 5.330 ms
taxi=# explain SELECT passenger_count, 100.0*count(*)/sum(count(*)) OVER () AS share
FROM taxis where passenger_count > 6 group by passenger_count order by share desc;
                                              QUERY PLAN
------------------------------------------------------------------------------------------------------
 Sort  (cost=8.51..8.51 rows=1 width=4)
   Sort Key: (((100.0 * ((count(*)))::numeric) / sum((count(*))) OVER (?)))
   ->  WindowAgg  (cost=0.43..8.50 rows=1 width=4)
         ->  GroupAggregate  (cost=0.43..8.48 rows=1 width=4)
               ->  Index Only Scan using passenger_count_x on taxis  (cost=0.43..8.45 rows=1 width=4)
                     Index Cond: (passenger_count > 6)
(6 rows)
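--------------------------------------------------------------------------------
-- aside (not in the original session): one way to automate the check-and-
-- delegate idea is a small plpgsql helper that pulls the planner's total
-- cost out of EXPLAIN (FORMAT JSON); the function name is illustrative
--
create or replace function query_cost(q text) returns float8 as $$
declare
  plan json;
begin
  execute 'explain (format json) ' || q into plan;
  return (plan->0->'Plan'->>'Total Cost')::float8;
end;
$$ language plpgsql;
--
-- a caller can then run the query against taxis only when
-- query_cost(...) is under a threshold, and fall back to taxis_small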