Created
October 16, 2015 14:18
-
-
Save pramsey/a4b25b855c6904b5be91 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| -------------------------------------------------------------------------------- | |
| -- start with a 14M record table | |
| -- | |
| taxi=# select count(*) from taxis ; | |
| count | |
| ---------- | |
| 14863748 | |
| (1 row) | |
| Time: 2730.094 ms | |
| -------------------------------------------------------------------------------- | |
| -- make a 1% sample | |
| -- | |
| taxi=# create table taxis_small as select * from taxis where random() < 0.01; | |
| SELECT 147980 | |
| Time: 3306.531 ms | |
| -------------------------------------------------------------------------------- | |
| -- look, summary stats are just as good on the sample | |
| -- | |
| taxi=# select avg(trip_time_in_seconds) from taxis; | |
| avg | |
| ------------------ | |
| 10.7779481042417 | |
| (1 row) | |
| Time: 5426.898 ms | |
| taxi=# select avg(trip_time_in_seconds) from taxis_small; | |
| avg | |
| ------------------ | |
| 10.7622901744179 | |
| (1 row) | |
| Time: 51.605 ms | |
| -------------------------------------------------------------------------------- | |
| -- so are roll-ups and histograms | |
| -- | |
| taxi=# SELECT vendor_id, 100.0*count(*)/sum(count(*)) OVER () AS share | |
| FROM taxis group by vendor_id; | |
| vendor_id | share | |
| -----------+--------------------- | |
| VTS | 50.1149373630392550 | |
| DDS | 5.6757622639996319 | |
| CMT | 44.2093003729611132 | |
| (3 rows) | |
| Time: 7227.063 ms | |
| taxi=# SELECT vendor_id, 100.0*count(*)/sum(count(*)) OVER () AS share | |
| FROM taxis_small group by vendor_id; | |
| vendor_id | share | |
| -----------+--------------------- | |
| VTS | 50.4466819840518989 | |
| DDS | 5.6358967428030815 | |
| CMT | 43.9174212731450196 | |
| (3 rows) | |
| Time: 68.610 ms | |
| -------------------------------------------------------------------------------- | |
| -- but, what about the really infrequent results, so infrequent they | |
| -- don't show up in the sample? like those taxi trips with 208 | |
| -- passengers? | |
| -- | |
| taxi=# SELECT passenger_count, 100.0*count(*)/sum(count(*)) OVER () AS share | |
| FROM taxis group by passenger_count order by share desc; | |
| passenger_count | share | |
| -----------------+---------------------------- | |
| 1 | 68.0126304617112723 | |
| 2 | 16.2959369332687825 | |
| 5 | 8.6556365191336667 | |
| 3 | 4.4585793569697226 | |
| 4 | 2.1332910111231703 | |
| 6 | 0.44384498445479565450 | |
| 208 | 0.000067277782158308927197 | |
| 0 | 0.000013455556431661785439 | |
| (8 rows) | |
| Time: 7768.345 ms | |
| taxi=# SELECT passenger_count, 100.0*count(*)/sum(count(*)) OVER () AS share | |
| FROM taxis_small group by passenger_count order by share desc; | |
| passenger_count | share | |
| -----------------+------------------------ | |
| 1 | 67.9172861197459116 | |
| 2 | 16.2494931747533450 | |
| 5 | 8.7187457764562779 | |
| 3 | 4.4938505203405866 | |
| 4 | 2.1888092985538586 | |
| 6 | 0.43181511015002027301 | |
| -------------------------------------------------------------------------------- | |
| -- well, what if we have an index, and we analyze it | |
| -- | |
| taxi=# create index passenger_count_x on taxis (passenger_count); | |
| taxi=# analyze taxis | |
| -------------------------------------------------------------------------------- | |
| -- the index is still fast for selective things, look! 1.3ms on the main table | |
| -- | |
| taxi=# SELECT passenger_count, 100.0*count(*)/sum(count(*)) OVER () AS share | |
| taxi-# FROM taxis where passenger_count > 6 group by passenger_count order by share desc; | |
| passenger_count | share | |
| -----------------+---------------------- | |
| 208 | 100.0000000000000000 | |
| (1 row) | |
| Time: 1.302 ms | |
| -------------------------------------------------------------------------------- | |
| -- although, it can get slow if we accidentally do non-selective things... | |
| -- | |
| taxi=# SELECT passenger_count, 100.0*count(*)/sum(count(*)) OVER () AS share | |
| FROM taxis where passenger_count > 5 group by passenger_count order by share desc; | |
| passenger_count | share | |
| -----------------+------------------------ | |
| 6 | 99.9848443514898002 | |
| 208 | 0.01515564851019975145 | |
| (2 rows) | |
| Time: 1091.904 ms | |
| -------------------------------------------------------------------------------- | |
| -- but the planner knows the difference! it takes just 5ms to figure out | |
| -- if a query on the main table is going to hose us. so we can check and | |
| -- only run big table queries where the "cost" is less than some threshold number | |
| -- and delegate upwards to sample tables for other queries | |
| -- | |
| taxi=# explain SELECT passenger_count, 100.0*count(*)/sum(count(*)) OVER () AS share | |
| FROM taxis where passenger_count > 5 group by passenger_count order by share desc; | |
| QUERY PLAN | |
| --------------------------------------------------------------------------------------------------------- | |
| Sort (cost=142670.45..142670.45 rows=1 width=4) | |
| Sort Key: (((100.0 * ((count(*)))::numeric) / sum((count(*))) OVER (?))) | |
| -> WindowAgg (cost=142670.40..142670.44 rows=1 width=4) | |
| -> HashAggregate (cost=142670.40..142670.42 rows=1 width=4) | |
| -> Bitmap Heap Scan on taxis (cost=1246.97..142172.47 rows=66391 width=4) | |
| Recheck Cond: (passenger_count > 5) | |
| -> Bitmap Index Scan on passenger_count_x (cost=0.00..1230.37 rows=66391 width=0) | |
| Index Cond: (passenger_count > 5) | |
| (8 rows) | |
| Time: 5.330 ms | |
| taxi=# explain SELECT passenger_count, 100.0*count(*)/sum(count(*)) OVER () AS share | |
| FROM taxis where passenger_count > 6 group by passenger_count order by share desc; | |
| QUERY PLAN | |
| ------------------------------------------------------------------------------------------------------ | |
| Sort (cost=8.51..8.51 rows=1 width=4) | |
| Sort Key: (((100.0 * ((count(*)))::numeric) / sum((count(*))) OVER (?))) | |
| -> WindowAgg (cost=0.43..8.50 rows=1 width=4) | |
| -> GroupAggregate (cost=0.43..8.48 rows=1 width=4) | |
| -> Index Only Scan using passenger_count_x on taxis (cost=0.43..8.45 rows=1 width=4) | |
| Index Cond: (passenger_count > 6) | |
| (6 rows) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment