-- IndexingWorkshop-DSS2024.sql
-- Indexing workshop script (YugabyteDB with PostgreSQL-compatible YSQL).
-- Gist by @FranckPachot, created November 12, 2024:
-- https://gist.github.com/FranckPachot/7b978f9aa7dc013e989b454be015d078
# start a YugabyteDB node
# (single-node cluster; --enable_pg_parity_tech_preview enables the
#  PostgreSQL-parity features used throughout this workshop)
docker run -d --name yb yugabytedb/yugabyte:2.23.0.0-b710 \
yugabyted start --enable_pg_parity_tech_preview --background=false
# start PostgreSQL client
# (linked to the yb container; YugabyteDB's YSQL endpoint listens on 5433)
docker run --rm -it --link yb:yb \
-e PGHOST=yb -e PGPORT=5433 -e PGUSER=yugabyte postgres psql
select version();
/*
Start here on arm
*/
-- silence NOTICE chatter and disable the pager for uninterrupted demo output
set client_min_messages to error;
\pset pager off
select version();
-- demo table: UUID primary key, low-cardinality int predicate columns
-- (col1, col2, col4) and random text payload columns (col3, col5, col6)
drop table if exists large_table ;
create extension if not exists pgcrypto;
create table large_table (
id uuid primary key default gen_random_uuid()
,col1 int not null
,col2 int not null
,col3 text
,col4 int not null
,col5 text
,col6 text
);
-- load 1M rows: col1 in 0..3, col2 in 0..50, col4 in 0..5, md5 text payloads
insert into large_table (col1, col2, col3, col4, col5, col6)
select
(random() * 3)::int,
(random() * 50)::int,
md5(random()::text),
(random() * 5)::int,
md5(random()::text),
md5(random()::text)
from generate_series(1, 1000000);
-- refresh optimizer statistics after the bulk load
analyze large_table ;
-- the workshop's reference query: equality on col1, range on col2,
-- IN-list on col4, ORDER BY col2 asc / col3 desc, first 10 rows only
prepare query as
select col4, col5, col3, col6
from large_table
where col1 = $1
and col2 between $2 and $3
and col4 in ( 1, 3, 5 )
order by col2 asc, col3 desc
limit 10;
execute query(1,24,42);
-- "dist" adds YugabyteDB's distributed storage read/write request metrics
explain (analyze, costs off, dist , summary off)
execute query(1,24,42);
-- Concepts
-- single-column range index on the equality column
create index on large_table (col1 asc);
explain (analyze, costs off, summary off)
execute query(1,24,42);
-- re-prepare with a pg_hint_plan hint: noseqscan forbids a Seq Scan on
-- large_table so the planner must pick an index access path
deallocate query;
prepare query as /*+ noseqscan(large_table) */
select col4, col5, col3, col6
from large_table
where col1 = $1
and col2 between $2 and $3
and col4 in ( 1, 3, 5 )
order by col2 asc, col3 desc
limit 10;
explain (analyze, costs off, summary off, dist)
execute query(1,24,42);
-- add the range column: (col1, col2) lets the scan use both predicates
create index on large_table (col1 asc, col2 asc);
explain (analyze, costs on, summary off, dist)
execute query(1,24,42);
-- add col3 desc so the key order matches ORDER BY col2 asc, col3 desc
create index on large_table (col1 asc, col2 asc, col3 desc);
explain (analyze, costs off, summary off, dist)
execute query(1,24,42);
-- INCLUDE col4 so the remaining filter column is stored in the index entry
create index on large_table (col1 asc, col2 asc, col3 desc) include (col4);
explain (analyze, costs off, summary off, dist)
execute query(1,24,42);
explain (analyze, costs on, summary off, dist, verbose)
execute query(1,24,42);
-- cover every selected column to enable an Index Only Scan
create index on large_table (col1 asc, col2 asc, col3 desc) include (col4, col5, col6);
explain (analyze, costs off, summary off, dist, verbose)
execute query(1,24,42);
-- Sounds too good?
\d large_table
-- write amplification: updating a key column (col1) must maintain every
-- index whose key contains it...
explain (analyze, costs off, summary on, dist)
update large_table set col1=4 where col1=2 and col2=1;
-- ...while updating col4 only maintains the indexes that store col4
explain (analyze, costs off, summary on, dist)
update large_table set col4=4 where col1=2 and col2=2;
-- drop the experiment indexes before the next section
drop index large_table_col1_col2_col3_col4_col5_col6_idx;
drop index large_table_col1_col2_col3_col4_idx;
drop index large_table_col1_col2_idx;
drop index large_table_col1_col2_col3_idx;
-- Trade-offs
-- put col4 into the key: serves other queries (e.g. ORDER BY col4 below)
-- at the cost of the pre-sorted (col2, col3) order for the reference query
create index on large_table (col1 asc, col4 asc, col2 asc, col3 asc ) include (col5, col6);
explain (analyze, costs off, summary off, dist, verbose)
execute query(1,24,42);
explain (analyze, dist, costs off, summary off)
select col4, col5, col3, col6
from large_table
where col1 = 1
and col2 between 24 and 42
and col4 in ( 1, 3, 5 )
order by col4
limit 10;
-- partial index: bake "col4 in (1,3,5)" into the index WHERE clause so the
-- key can stay (col1, col2, col3 desc) and still avoid a filter step
create index on large_table (col1 asc, col2 asc, col3 desc)
include (col4, col5, col6)
where col4 in (1,3,5);
explain (analyze, costs on, summary off, dist, verbose)
execute query(1,24,42);
--> What about the query planner?
-- per-column statistics for the filter columns: n_distinct and MCV lists
select attname, n_distinct, most_common_vals, most_common_freqs
from pg_stats where tablename ='large_table' and attname in ('col1','col4');
-- index usage counters: which indexes are actually being scanned?
select indexrelname,idx_scan,idx_tup_read,idx_tup_fetch from pg_stat_user_indexes
where relname ='large_table';
-- extended (multi-column) statistics to capture cross-column correlation
create statistics col5_col6 on col5, col6 from large_table;
-- fixed: the original also listed col3 here, which contradicted both the
-- statistics name (col1_col2_col4) and the query's predicate columns
-- (col1, col2, col4 -- col3 only appears in ORDER BY)
create statistics col1_col2_col4 on col1, col2, col4 from large_table;
analyze large_table;
-- inspect the collected extended statistics (dependencies, n-distinct)
select stxname, stxkeys, stxddependencies, stxdndistinct
from pg_statistic_ext join pg_statistic_ext_data on (oid = stxoid);
--------------------------------------------------------------------------------------------------------------------
-- A complete Example
--drop schema if exists demo cascade;
create schema if not exists demo;
-- events table: rows are accessed by system_id, filtered on event_type and
-- state, and returned in created_at order
create table demo.events (
id uuid not null,
system_id uuid not null,
event_type varchar(100) not null,
created_at timestamptz not null,
updated_at timestamptz not null,
state varchar(100) not null,
customer_id uuid,
device uuid,
primary key (id)
);
-- naive starting point: index only the access column
create index events_system_id on demo.events
(system_id);
-- execution plan
-- \c reconnects, resetting session state (prepared statements, settings)
\c
\d demo.events
select pg_get_indexdef('demo.events_system_id'::regclass::oid);
-- reference query for this section: one system_id, IN-lists on event_type
-- and state, ordered by created_at, first 100 rows
-- (the pg_hint_plan hint forces the events_system_id index)
explain (costs off)
/*+ indexscan(events events_system_id) */
select *
from demo.events
where events.system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b'
and (
events.event_type in (
'onboarding',
'monitoring',
'transferring'
)
and
events.state in (
'event_not_started',
'event_in_progress',
'event_monitoring_not_started'
)
)
order by events.created_at asc limit 100;
-- testing with data
create extension if not exists pgcrypto;
--- use non-transactional writes for fast bulk load in yugabytedb
set yb_disable_transactional_writes=on;
-- 1M rows; random()>0.8 makes ~20% of rows share the hot system_id used by
-- the reference query; event_type has 5 values, state has 10
insert into demo.events (id, system_id, event_type, created_at, updated_at, state, customer_id, device)
select
gen_random_uuid(), -- id
case when random()>0.8 then '0188e8d7-dead-73c1-b218-7cf069921b5b' else gen_random_uuid() end, -- system_id
(array['onboarding', 'monitoring', 'transferring', 'maintenance', 'upgrade'])[floor(random() * 5 + 1)], -- event_type
now() - interval '1 year' * random(), -- created_at (random time within the last year)
now() - interval '1 year' * random(), -- updated_at (random time within the last year)
(array['event_not_started', 'event_in_progress', 'event_monitoring_not_started', 'event_monitoring_in_progress', 'event_payment_screening_not_started', 'event_payment_screening_in_progress', 'event_transferring_not_started', 'event_transferring_in_progress', 'event_completed', 'event_cancelled'])[floor(random() * 10 + 1)], -- state
gen_random_uuid(), -- customer_id
gen_random_uuid() -- device
from generate_series(1, 1000000);
--- reset the transactional writes to be fully acid
set yb_disable_transactional_writes=off;
analyze demo.events;
-- explain analyze
-- baseline with real data: the single-column index returns every row of the
-- hot system_id, which is then filtered and sorted before the LIMIT
explain (analyze, dist, verbose, costs off)
/*+ indexscan(events events_system_id) */
select *
from demo.events
where events.system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b'
and (
events.event_type in (
'onboarding',
'monitoring',
'transferring'
)
and
events.state in (
'event_not_started',
'event_in_progress',
'event_monitoring_not_started'
)
)
order by events.created_at asc limit 100;
-- access by single "system_id" and multi-value filter on "state" and "event_type" by query xyz
create index i1_ise on demo.events (system_id, state, event_type);
-- fixed: the terminator above was commented out ("--;"), which concatenated
-- the CREATE INDEX with the next SELECT into one invalid statement
select pg_get_indexdef('demo.i1_ise'::regclass::oid);
explain (analyze, dist, verbose, costs off)
/*+ indexscan(events i1_ise ) */
select *
from demo.events
where events.system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b'
and (
events.event_type in (
'onboarding',
'monitoring',
'transferring'
)
and
events.state in (
'event_not_started',
'event_in_progress',
'event_monitoring_not_started'
)
)
order by events.created_at asc limit 100;
-- index condition: access, skip scan, and filter
-- count with only the access condition (system_id = ...): pure index access
explain (analyze, dist, debug, verbose, costs off, summary off)
/*+ indexscan(events i1_ise ) */
select count(*) from demo.events
where events.system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b';
-- same count with the IN-lists: the debug output shows how state/event_type
-- are applied (skip scan vs. filter) within the index
explain (analyze, dist, debug, verbose, costs off, summary off)
/*+ indexscan(events i1_ise ) */
select count(*) from demo.events
where events.system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b'
and (
events.event_type in (
'onboarding',
'monitoring',
'transferring'
)
and
events.state in (
'event_not_started',
'event_in_progress',
'event_monitoring_not_started'
)
)
;
-- Break down the hot system_id by (state, event_type): the "keep" column
-- shows which combinations satisfy the reference query's filters (✅) and
-- which are discarded (❌), each with its row count.
select
  case
    when events.system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b'
         and (
           events.event_type in ( 'onboarding', 'monitoring', 'transferring')
           and
           events.state in ( 'event_not_started', 'event_in_progress', 'event_monitoring_not_started')
         )
    then '✅'
    else '❌'
  end as keep,
  system_id, state, event_type,
  count(*)
from demo.events
where system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b'
group by system_id, state, event_type
order by system_id, state, event_type
;
-- partial index
-- move the IN-list filters into the index's WHERE clause: only matching rows
-- are indexed, so a scan by system_id returns only qualifying entries
create index i1_i_partial on demo.events (system_id)
where (
events.event_type in (
'onboarding',
'monitoring',
'transferring'
)
and
events.state in (
'event_not_started',
'event_in_progress',
'event_monitoring_not_started'
)
);
-- fixed: the terminator above was commented out ("--;"), which concatenated
-- the CREATE INDEX with the next SELECT into one invalid statement
select pg_get_indexdef('demo.i1_i_partial'::regclass::oid);
explain (analyze, dist, debug, verbose, costs off, summary off)
/*+ indexscan(events i1_i_partial ) */
select count(*) from demo.events
where events.system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b'
and (
events.event_type in (
'onboarding',
'monitoring',
'transferring'
)
and
events.state in (
'event_not_started',
'event_in_progress',
'event_monitoring_not_started'
)
)
;
-- plan with the partial index: fewer rows to fetch, but the result still has
-- to be sorted by created_at before the LIMIT
explain (analyze, dist, verbose, costs off)
/*+ indexscan(events i1_i_partial ) */
select *
from demo.events
where events.system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b'
and (
events.event_type in (
'onboarding',
'monitoring',
'transferring'
)
and
events.state in (
'event_not_started',
'event_in_progress',
'event_monitoring_not_started'
)
)
order by events.created_at asc limit 100;
-- fetch size
-- yb_fetch_row_limit caps the rows per distributed read request; raising it
-- reduces the number of round trips for a large scan
\dconfig yb_fetch*
set yb_fetch_row_limit = 100000;
explain (analyze, dist, verbose, costs off)
/*+ indexscan(events i1_i_partial ) */
select *
from demo.events
where events.system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b'
and (
events.event_type in (
'onboarding',
'monitoring',
'transferring'
)
and
events.state in (
'event_not_started',
'event_in_progress',
'event_monitoring_not_started'
)
)
order by events.created_at asc limit 100;
-- order-preserving index
-- created_at right after system_id preserves ORDER BY created_at, so the
-- scan can stop as soon as the LIMIT is satisfied; state and event_type are
-- checked within the index entries
create index i1_icse
on demo.events (system_id, created_at, state, event_type);
-- fixed: the terminator above was commented out ("--;"), which concatenated
-- the CREATE INDEX with the next SELECT into one invalid statement
select pg_get_indexdef('demo.i1_icse'::regclass::oid);
explain (analyze, dist, verbose, costs off)
/*+ indexscan(events i1_icse ) */
select *
from demo.events
where events.system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b'
and (
events.event_type in (
'onboarding',
'monitoring',
'transferring'
)
and
events.state in (
'event_not_started',
'event_in_progress',
'event_monitoring_not_started'
)
)
order by events.created_at asc limit 100;
-- index only scan
-- INCLUDE the remaining columns so "select *" is answered from the index
-- alone, with no fetch from the table
create index i1_icse_covering
on demo.events (system_id, created_at, state, event_type)
include (id, updated_at, customer_id, device);
-- fixed: the terminator above was commented out ("--;"), which concatenated
-- the CREATE INDEX with the next SELECT into one invalid statement
select pg_get_indexdef('demo.i1_icse_covering'::regclass::oid);
explain (analyze, dist, debug, verbose, costs off)
/*+ indexscan(events i1_icse_covering ) */
select *
from demo.events
where events.system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b'
and (
events.event_type in (
'onboarding',
'monitoring',
'transferring'
)
and
events.state in (
'event_not_started',
'event_in_progress',
'event_monitoring_not_started'
)
)
order by events.created_at asc limit 100;
-- the best index for this query
-- combines everything seen so far: partial (IN-lists in WHERE),
-- order-preserving (system_id, created_at) key, and covering
-- (INCLUDE all remaining selected columns)
create index i1_icse_best on demo.events (system_id, created_at) include (id, updated_at, customer_id, device, event_type, state)
where (
events.event_type in (
'onboarding',
'monitoring',
'transferring'
)
and
events.state in (
'event_not_started',
'event_in_progress',
'event_monitoring_not_started'
)
);
-- fixed: the terminator above was commented out ("--;"), which concatenated
-- the CREATE INDEX with the next SELECT into one invalid statement
select pg_get_indexdef('demo.i1_icse_best'::regclass::oid);
explain (analyze, dist, debug, verbose, costs off)
/*+ indexscan(events i1_icse_best ) */
select *
from demo.events
where events.system_id = '0188e8d7-dead-73c1-b218-7cf069921b5b'
and (
events.event_type in (
'onboarding',
'monitoring',
'transferring'
)
and
events.state in (
'event_not_started',
'event_in_progress',
'event_monitoring_not_started'
)
)
order by events.created_at asc limit 100;
-- End of IndexingWorkshop-DSS2024.sql