Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save JoshCheek/aac5ef253897a636acc63d433650cbd4 to your computer and use it in GitHub Desktop.
Save JoshCheek/aac5ef253897a636acc63d433650cbd4 to your computer and use it in GitHub Desktop.
Left join's "gotcha": Counting number of associated rows
create table xs (id varchar primary key);
create table ys (id serial primary key, x_id varchar); -- first value joined to x
create table zs (id serial primary key, x_id varchar); -- second value joined to x
-- x's ids are tuples of (number of associated ys, number of associated zs)
insert into xs (id) values ('0,0'), ('1,0'), ('2,0'), ('2,1'), ('2,2');
insert into ys (x_id) values ('1,0'), ('2,0'), ('2,0'), ('2,1'), ('2,1'), ('2,2'), ('2,2');
insert into zs (x_id) values ('2,1'), ('2,2'), ('2,2');
-- So, how does pg fill the rows in with the left join?
-- it cross joins each row on the left with each row on the right, but if there
-- are no rows on the right, it will still emit a row, with the right side set to null
select xs.id x, coalesce(ys.id::text, 'NULL') y, coalesce(zs.id::text, 'NULL') z
from xs
left join ys on xs.id = ys.x_id
left join zs on xs.id = zs.x_id
order by xs.id, zs.id, ys.id;
-- x | y | z
-- -----+------+------
-- 0,0 | NULL | NULL
-- 1,0 | 1 | NULL
-- 2,0 | 2 | NULL
-- 2,0 | 3 | NULL
-- 2,1 | 4 | 1
-- 2,1 | 5 | 1
-- 2,2 | 6 | 2
-- 2,2 | 7 | 2
-- 2,2 | 6 | 3
-- 2,2 | 7 | 3
-- (10 rows)
-- It's important to know that, because this works for all the examples until
-- you get to '2,2' and then it's suddenly incorrect! But ALSO, it would return
-- the expected value, if there weren't a second left join!
select
xs.id,
count(ys) as num_left,
count(zs) as num_right,
count(1) as num_rows
from xs
left join ys on xs.id = ys.x_id
left join zs on xs.id = zs.x_id
group by 1
order by 1;
-- id | num_left | num_right | num_rows
-- -----+----------+-----------+----------
-- 0,0 | 0 | 0 | 1
-- 1,0 | 1 | 0 | 1
-- 2,0 | 2 | 0 | 2
-- 2,1 | 2 | 2 | 2
-- 2,2 | 4 | 4 | 4
-- (5 rows)
-- This is the correct way to do it (counting distinct values in RHS)
select
xs.id,
count(distinct ys) as num_left,
count(distinct zs) as num_right,
count(1) as num_rows
from xs
left join ys on xs.id = ys.x_id
left join zs on xs.id = zs.x_id
group by 1
order by 1;
-- id | num_left | num_right | num_rows
-- -----+----------+-----------+----------
-- 0,0 | 0 | 0 | 1
-- 1,0 | 1 | 0 | 1
-- 2,0 | 2 | 0 | 2
-- 2,1 | 2 | 1 | 2
-- 2,2 | 2 | 2 | 4
-- (5 rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment