JoshCheek · March 9, 2022 21:15
diff --git a/left_joins_gotcha_counting_number_of_associated_rows.sql b/left_joins_gotcha_counting_number_of_associated_rows.sql
 create table xs (id varchar primary key);
 create table ys (id serial primary key, x_id varchar); -- first value joined to x
 create table zs (id serial primary key, x_id varchar); -- second value joined to x

 -- x's ids are tuples of (number of associated ys, number of associated zs)
 insert into xs (id) values ('0,0'), ('1,0'), ('2,0'), ('2,1'), ('2,2');
 insert into ys (x_id) values ('1,0'), ('2,0'), ('2,0'), ('2,1'), ('2,1'), ('2,2'), ('2,2');
 insert into zs (x_id) values ('2,1'), ('2,2'), ('2,2');

 -- So, how does pg fill the rows in with the left join?
 -- it cross joins each row on the left with each row on the right, but if there
 -- are no rows on the right, it will still emit a row, with the right side set to null
 select xs.id x, coalesce(ys.id::text, 'NULL') y, coalesce(zs.id::text, 'NULL') z
 from xs
 left join ys on xs.id = ys.x_id
 left join zs on xs.id = zs.x_id
 order by xs.id, zs.id, ys.id;

  -- x  |  y   |  z
  -- -----+------+------
  -- 0,0 | NULL | NULL
  -- 1,0 | 1    | NULL
  -- 2,0 | 2    | NULL
  -- 2,0 | 3    | NULL
  -- 2,1 | 4    | 1
  -- 2,1 | 5    | 1
  -- 2,2 | 6    | 2
  -- 2,2 | 7    | 2
  -- 2,2 | 6    | 3
  -- 2,2 | 7    | 3
  -- (10 rows)


 -- It's important to know that, because this works for all the examples until
 -- you get to '2,2' and then it's suddenly incorrect! But ALSO, it would return
 -- the expected value, if there weren't a second left join!
 select
  xs.id,
  count(ys) as num_left,
  count(zs) as num_right,
  count(1) as num_rows
 from xs
 left join ys on xs.id = ys.x_id
 left join zs on xs.id = zs.x_id
 group by 1
 order by 1;

  -- id  | num_left | num_right | num_rows
  -- -----+----------+-----------+----------
  -- 0,0 |        0 |         0 |        1
  -- 1,0 |        1 |         0 |        1
  -- 2,0 |        2 |         0 |        2
  -- 2,1 |        2 |         2 |        2
  -- 2,2 |        4 |         4 |        4
  -- (5 rows)


 -- This is the correct way to do it (counting distinct values in RHS)
 select
  xs.id,
  count(distinct ys) as num_left,
  count(distinct zs) as num_right,
  count(1) as num_rows
 from xs
 left join ys on xs.id = ys.x_id
 left join zs on xs.id = zs.x_id
 group by 1
 order by 1;

  -- id  | num_left | num_right | num_rows
  -- -----+----------+-----------+----------
  -- 0,0 |        0 |         0 |        1
  -- 1,0 |        1 |         0 |        1
  -- 2,0 |        2 |         0 |        2
  -- 2,1 |        2 |         1 |        2
  -- 2,2 |        2 |         2 |        4
  -- (5 rows)
	create table xs (id varchar primary key);
	create table ys (id serial primary key, x_id varchar); -- first value joined to x
	create table zs (id serial primary key, x_id varchar); -- second value joined to x

	-- x's ids are tuples of (number of associated ys, number of associated zs)
	insert into xs (id) values ('0,0'), ('1,0'), ('2,0'), ('2,1'), ('2,2');
	insert into ys (x_id) values ('1,0'), ('2,0'), ('2,0'), ('2,1'), ('2,1'), ('2,2'), ('2,2');
	insert into zs (x_id) values ('2,1'), ('2,2'), ('2,2');

	-- So, how does pg fill the rows in with the left join?
	-- it cross joins each row on the left with each row on the right, but if there
	-- are no rows on the right, it will still emit a row, with the right side set to null
	select xs.id x, coalesce(ys.id::text, 'NULL') y, coalesce(zs.id::text, 'NULL') z
	from xs
	left join ys on xs.id = ys.x_id
	left join zs on xs.id = zs.x_id
	order by xs.id, zs.id, ys.id;

	-- x \| y \| z
	-- -----+------+------
	-- 0,0 \| NULL \| NULL
	-- 1,0 \| 1 \| NULL
	-- 2,0 \| 2 \| NULL
	-- 2,0 \| 3 \| NULL
	-- 2,1 \| 4 \| 1
	-- 2,1 \| 5 \| 1
	-- 2,2 \| 6 \| 2
	-- 2,2 \| 7 \| 2
	-- 2,2 \| 6 \| 3
	-- 2,2 \| 7 \| 3
	-- (10 rows)


	-- It's important to know that, because this works for all the examples until
	-- you get to '2,2' and then it's suddenly incorrect! But ALSO, it would return
	-- the expected value, if there weren't a second left join!
	select
	xs.id,
	count(ys) as num_left,
	count(zs) as num_right,
	count(1) as num_rows
	from xs
	left join ys on xs.id = ys.x_id
	left join zs on xs.id = zs.x_id
	group by 1
	order by 1;

	-- id \| num_left \| num_right \| num_rows
	-- -----+----------+-----------+----------
	-- 0,0 \| 0 \| 0 \| 1
	-- 1,0 \| 1 \| 0 \| 1
	-- 2,0 \| 2 \| 0 \| 2
	-- 2,1 \| 2 \| 2 \| 2
	-- 2,2 \| 4 \| 4 \| 4
	-- (5 rows)


	-- This is the correct way to do it (counting distinct values in RHS)
	select
	xs.id,
	count(distinct ys) as num_left,
	count(distinct zs) as num_right,
	count(1) as num_rows
	from xs
	left join ys on xs.id = ys.x_id
	left join zs on xs.id = zs.x_id
	group by 1
	order by 1;

	-- id \| num_left \| num_right \| num_rows
	-- -----+----------+-----------+----------
	-- 0,0 \| 0 \| 0 \| 1
	-- 1,0 \| 1 \| 0 \| 1
	-- 2,0 \| 2 \| 0 \| 2
	-- 2,1 \| 2 \| 1 \| 2
	-- 2,2 \| 2 \| 2 \| 4
	-- (5 rows)