Last active
March 9, 2022 21:15
-
-
Save JoshCheek/aac5ef253897a636acc63d433650cbd4 to your computer and use it in GitHub Desktop.
Left join's "gotcha": Counting number of associated rows
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| create table xs (id varchar primary key); | |
| create table ys (id serial primary key, x_id varchar); -- first value joined to x | |
| create table zs (id serial primary key, x_id varchar); -- second value joined to x | |
| -- x's ids are tuples of (number of associated ys, number of associated zs) | |
| insert into xs (id) values ('0,0'), ('1,0'), ('2,0'), ('2,1'), ('2,2'); | |
| insert into ys (x_id) values ('1,0'), ('2,0'), ('2,0'), ('2,1'), ('2,1'), ('2,2'), ('2,2'); | |
| insert into zs (x_id) values ('2,1'), ('2,2'), ('2,2'); | |
| -- So, how does pg fill the rows in with the left join? | |
| -- it cross joins each row on the left with each row on the right, but if there | |
| -- are no rows on the right, it will still emit a row, with the right side set to null | |
| select xs.id x, coalesce(ys.id::text, 'NULL') y, coalesce(zs.id::text, 'NULL') z | |
| from xs | |
| left join ys on xs.id = ys.x_id | |
| left join zs on xs.id = zs.x_id | |
| order by xs.id, zs.id, ys.id; | |
| -- x | y | z | |
| -- -----+------+------ | |
| -- 0,0 | NULL | NULL | |
| -- 1,0 | 1 | NULL | |
| -- 2,0 | 2 | NULL | |
| -- 2,0 | 3 | NULL | |
| -- 2,1 | 4 | 1 | |
| -- 2,1 | 5 | 1 | |
| -- 2,2 | 6 | 2 | |
| -- 2,2 | 7 | 2 | |
| -- 2,2 | 6 | 3 | |
| -- 2,2 | 7 | 3 | |
| -- (10 rows) | |
| -- It's important to know that, because this works for all the examples until | |
| -- you get to '2,2' and then it's suddenly incorrect! But ALSO, it would return | |
| -- the expected value, if there weren't a second left join! | |
| select | |
| xs.id, | |
| count(ys) as num_left, | |
| count(zs) as num_right, | |
| count(1) as num_rows | |
| from xs | |
| left join ys on xs.id = ys.x_id | |
| left join zs on xs.id = zs.x_id | |
| group by 1 | |
| order by 1; | |
| -- id | num_left | num_right | num_rows | |
| -- -----+----------+-----------+---------- | |
| -- 0,0 | 0 | 0 | 1 | |
| -- 1,0 | 1 | 0 | 1 | |
| -- 2,0 | 2 | 0 | 2 | |
| -- 2,1 | 2 | 2 | 2 | |
| -- 2,2 | 4 | 4 | 4 | |
| -- (5 rows) | |
| -- This is the correct way to do it (counting distinct values in RHS) | |
| select | |
| xs.id, | |
| count(distinct ys) as num_left, | |
| count(distinct zs) as num_right, | |
| count(1) as num_rows | |
| from xs | |
| left join ys on xs.id = ys.x_id | |
| left join zs on xs.id = zs.x_id | |
| group by 1 | |
| order by 1; | |
| -- id | num_left | num_right | num_rows | |
| -- -----+----------+-----------+---------- | |
| -- 0,0 | 0 | 0 | 1 | |
| -- 1,0 | 1 | 0 | 1 | |
| -- 2,0 | 2 | 0 | 2 | |
| -- 2,1 | 2 | 1 | 2 | |
| -- 2,2 | 2 | 2 | 4 | |
| -- (5 rows) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment