Why are these wildly different?
WITH
gocd_events as (
SELECT * FROM `di-dev-mtr--buck-5.devinfra_metrics.gocd_events`
)
SELECT *
FROM
`gocd_events` as log,
`gocd_events` as pipeline
WHERE
STARTS_WITH(log.gcs_prefix, pipeline.gcs_prefix) AND
log.pipeline_name is NULL AND
pipeline.pipeline_name is NOT NULL
limit 10
Elapsed time 608 ms Slot time consumed 20 sec Bytes shuffled 131.83 MB
SELECT *
FROM
`gocd_events` as log
LEFT OUTER JOIN `gocd_events` as pipeline
ON
STARTS_WITH(log.gcs_prefix, pipeline.gcs_prefix) AND
log.pipeline_name is NULL AND
pipeline.pipeline_name is NOT NULL
limit 10
Elapsed time 16 sec Slot time consumed 7 hr 46 min Bytes shuffled 4.51 GB
WITH
gocd_events as (
SELECT * FROM `di-dev-mtr--buck-5.devinfra_metrics.gocd_events`
)
SELECT *
FROM (
SELECT * from `gocd_events`
WHERE pipeline_name is NULL
) as log
LEFT OUTER JOIN (
SELECT * from `gocd_events`
WHERE pipeline_name is NOT NULL
) as pipeline
ON
STARTS_WITH(log.gcs_prefix, pipeline.gcs_prefix)
limit 10
Elapsed time 4 sec Slot time consumed 1 hr 54 min Bytes shuffled 2.11 GB
There's zero records with null gcs_prefix so I'd expect them to be exactly equal.
I do want the left-join behavior though -- I want to see logs that have no matching pipeline.