Skip to content

Instantly share code, notes, and snippets.

@xevix
Last active February 13, 2025 23:20
Show Gist options
  • Save xevix/bbdd266178827f085f94a96a91d1371d to your computer and use it in GitHub Desktop.
Save xevix/bbdd266178827f085f94a96a91d1371d to your computer and use it in GitHub Desktop.
-- Tested on 2024 MBP M4 Max 128G RAM 16 cores (12P, 4E), DuckDB 1.1.3, Sequoia 15.1
-- Taxi data set: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
-- Inspired by: https://duckdb.org/2024/10/16/driving-csv-performance-benchmarking-duckdb-with-the-nyc-taxi-dataset.html
-- taxi_data_2019: ~84M rows
-- UNPIVOT (naive)
-- ~5500ms (1.1.3)
-- ~4500ms (1.2)
-- ~330ms (2025/02/13 git HEAD) https://github.com/duckdb/duckdb/pull/16221
WITH locations AS (
UNPIVOT taxi_data_2019
ON
pickup_location_id AS pickup,
dropoff_location_id AS dropoff
INTO
NAME location_type
VALUE location_id
)
SELECT
location_type,
z.Borough,
COUNT(*) AS cnt
FROM
locations l,
zone_lookups z
WHERE
l.location_id = z.LocationID
GROUP BY
ALL
ORDER BY
1,
3 DESC
;
-- CTE + Unpivot ~90-100ms
WITH d AS (
SELECT
t.pickup_location_id,
t.dropoff_location_id,
COUNT(*) cnt
FROM
taxi_data_2019 t
GROUP BY
GROUPING SETS ((pickup_location_id),
(dropoff_location_id))
), d2 AS (
UNPIVOT d
ON pickup_location_id AS pickup, dropoff_location_id AS dropoff
INTO NAME l_type
VALUE location_id
)
SELECT l_type, z.Borough AS location, SUM(cnt) AS cnt
FROM d2, zone_lookups z
WHERE d2.location_id = z.LocationID
GROUP BY ALL
ORDER BY 1, 3 DESC;
-- CTEs + UNION
-- ~90-100ms
WITH d AS (
SELECT
t.pickup_location_id AS pickup,
t.dropoff_location_id AS dropoff,
COUNT(*) cnt
FROM
taxi_data_2019 t
GROUP BY
GROUPING SETS ((pickup_location_id),
(dropoff_location_id))
)
, d1 AS (
SELECT
'pickup' AS l_type,
pickup,
cnt
FROM
d
WHERE
pickup IS NOT NULL
),
d2 AS (
SELECT
'dropoff' AS l_type,
dropoff,
cnt
FROM
d
WHERE
dropoff IS NOT NULL
), d3 AS (
SELECT l_type, pickup AS location_id, cnt
FROM
d1
UNION ALL
SELECT l_type, dropoff, cnt
FROM
d2
)
SELECT l_type, z.Borough AS location, SUM(cnt) AS cnt
FROM d3, zone_lookups z
WHERE d3.location_id = z.LocationID
GROUP BY ALL
ORDER BY 1, 3 DESC;
-- GROUPING_ID ~90-100ms
WITH d AS (
SELECT
t.pickup_location_id,
t.dropoff_location_id,
GROUPING_ID(pickup_location_id, dropoff_location_id) AS gid,
COUNT(*) cnt
FROM
taxi_data_2019 t
GROUP BY
GROUPING SETS ((pickup_location_id),
(dropoff_location_id))
), d1 AS (
SELECT
gid,
zp.Borough AS location,
SUM(cnt) AS cnt
FROM
d,
zone_lookups zp
WHERE
COALESCE(d.pickup_location_id,
d.dropoff_location_id) = zp.LocationID
GROUP BY
ALL
)
SELECT
CASE
gid WHEN 1 THEN 'pickup'
ELSE 'dropoff'
END AS l_type,
location,
cnt
FROM
d1
ORDER BY
1,
3 DESC;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment