Last active
February 13, 2025 23:20
-
-
Save xevix/bbdd266178827f085f94a96a91d1371d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Tested on 2024 MBP M4 Max 128G RAM 16 cores (12P, 4E), DuckDB 1.1.3, Sequoia 15.1 | |
-- Taxi data set: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page | |
-- Inspired by: https://duckdb.org/2024/10/16/driving-csv-performance-benchmarking-duckdb-with-the-nyc-taxi-dataset.html | |
-- taxi_data_2019: ~84M rows | |
-- UNPIVOT (naive) | |
-- ~5500ms (1.1.3) | |
-- ~4500ms (1.2) | |
-- ~330ms (2025/02/13 git HEAD) https://github.com/duckdb/duckdb/pull/16221 | |
WITH locations AS ( | |
UNPIVOT taxi_data_2019 | |
ON | |
pickup_location_id AS pickup, | |
dropoff_location_id AS dropoff | |
INTO | |
NAME location_type | |
VALUE location_id | |
) | |
SELECT | |
location_type, | |
z.Borough, | |
COUNT(*) AS cnt | |
FROM | |
locations l, | |
zone_lookups z | |
WHERE | |
l.location_id = z.LocationID | |
GROUP BY | |
ALL | |
ORDER BY | |
1, | |
3 DESC | |
; | |
-- CTE + Unpivot ~90-100ms | |
WITH d AS ( | |
SELECT | |
t.pickup_location_id, | |
t.dropoff_location_id, | |
COUNT(*) cnt | |
FROM | |
taxi_data_2019 t | |
GROUP BY | |
GROUPING SETS ((pickup_location_id), | |
(dropoff_location_id)) | |
), d2 AS ( | |
UNPIVOT d | |
ON pickup_location_id AS pickup, dropoff_location_id AS dropoff | |
INTO NAME l_type | |
VALUE location_id | |
) | |
SELECT l_type, z.Borough AS location, SUM(cnt) AS cnt | |
FROM d2, zone_lookups z | |
WHERE d2.location_id = z.LocationID | |
GROUP BY ALL | |
ORDER BY 1, 3 DESC; | |
-- CTEs + UNION | |
-- ~90-100ms | |
WITH d AS ( | |
SELECT | |
t.pickup_location_id AS pickup, | |
t.dropoff_location_id AS dropoff, | |
COUNT(*) cnt | |
FROM | |
taxi_data_2019 t | |
GROUP BY | |
GROUPING SETS ((pickup_location_id), | |
(dropoff_location_id)) | |
) | |
, d1 AS ( | |
SELECT | |
'pickup' AS l_type, | |
pickup, | |
cnt | |
FROM | |
d | |
WHERE | |
pickup IS NOT NULL | |
), | |
d2 AS ( | |
SELECT | |
'dropoff' AS l_type, | |
dropoff, | |
cnt | |
FROM | |
d | |
WHERE | |
dropoff IS NOT NULL | |
), d3 AS ( | |
SELECT l_type, pickup AS location_id, cnt | |
FROM | |
d1 | |
UNION ALL | |
SELECT l_type, dropoff, cnt | |
FROM | |
d2 | |
) | |
SELECT l_type, z.Borough AS location, SUM(cnt) AS cnt | |
FROM d3, zone_lookups z | |
WHERE d3.location_id = z.LocationID | |
GROUP BY ALL | |
ORDER BY 1, 3 DESC; | |
-- GROUPING_ID ~90-100ms | |
WITH d AS ( | |
SELECT | |
t.pickup_location_id, | |
t.dropoff_location_id, | |
GROUPING_ID(pickup_location_id, dropoff_location_id) AS gid, | |
COUNT(*) cnt | |
FROM | |
taxi_data_2019 t | |
GROUP BY | |
GROUPING SETS ((pickup_location_id), | |
(dropoff_location_id)) | |
), d1 AS ( | |
SELECT | |
gid, | |
zp.Borough AS location, | |
SUM(cnt) AS cnt | |
FROM | |
d, | |
zone_lookups zp | |
WHERE | |
COALESCE(d.pickup_location_id, | |
d.dropoff_location_id) = zp.LocationID | |
GROUP BY | |
ALL | |
) | |
SELECT | |
CASE | |
gid WHEN 1 THEN 'pickup' | |
ELSE 'dropoff' | |
END AS l_type, | |
location, | |
cnt | |
FROM | |
d1 | |
ORDER BY | |
1, | |
3 DESC; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment