-- Tested on 2024 MBP M4 Max 128G RAM 16 cores (12P, 4E), DuckDB 1.1.3, Sequoia 15.1 -- Taxi data set: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page -- Inspired by: https://duckdb.org/2024/10/16/driving-csv-performance-benchmarking-duckdb-with-the-nyc-taxi-dataset.html -- taxi_data_2019: ~84M rows -- UNPIVOT (naive) -- ~5500ms (1.1.3) -- ~4500ms (1.2) -- ~330ms (2025/02/13 git HEAD) https://github.com/duckdb/duckdb/pull/16221 WITH locations AS ( UNPIVOT taxi_data_2019 ON pickup_location_id AS pickup, dropoff_location_id AS dropoff INTO NAME location_type VALUE location_id ) SELECT location_type, z.Borough, COUNT(*) AS cnt FROM locations l, zone_lookups z WHERE l.location_id = z.LocationID GROUP BY ALL ORDER BY 1, 3 DESC ; -- CTE + Unpivot ~90-100ms WITH d AS ( SELECT t.pickup_location_id, t.dropoff_location_id, COUNT(*) cnt FROM taxi_data_2019 t GROUP BY GROUPING SETS ((pickup_location_id), (dropoff_location_id)) ), d2 AS ( UNPIVOT d ON pickup_location_id AS pickup, dropoff_location_id AS dropoff INTO NAME l_type VALUE location_id ) SELECT l_type, z.Borough AS location, SUM(cnt) AS cnt FROM d2, zone_lookups z WHERE d2.location_id = z.LocationID GROUP BY ALL ORDER BY 1, 3 DESC; -- CTEs + UNION -- ~90-100ms WITH d AS ( SELECT t.pickup_location_id AS pickup, t.dropoff_location_id AS dropoff, COUNT(*) cnt FROM taxi_data_2019 t GROUP BY GROUPING SETS ((pickup_location_id), (dropoff_location_id)) ) , d1 AS ( SELECT 'pickup' AS l_type, pickup, cnt FROM d WHERE pickup IS NOT NULL ), d2 AS ( SELECT 'dropoff' AS l_type, dropoff, cnt FROM d WHERE dropoff IS NOT NULL ), d3 AS ( SELECT l_type, pickup AS location_id, cnt FROM d1 UNION ALL SELECT l_type, dropoff, cnt FROM d2 ) SELECT l_type, z.Borough AS location, SUM(cnt) AS cnt FROM d3, zone_lookups z WHERE d3.location_id = z.LocationID GROUP BY ALL ORDER BY 1, 3 DESC; -- GROUPING_ID ~90-100ms WITH d AS ( SELECT t.pickup_location_id, t.dropoff_location_id, GROUPING_ID(pickup_location_id, dropoff_location_id) AS gid, COUNT(*) cnt FROM taxi_data_2019 t GROUP BY GROUPING SETS ((pickup_location_id), (dropoff_location_id)) ), d1 AS ( SELECT gid, zp.Borough AS location, SUM(cnt) AS cnt FROM d, zone_lookups zp WHERE COALESCE(d.pickup_location_id, d.dropoff_location_id) = zp.LocationID GROUP BY ALL ) SELECT CASE gid WHEN 1 THEN 'pickup' ELSE 'dropoff' END AS l_type, location, cnt FROM d1 ORDER BY 1, 3 DESC;