Created
February 10, 2026 04:41
-
-
Save ar-puuk/164ab28e8cf48d5a3ba81eb5ad9faedd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(bench) | |
| library(dplyr) | |
| library(arrow) | |
| library(polars) | |
| library(duckdb) | |
| # 1. Define Paths | |
| csv_path <- "C:/Users/Pukar.Bhandari/Downloads/REMM_Parcels_2023.csv" | |
| parquet_path <- "C:/Users/Pukar.Bhandari/Downloads/REMM_Parcels_2023.parquet" | |
| # 2. Convert CSV to Parquet (One-time setup) | |
| if (!file.exists(parquet_path)) { | |
| message("Converting CSV to Parquet... this may take a minute.") | |
| df <- arrow::read_csv_arrow(csv_path) | |
| arrow::write_parquet(df, parquet_path) | |
| message("Conversion complete: ", parquet_path) | |
| } | |
| # 3. DuckDB Connection | |
| con <- dbConnect(duckdb()) | |
| # 4. Define the Polars Function (Fixed) | |
| # We use standard object$method() syntax instead of piping into non-existent pl$ functions | |
| do_polars_agg <- function(lazy_frame) { | |
| # Apply filter on the object 'lazy_frame' | |
| lf <- lazy_frame$filter( | |
| (pl$col("year_built") > 1900) & (pl$col("parcel_acres") > 0.01) | |
| ) | |
| # Apply groupby and aggregation | |
| lf <- lf$group_by("CO_NAME", "building_type")$agg( | |
| pl$len()$alias("Parcel_Count"), | |
| pl$col("parcel_acres")$sum()$alias("Total_Acres"), | |
| (pl$col("total_market_value")$sum() / pl$col("parcel_acres")$sum())$alias("Value_Per_Acre") | |
| ) | |
| # Execute | |
| lf$collect() | |
| } | |
| # 5. Run Benchmark | |
| results_io <- bench::mark( | |
| # --- System 1: Polars (CSV) --- | |
| polars_csv = { | |
| # Scan CSV -> Pass to function | |
| pl$scan_csv(csv_path) |> do_polars_agg() | |
| }, | |
| # --- System 2: Polars (Parquet) --- | |
| polars_parquet = { | |
| # Scan Parquet -> Pass to function | |
| pl$scan_parquet(parquet_path) |> do_polars_agg() | |
| }, | |
| # --- System 3: DuckDB (CSV) --- | |
| duckdb_csv = { | |
| query <- " | |
| SELECT | |
| CO_NAME, | |
| building_type, | |
| COUNT(*) as Parcel_Count, | |
| SUM(parcel_acres) as Total_Acres, | |
| SUM(total_market_value) / SUM(parcel_acres) as Value_Per_Acre | |
| FROM 'file_path_placeholder' | |
| WHERE year_built > 1900 AND parcel_acres > 0.01 | |
| GROUP BY CO_NAME, building_type | |
| " | |
| dbGetQuery(con, sub("file_path_placeholder", csv_path, query)) | |
| }, | |
| # --- System 4: DuckDB (Parquet) --- | |
| duckdb_parquet = { | |
| query <- " | |
| SELECT | |
| CO_NAME, | |
| building_type, | |
| COUNT(*) as Parcel_Count, | |
| SUM(parcel_acres) as Total_Acres, | |
| SUM(total_market_value) / SUM(parcel_acres) as Value_Per_Acre | |
| FROM read_parquet('file_path_placeholder') | |
| WHERE year_built > 1900 AND parcel_acres > 0.01 | |
| GROUP BY CO_NAME, building_type | |
| " | |
| dbGetQuery(con, sub("file_path_placeholder", parquet_path, query)) | |
| }, | |
| iterations = 10, | |
| check = FALSE | |
| ) | |
| # 6. Clean up | |
| dbDisconnect(con) | |
| # 7. Output | |
| print(results_io) | |
| plot(results_io) | |
| # 8. File Size Comparison | |
| cat("CSV Size: ", round(file.size(csv_path) / 1e6, 2), "MB\n") | |
| cat("Parquet Size: ", round(file.size(parquet_path) / 1e6, 2), "MB\n") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(bench) | |
| library(dplyr) | |
| library(readr) | |
| library(data.table) | |
| library(polars) | |
| library(duckdb) | |
| # 1. Path | |
| file_path <- "C:/Users/Pukar.Bhandari/Downloads/REMM_Parcels_2023.csv" | |
| # 2. DuckDB Connection | |
| con <- dbConnect(duckdb()) | |
| # 3. Benchmark | |
| results_agg <- bench::mark( | |
| # --- System 1: dplyr --- | |
| dplyr = { | |
| read_csv(file_path, show_col_types = FALSE, progress = FALSE) |> | |
| filter(year_built > 1900, parcel_acres > 0.01) |> | |
| group_by(CO_NAME, building_type) |> | |
| summarise( | |
| Parcel_Count = n(), | |
| Total_Acres = sum(parcel_acres, na.rm = TRUE), | |
| Value_Per_Acre = sum(total_market_value, na.rm = TRUE) / sum(parcel_acres, na.rm = TRUE), | |
| .groups = "drop" | |
| ) | |
| }, | |
| # --- System 2: data.table --- | |
| data.table = { | |
| # Read only necessary columns | |
| dt <- fread(file_path, select = c("CO_NAME", "building_type", "year_built", "parcel_acres", "total_market_value")) | |
| # Perform aggregation | |
| dt[year_built > 1900 & parcel_acres > 0.01, | |
| .(Parcel_Count = .N, | |
| Total_Acres = sum(parcel_acres, na.rm = TRUE), | |
| # Cast to numeric to avoid integer overflow warning | |
| Value_Per_Acre = sum(as.numeric(total_market_value), na.rm = TRUE) / sum(parcel_acres, na.rm = TRUE)), | |
| by = .(CO_NAME, building_type)] | |
| }, | |
| # --- System 3: polars --- | |
| polars = { | |
| # We define the plan first | |
| lf <- pl$scan_csv(file_path) | |
| # Filter | |
| lf <- lf$filter( | |
| (pl$col("year_built") > 1900) & (pl$col("parcel_acres") > 0.01) | |
| ) | |
| # Group and Agg | |
| # Note: explicitly constructing the list of expressions | |
| lf <- lf$group_by("CO_NAME", "building_type")$agg( | |
| pl$len()$alias("Parcel_Count"), | |
| pl$col("parcel_acres")$sum()$alias("Total_Acres"), | |
| (pl$col("total_market_value")$sum() / pl$col("parcel_acres")$sum())$alias("Value_Per_Acre") | |
| ) | |
| # Collect | |
| lf$collect() | |
| }, | |
| # --- System 4: duckdb --- | |
| duckdb = { | |
| # Define query template | |
| base_query <- " | |
| SELECT | |
| CO_NAME, | |
| building_type, | |
| COUNT(*) as Parcel_Count, | |
| SUM(parcel_acres) as Total_Acres, | |
| SUM(total_market_value) / SUM(parcel_acres) as Value_Per_Acre | |
| FROM 'file_path_placeholder' | |
| WHERE year_built > 1900 AND parcel_acres > 0.01 | |
| GROUP BY CO_NAME, building_type | |
| " | |
| # Inject path | |
| final_query <- sub("file_path_placeholder", file_path, base_query) | |
| # Execute | |
| dbGetQuery(con, final_query) | |
| }, | |
| iterations = 5, | |
| check = FALSE | |
| ) | |
| # 4. Clean up | |
| dbDisconnect(con) | |
| # 5. Output | |
| print(results_agg) | |
| plot(results_agg) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(bench) | |
| library(dplyr) | |
| library(readr) | |
| library(data.table) | |
| library(polars) | |
| library(duckdb) | |
| # 1. Define path (Using forward slashes to avoid escape character issues) | |
| file_path <- "C:/Users/Pukar.Bhandari/Downloads/REMM_Parcels_2023.csv" | |
| # 2. Setup DuckDB connection (needed for the query) | |
| con <- dbConnect(duckdb()) | |
| # 3. Run Benchmark | |
| # check = FALSE is used because the return types differ (integer vs double vs data.frame) | |
| results <- bench::mark( | |
| # System 1: dplyr / readr | |
| # Note: Standard read_csv reads the whole file into memory. | |
| # This will likely be the slowest and most memory-intensive. | |
| dplyr = { | |
| read_csv(file_path, show_col_types = FALSE, progress = FALSE) |> | |
| nrow() | |
| }, | |
| # System 2: data.table | |
| # fread is highly optimized for CSV reading. | |
| data.table = { | |
| fread(file_path) |> | |
| nrow() | |
| }, | |
| # System 3: polars | |
| # Uses lazy execution (scan_csv) to count without loading everything. | |
| polars = { | |
| pl$scan_csv(file_path)$select(pl$len())$collect() | |
| }, | |
| # System 4: duckdb | |
| # SQL query directly on the CSV file (very fast). | |
| duckdb = { | |
| dbGetQuery(con, paste0("SELECT count(*) FROM '", file_path, "'")) | |
| }, | |
| iterations = 5, # Low iteration count due to large file size | |
| check = FALSE | |
| ) | |
| # 4. Clean up | |
| dbDisconnect(con) | |
| # 5. View Results | |
| print(results) | |
| plot(results) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment