Skip to content

Instantly share code, notes, and snippets.

@ar-puuk
Created February 10, 2026 04:41
Show Gist options
  • Select an option
  • Save ar-puuk/164ab28e8cf48d5a3ba81eb5ad9faedd to your computer and use it in GitHub Desktop.
library(bench)
library(dplyr)
library(arrow)
library(polars)
library(duckdb)
# 1. Define Paths ----
csv_path <- "C:/Users/Pukar.Bhandari/Downloads/REMM_Parcels_2023.csv"
parquet_path <- "C:/Users/Pukar.Bhandari/Downloads/REMM_Parcels_2023.parquet"

# 2. Convert CSV to Parquet (one-time setup) ----
# Only materialize the Parquet copy if it is not already on disk.
if (!file.exists(parquet_path)) {
  message("Converting CSV to Parquet... this may take a minute.")
  parcels <- arrow::read_csv_arrow(csv_path)
  arrow::write_parquet(parcels, parquet_path)
  message("Conversion complete: ", parquet_path)
}

# 3. DuckDB Connection ----
con <- dbConnect(duckdb())
# 4. Define the Polars Function (Fixed) ----
# Aggregate a polars LazyFrame of parcels: keep rows with year_built > 1900
# and parcel_acres > 0.01, then summarise parcel count, total acreage, and
# market value per acre by county and building type.
# Returns the collected (materialized) polars DataFrame.
do_polars_agg <- function(lazy_frame) {
  lazy_frame$
    filter(
      (pl$col("year_built") > 1900) & (pl$col("parcel_acres") > 0.01)
    )$
    group_by("CO_NAME", "building_type")$
    agg(
      pl$len()$alias("Parcel_Count"),
      pl$col("parcel_acres")$sum()$alias("Total_Acres"),
      (pl$col("total_market_value")$sum() / pl$col("parcel_acres")$sum())$alias("Value_Per_Acre")
    )$
    collect()
}
# 5. Run Benchmark ----
# Shared DuckDB aggregation template: the two DuckDB branches previously
# duplicated this SQL verbatim; the FROM clause is now injected per run.
# sub(..., fixed = TRUE) treats pattern and replacement literally, so a
# Windows path containing backslashes is not mangled by regex escaping.
duckdb_agg_query <- "
SELECT
    CO_NAME,
    building_type,
    COUNT(*) as Parcel_Count,
    SUM(parcel_acres) as Total_Acres,
    SUM(total_market_value) / SUM(parcel_acres) as Value_Per_Acre
FROM %SOURCE%
WHERE year_built > 1900 AND parcel_acres > 0.01
GROUP BY CO_NAME, building_type
"

results_io <- bench::mark(
  # --- System 1: Polars (CSV) ---
  polars_csv = {
    # Lazy scan of the CSV -> shared aggregation pipeline
    pl$scan_csv(csv_path) |> do_polars_agg()
  },
  # --- System 2: Polars (Parquet) ---
  polars_parquet = {
    pl$scan_parquet(parquet_path) |> do_polars_agg()
  },
  # --- System 3: DuckDB (CSV) ---
  duckdb_csv = {
    dbGetQuery(
      con,
      sub("%SOURCE%", paste0("'", csv_path, "'"), duckdb_agg_query, fixed = TRUE)
    )
  },
  # --- System 4: DuckDB (Parquet) ---
  duckdb_parquet = {
    dbGetQuery(
      con,
      sub("%SOURCE%", paste0("read_parquet('", parquet_path, "')"), duckdb_agg_query, fixed = TRUE)
    )
  },
  iterations = 10,
  # check = FALSE: each system returns a different container type
  check = FALSE
)
# 6. Clean up ----
dbDisconnect(con)

# 7. Output ----
print(results_io)
plot(results_io)

# 8. File Size Comparison ----
# Print one "<label> <size> MB" line per file (same output as before).
report_size <- function(label, path) {
  cat(label, round(file.size(path) / 1e6, 2), "MB\n")
}
report_size("CSV Size: ", csv_path)
report_size("Parquet Size: ", parquet_path)
library(bench)
library(dplyr)
library(readr)
library(data.table)
library(polars)
library(duckdb)
# 1. Path to the raw parcels CSV ----
file_path <- "C:/Users/Pukar.Bhandari/Downloads/REMM_Parcels_2023.csv"

# 2. In-memory DuckDB connection used by the SQL benchmark below ----
con <- dbConnect(duckdb::duckdb())
# 3. Benchmark ----
# Each system reads the CSV and computes the same filtered aggregation
# (parcel count, total acres, value per acre by county and building type).
# check = FALSE because the result containers differ (tibble / data.table /
# polars DataFrame / data.frame), even though the numbers should agree.
results_agg <- bench::mark(
  # --- System 1: dplyr ---
  dplyr = {
    read_csv(file_path, show_col_types = FALSE, progress = FALSE) |>
      filter(year_built > 1900, parcel_acres > 0.01) |>
      group_by(CO_NAME, building_type) |>
      summarise(
        Parcel_Count = n(),
        Total_Acres = sum(parcel_acres, na.rm = TRUE),
        Value_Per_Acre = sum(total_market_value, na.rm = TRUE) / sum(parcel_acres, na.rm = TRUE),
        .groups = "drop"
      )
  },
  # --- System 2: data.table ---
  data.table = {
    # Read only the columns the aggregation needs
    dt <- fread(file_path, select = c("CO_NAME", "building_type", "year_built", "parcel_acres", "total_market_value"))
    # Filter + aggregate in one DT[i, j, by] call
    dt[year_built > 1900 & parcel_acres > 0.01,
       .(Parcel_Count = .N,
         Total_Acres = sum(parcel_acres, na.rm = TRUE),
         # Cast to numeric: summing a large integer column can overflow
         Value_Per_Acre = sum(as.numeric(total_market_value), na.rm = TRUE) / sum(parcel_acres, na.rm = TRUE)),
       by = .(CO_NAME, building_type)]
  },
  # --- System 3: polars ---
  polars = {
    # Build the full lazy plan, then execute it once with collect()
    pl$scan_csv(file_path)$
      filter(
        (pl$col("year_built") > 1900) & (pl$col("parcel_acres") > 0.01)
      )$
      group_by("CO_NAME", "building_type")$
      agg(
        pl$len()$alias("Parcel_Count"),
        pl$col("parcel_acres")$sum()$alias("Total_Acres"),
        (pl$col("total_market_value")$sum() / pl$col("parcel_acres")$sum())$alias("Value_Per_Acre")
      )$
      collect()
  },
  # --- System 4: duckdb ---
  duckdb = {
    # Query template; the quoted CSV path replaces the placeholder below
    base_query <- "
    SELECT
        CO_NAME,
        building_type,
        COUNT(*) as Parcel_Count,
        SUM(parcel_acres) as Total_Acres,
        SUM(total_market_value) / SUM(parcel_acres) as Value_Per_Acre
    FROM 'file_path_placeholder'
    WHERE year_built > 1900 AND parcel_acres > 0.01
    GROUP BY CO_NAME, building_type
    "
    # fixed = TRUE: treat pattern and replacement literally, so a Windows
    # path containing backslashes is not corrupted by regex escaping.
    final_query <- sub("file_path_placeholder", file_path, base_query, fixed = TRUE)
    dbGetQuery(con, final_query)
  },
  iterations = 5,
  check = FALSE
)
# 4. Close the DuckDB connection ----
dbDisconnect(con)

# 5. Inspect timings and memory use ----
results_agg |> print()
results_agg |> plot()
library(bench)
library(dplyr)
library(readr)
library(data.table)
library(polars)
library(duckdb)
# 1. File location (forward slashes avoid backslash-escape issues) ----
file_path <- "C:/Users/Pukar.Bhandari/Downloads/REMM_Parcels_2023.csv"

# 2. DuckDB connection required for the SQL-based count below ----
con <- dbConnect(duckdb::duckdb())
# 3. Row-count benchmark ----
# check = FALSE: each system returns a different container/type for
# the row count (integer vs data.frame vs polars DataFrame).
results <- bench::mark(
  # System 1: dplyr / readr — eagerly loads the whole file into memory,
  # so it is expected to be the slowest and most memory-hungry.
  dplyr = {
    nrow(read_csv(file_path, show_col_types = FALSE, progress = FALSE))
  },
  # System 2: data.table — fread is a highly optimized CSV reader.
  data.table = {
    nrow(fread(file_path))
  },
  # System 3: polars — lazy scan counts rows without loading everything.
  polars = {
    pl$scan_csv(file_path)$select(pl$len())$collect()
  },
  # System 4: duckdb — SQL count(*) straight off the CSV file (very fast).
  duckdb = {
    dbGetQuery(con, sprintf("SELECT count(*) FROM '%s'", file_path))
  },
  iterations = 5, # few iterations: the input file is large
  check = FALSE
)
# 4. Release the DuckDB connection ----
dbDisconnect(con)

# 5. View Results ----
results |> print()
results |> plot()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment