Skip to content

Instantly share code, notes, and snippets.

@ar-puuk
Created February 10, 2026 04:41
Show Gist options
  • Select an option
  • Save ar-puuk/164ab28e8cf48d5a3ba81eb5ad9faedd to your computer and use it in GitHub Desktop.
library(bench)
library(dplyr)
library(arrow)
library(polars)
library(duckdb)
# 1. Define Paths ----
csv_path <- "C:/Users/Pukar.Bhandari/Downloads/REMM_Parcels_2023.csv"
parquet_path <- "C:/Users/Pukar.Bhandari/Downloads/REMM_Parcels_2023.parquet"

# 2. Convert CSV to Parquet (one-time setup) ----
# Only materialize the Parquet copy if it is not already on disk.
if (!file.exists(parquet_path)) {
  message("Converting CSV to Parquet... this may take a minute.")
  parcels <- arrow::read_csv_arrow(csv_path)
  arrow::write_parquet(parcels, parquet_path)
  message("Conversion complete: ", parquet_path)
}

# 3. DuckDB Connection ----
con <- dbConnect(duckdb())
# 4. Define the Polars Function (Fixed) ----
# Aggregate a polars LazyFrame of parcels: keep rows with year_built > 1900
# and parcel_acres > 0.01, then summarise parcel count, total acreage, and
# market value per acre by county and building type.
# Returns the collected (materialized) polars DataFrame.
do_polars_agg <- function(lazy_frame) {
  lazy_frame$
    filter(
      (pl$col("year_built") > 1900) & (pl$col("parcel_acres") > 0.01)
    )$
    group_by("CO_NAME", "building_type")$
    agg(
      pl$len()$alias("Parcel_Count"),
      pl$col("parcel_acres")$sum()$alias("Total_Acres"),
      (pl$col("total_market_value")$sum() / pl$col("parcel_acres")$sum())$alias("Value_Per_Acre")
    )$
    collect()
}
# 5. Run Benchmark ----
# Shared DuckDB aggregation template: the two DuckDB branches previously
# duplicated this SQL verbatim; the FROM clause is now injected per run.
# sub(..., fixed = TRUE) treats pattern and replacement literally, so a
# Windows path containing backslashes is not mangled by regex escaping.
duckdb_agg_query <- "
SELECT
    CO_NAME,
    building_type,
    COUNT(*) as Parcel_Count,
    SUM(parcel_acres) as Total_Acres,
    SUM(total_market_value) / SUM(parcel_acres) as Value_Per_Acre
FROM %SOURCE%
WHERE year_built > 1900 AND parcel_acres > 0.01
GROUP BY CO_NAME, building_type
"

results_io <- bench::mark(
  # --- System 1: Polars (CSV) ---
  polars_csv = {
    # Lazy scan of the CSV -> shared aggregation pipeline
    pl$scan_csv(csv_path) |> do_polars_agg()
  },
  # --- System 2: Polars (Parquet) ---
  polars_parquet = {
    pl$scan_parquet(parquet_path) |> do_polars_agg()
  },
  # --- System 3: DuckDB (CSV) ---
  duckdb_csv = {
    dbGetQuery(
      con,
      sub("%SOURCE%", paste0("'", csv_path, "'"), duckdb_agg_query, fixed = TRUE)
    )
  },
  # --- System 4: DuckDB (Parquet) ---
  duckdb_parquet = {
    dbGetQuery(
      con,
      sub("%SOURCE%", paste0("read_parquet('", parquet_path, "')"), duckdb_agg_query, fixed = TRUE)
    )
  },
  iterations = 10,
  # check = FALSE: each system returns a different container type
  check = FALSE
)
# 6. Clean up ----
dbDisconnect(con)

# 7. Output ----
print(results_io)
plot(results_io)

# 8. File Size Comparison ----
# Print one "<label> <size> MB" line per file (same output as before).
report_size <- function(label, path) {
  cat(label, round(file.size(path) / 1e6, 2), "MB\n")
}
report_size("CSV Size: ", csv_path)
report_size("Parquet Size: ", parquet_path)
library(bench)
library(dplyr)
library(readr)
library(data.table)
library(polars)
library(duckdb)
# 1. Path to the raw parcels CSV ----
file_path <- "C:/Users/Pukar.Bhandari/Downloads/REMM_Parcels_2023.csv"

# 2. In-memory DuckDB connection used by the SQL benchmark below ----
con <- dbConnect(duckdb::duckdb())
# 3. Benchmark ----
# Each system reads the CSV and computes the same filtered aggregation
# (parcel count, total acres, value per acre by county and building type).
# check = FALSE because the result containers differ (tibble / data.table /
# polars DataFrame / data.frame), even though the numbers should agree.
results_agg <- bench::mark(
  # --- System 1: dplyr ---
  dplyr = {
    read_csv(file_path, show_col_types = FALSE, progress = FALSE) |>
      filter(year_built > 1900, parcel_acres > 0.01) |>
      group_by(CO_NAME, building_type) |>
      summarise(
        Parcel_Count = n(),
        Total_Acres = sum(parcel_acres, na.rm = TRUE),
        Value_Per_Acre = sum(total_market_value, na.rm = TRUE) / sum(parcel_acres, na.rm = TRUE),
        .groups = "drop"
      )
  },
  # --- System 2: data.table ---
  data.table = {
    # Read only the columns the aggregation needs
    dt <- fread(file_path, select = c("CO_NAME", "building_type", "year_built", "parcel_acres", "total_market_value"))
    # Filter + aggregate in one DT[i, j, by] call
    dt[year_built > 1900 & parcel_acres > 0.01,
       .(Parcel_Count = .N,
         Total_Acres = sum(parcel_acres, na.rm = TRUE),
         # Cast to numeric: summing a large integer column can overflow
         Value_Per_Acre = sum(as.numeric(total_market_value), na.rm = TRUE) / sum(parcel_acres, na.rm = TRUE)),
       by = .(CO_NAME, building_type)]
  },
  # --- System 3: polars ---
  polars = {
    # Build the full lazy plan, then execute it once with collect()
    pl$scan_csv(file_path)$
      filter(
        (pl$col("year_built") > 1900) & (pl$col("parcel_acres") > 0.01)
      )$
      group_by("CO_NAME", "building_type")$
      agg(
        pl$len()$alias("Parcel_Count"),
        pl$col("parcel_acres")$sum()$alias("Total_Acres"),
        (pl$col("total_market_value")$sum() / pl$col("parcel_acres")$sum())$alias("Value_Per_Acre")
      )$
      collect()
  },
  # --- System 4: duckdb ---
  duckdb = {
    # Query template; the quoted CSV path replaces the placeholder below
    base_query <- "
    SELECT
        CO_NAME,
        building_type,
        COUNT(*) as Parcel_Count,
        SUM(parcel_acres) as Total_Acres,
        SUM(total_market_value) / SUM(parcel_acres) as Value_Per_Acre
    FROM 'file_path_placeholder'
    WHERE year_built > 1900 AND parcel_acres > 0.01
    GROUP BY CO_NAME, building_type
    "
    # fixed = TRUE: treat pattern and replacement literally, so a Windows
    # path containing backslashes is not corrupted by regex escaping.
    final_query <- sub("file_path_placeholder", file_path, base_query, fixed = TRUE)
    dbGetQuery(con, final_query)
  },
  iterations = 5,
  check = FALSE
)
# 4. Close the DuckDB connection ----
dbDisconnect(con)

# 5. Inspect timings and memory use ----
results_agg |> print()
results_agg |> plot()
library(bench)
library(dplyr)
library(readr)
library(data.table)
library(polars)
library(duckdb)
# 1. File location (forward slashes avoid backslash-escape issues) ----
file_path <- "C:/Users/Pukar.Bhandari/Downloads/REMM_Parcels_2023.csv"

# 2. DuckDB connection required for the SQL-based count below ----
con <- dbConnect(duckdb::duckdb())
# 3. Row-count benchmark ----
# check = FALSE: each system returns a different container/type for
# the row count (integer vs data.frame vs polars DataFrame).
results <- bench::mark(
  # System 1: dplyr / readr — eagerly loads the whole file into memory,
  # so it is expected to be the slowest and most memory-hungry.
  dplyr = {
    nrow(read_csv(file_path, show_col_types = FALSE, progress = FALSE))
  },
  # System 2: data.table — fread is a highly optimized CSV reader.
  data.table = {
    nrow(fread(file_path))
  },
  # System 3: polars — lazy scan counts rows without loading everything.
  polars = {
    pl$scan_csv(file_path)$select(pl$len())$collect()
  },
  # System 4: duckdb — SQL count(*) straight off the CSV file (very fast).
  duckdb = {
    dbGetQuery(con, sprintf("SELECT count(*) FROM '%s'", file_path))
  },
  iterations = 5, # few iterations: the input file is large
  check = FALSE
)
# 4. Release the DuckDB connection ----
dbDisconnect(con)

# 5. View Results ----
results |> print()
results |> plot()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment