Created
January 18, 2026 06:52
-
-
Save ar-puuk/8fcf769d4d8582e5fc099f347406385f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- Package setup -----------------------------------------------------------
library(tidyverse)   # dplyr/tidyr verbs and |> pipelines used throughout
library(tidycensus)  # get_acs() / get_decennial() Census API wrappers
library(mapgl)       # maplibre_view() interactive map (step 5)
library(spopt)       # skater() regionalization (step 4)
library(sf)          # st_transform() / st_drop_geometry()
library(spdep)       # poly2nb() / card() contiguity checks (step 3)

# Cache downloaded TIGER/Line shapefiles between runs.
options(tigris_use_cache = TRUE)

# Projected CRS used for all spatial operations.
# NOTE(review): original comment said "NAD83 / Utah North", but EPSG:26912 is
# NAD83 / UTM zone 12N (NAD83 / Utah North is EPSG:32142) — confirm intent.
project_crs <- "EPSG:26912"
# ----------------------------------------------------------------------------
# 0. Set up Geography
# ----------------------------------------------------------------------------
# Resolve the study area (five Wasatch Front counties) to FIPS codes.
state_abb <- "UT"
county_names <- c("Box Elder", "Weber", "Davis", "Salt Lake", "Utah")

# Convert the state abbreviation to its two-digit FIPS code.
# NOTE(review): `tidycensus:::` reaches into unexported internals; these
# helpers are not part of the public API and may break on package updates.
state_fips <- tidycensus:::validate_state(state = state_abb)

# Convert each county name to its three-digit FIPS code.
county_fips <- vapply(
  county_names,
  function(x) tidycensus:::validate_county(state = state_abb, county = x),
  character(1)
)

# Full five-digit county FIPS codes (state + county), e.g. "49035".
fips_codes <- paste0(state_fips, county_fips)
fips_codes
# ----------------------------------------------------------------------------
# 1. FETCH DATA AT THREE LEVELS
# ----------------------------------------------------------------------------

# --- LEVEL A: TRACT (Education) ---
# Educational attainment (table B15003) is only published down to tract level.
print("Fetching Tract Data (Education)...")
df_tract <- get_acs(
  geography = "tract",
  variables = c(
    pop_25 = "B15003_001",  # population 25 years and over (denominator)
    bach   = "B15003_022",  # bachelor's degree
    mast   = "B15003_023",  # master's degree
    prof   = "B15003_024",  # professional school degree
    phd    = "B15003_025"   # doctorate
  ),
  state = state_fips,
  county = county_fips,
  year = 2022,
  output = "wide",
  geometry = FALSE
) |>
  transmute(
    TRACT_ID = GEOID,
    # tidycensus appends an 'E' suffix to estimates in wide output.
    # Guard zero-population tracts, which would otherwise produce NaN.
    pct_degree = if_else(
      pop_25E > 0,
      (bachE + mastE + profE + phdE) / pop_25E,
      NA_real_
    )
  )
# --- LEVEL B: BLOCK GROUP (Income & Cars) ---
# Median household income (B19013) and vehicle availability (B25044) are
# available at the block-group level.
print("Fetching Block Group Data (Income/Cars)...")
df_bg <- get_acs(
  geography = "block group",
  variables = c(
    med_inc     = "B19013_001",  # median household income (dollars)
    tot_hh      = "B25044_001",  # total occupied housing units (denominator)
    own_no_car  = "B25044_003",  # owner-occupied, no vehicle available
    rent_no_car = "B25044_010"   # renter-occupied, no vehicle available
  ),
  state = state_fips,
  county = county_fips,
  year = 2022,
  output = "wide",
  geometry = FALSE
) |>
  transmute(
    BG_ID = GEOID,
    bg_med_income = med_incE,
    # Guard block groups with zero households, which would otherwise give NaN.
    bg_pct_no_car = if_else(
      tot_hhE > 0,
      (own_no_carE + rent_no_carE) / tot_hhE,
      NA_real_
    )
  )
# --- LEVEL C: BLOCK (Geometry & HH Size) ---
# The atomic spatial units come from the 2020 Decennial Census (DHC summary
# file); these carry the polygon geometry that will be dissolved into TAZs.
print("Fetching Block Data (Geometry & HH Size)...")
df_block <- get_decennial(
  geography = "block",
  variables = c(
    pop    = "P1_001N",  # total population
    hunits = "H1_001N"   # total housing units
  ),
  state = state_fips,
  county = county_fips,
  year = 2020,
  output = "wide",
  sumfile = "dhc",
  geometry = TRUE
) |>
  # Project into the analysis CRS before any contiguity/clustering work.
  st_transform(crs = project_crs)
# ----------------------------------------------------------------------------
# 2. HIERARCHICAL JOIN
# ----------------------------------------------------------------------------
# Census GEOIDs nest: Block (15 digits) -> Block Group (first 12 digits)
# -> Tract (first 11 digits). Derive the parent IDs by substring and join the
# coarser-level attributes down onto every block.
taz_input <- df_block |>
  mutate(
    BG_ID = substr(GEOID, 1, 12),
    TRACT_ID = substr(GEOID, 1, 11),
    # Block-level average household size (population / housing units).
    # if_else (vs base ifelse) is type-stable and keeps a clean double;
    # empty blocks get 0 instead of a division-by-zero result.
    hh_size = if_else(hunits > 0, pop / hunits, 0)
  ) |>
  # Each block belongs to exactly one BG and one tract; declaring the
  # relationship makes dplyr error on accidental key duplication (fan-out).
  left_join(df_bg, by = "BG_ID", relationship = "many-to-one") |>
  left_join(df_tract, by = "TRACT_ID", relationship = "many-to-one") |>
  # Keep only populated residential blocks with usable covariates.
  filter(
    hunits > 0,
    !is.na(bg_med_income),  # removes blocks in BGs with suppressed data
    !is.na(pct_degree)
  )

print(paste("Data ready. Total Residential Blocks:", nrow(taz_input)))
# ----------------------------------------------------------------------------
# 3. CLEANING SPATIAL ISLANDS
# ----------------------------------------------------------------------------
# Filtering out unpopulated blocks (roads, parks, water) can leave some
# remaining blocks with no queen-contiguous neighbor. The clustering step
# needs a connected graph, so isolated blocks are dropped first.
neighbors <- spdep::poly2nb(taz_input, queen = TRUE)

# card() returns each polygon's neighbor count; zero neighbors == island.
island_flags <- spdep::card(neighbors) == 0

taz_input_clean <- dplyr::filter(taz_input, !island_flags)
print(paste("Removed", sum(island_flags), "islands."))
# ----------------------------------------------------------------------------
# 4. REGIONALIZATION (SKATER ALGORITHM)
# ----------------------------------------------------------------------------
# SKATER prunes a minimum spanning tree over the contiguity graph, which
# scales much better than Max-P for 10k+ polygons.
# Target: approximately 3,300 TAZs across the study area.
num_tazs <- 3300

print("Scaling variables...")
# Standardize the clustering attributes so no single variable dominates the
# dissimilarity metric. as.numeric() strips the matrix attributes that
# scale() attaches, leaving clean double vectors. across() keeps the four
# transformations in one place instead of four copy-pasted lines.
cluster_vars <- c("hh_size", "bg_med_income", "bg_pct_no_car", "pct_degree")
taz_input_ready <- taz_input_clean |>
  mutate(across(
    all_of(cluster_vars),
    \(x) as.numeric(scale(x)),
    .names = "{.col}_sc"
  ))

print("Running SKATER clustering... (This may take 1-2 minutes)")
taz_result <- spopt::skater(
  taz_input_ready,
  # Cluster on the standardized (_sc) columns only.
  attrs = paste0(cluster_vars, "_sc"),
  n_regions = num_tazs,
  bridge_islands = TRUE
)
print("Regionalization Complete.")
# ----------------------------------------------------------------------------
# 5. VISUALIZATION
# ----------------------------------------------------------------------------
# skater() retains the input columns and adds a .region assignment, so the
# original un-scaled attributes (e.g. bg_med_income) are still available.

# Summary statistics per TAZ.
# NOTE(review): avg_income is an unweighted mean of block-group median
# incomes across member blocks; weighting by households may be preferable.
taz_stats <- taz_result |>
  st_drop_geometry() |>
  group_by(.region) |>
  summarise(
    taz_pop = sum(pop),                              # total 2020 population
    taz_hh = sum(hunits),                            # total housing units
    avg_income = mean(bg_med_income, na.rm = TRUE),  # original dollar amount
    pct_degree = mean(pct_degree, na.rm = TRUE),     # original proportion
    blocks_count = n()                               # blocks merged into TAZ
  )
print(head(taz_stats))

# Interactive map colored by TAZ assignment, legend disabled.
# (Corrected comment: the original claimed borders were dropped via lwd = 0,
# but no such argument is passed to maplibre_view.)
mapgl::maplibre_view(taz_result, column = ".region", legend = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment