Skip to content

Instantly share code, notes, and snippets.

@ar-puuk
Last active January 2, 2026 00:52
Show Gist options
  • Select an option

  • Save ar-puuk/d278e04d32529f8bf85cb8073578eb32 to your computer and use it in GitHub Desktop.

Select an option

Save ar-puuk/d278e04d32529f8bf85cb8073578eb32 to your computer and use it in GitHub Desktop.
An R script to mirror an ArcGIS REST service using its sitemap
# install.packages(c("sf", "arcgislayers", "xml2", "dplyr", "stringr", "purrr", "httr2"))
library(sf)
library(arcgislayers)
library(xml2)
library(dplyr)
library(stringr)
library(purrr)
library(httr2)
# Configuration ----
# URL of the sitemap listing the ArcGIS REST services to mirror.
sitemap_url <- "https://nces.ed.gov/opengis/rest/services/?f=sitemap"
# Root directory (relative to the working directory) that receives the mirror.
output_root <- "NCES_ArcGIS_Mirror"
# recursive = TRUE matches the per-service directory creation further down and
# keeps this working if output_root is ever changed to a nested path.
if (!dir.exists(output_root)) {
  dir.create(output_root, recursive = TRUE)
}
# 1. Parse Sitemap
# Read the sitemap, take the text of every <loc> element, cut off anything
# from the first "?" onwards, de-duplicate, and keep only the URLs that
# mention MapServer or FeatureServer.
cat("Accessing NCES Sitemap...\n")
xml_data <- read_xml(sitemap_url)
ns <- xml_ns(xml_data)

# "d1" is the prefix xml2 gives to the document's default namespace.
loc_nodes <- xml_find_all(xml_data, ".//d1:loc", ns)
loc_parts <- str_split_fixed(xml_text(loc_nodes), "\\?", 2)
candidate_urls <- unique(loc_parts[, 1])
is_service <- str_detect(candidate_urls, "MapServer|FeatureServer")
service_urls <- candidate_urls[is_service]
# 2. Iterate and Download
# For every service URL this writes
#   <output_root>/<folder>/<service>.gpkg   one GeoPackage layer per layer
#   <output_root>/<folder>/metadata/<service>__<layer>.xml
# Errors are logged and skipped: a service-level failure moves on to the next
# service, a layer-level failure moves on to the next layer of that service.
# NOTE(review): the MapServer/FeatureServer suffix is stripped from the name,
# so two services that differ only in that suffix share one .gpkg file.

# Number of records requested per page by arc_select(). An explicit page size
# splits the download into several requests; the original author reports that
# this brings back the progress bar and avoids hangs.
layer_page_size <- 1000

for (service_url in service_urls) {
  tryCatch(
    {
      # --- 1. Define Folder and GPKG Name ---
      # Everything after "/services/", minus the trailing server type, is
      # split into a folder part and a service (GeoPackage) name.
      clean_path <- str_extract(service_url, "(?<=/services/).*")
      logical_path <- str_remove(clean_path, "/(MapServer|FeatureServer)$")
      folder_name <- dirname(logical_path)
      gpkg_name <- basename(logical_path)
      # dirname() yields "." when the path has no folder component.
      if (folder_name == ".") {
        folder_name <- "General"
      }

      # Create Directories
      target_dir <- file.path(output_root, folder_name)
      if (!dir.exists(target_dir)) {
        dir.create(target_dir, recursive = TRUE)
      }
      meta_dir <- file.path(target_dir, "metadata")
      if (!dir.exists(meta_dir)) {
        dir.create(meta_dir, recursive = TRUE)
      }
      gpkg_path <- file.path(target_dir, paste0(gpkg_name, ".gpkg"))
      cat(sprintf("\n--- Service: %s/%s ---\n", folder_name, gpkg_name))

      # --- 2. Connect to Service ---
      svc <- arc_open(service_url)
      layers_tbl <- list_items(svc)

      # FILTER: Only process Feature Layers and Tables
      # This fixes the "subscript out of bounds" error caused by Group Layers
      # (ID 0)
      layers_to_dl <- layers_tbl |>
        filter(type %in% c("Feature Layer", "Table"))
      if (nrow(layers_to_dl) == 0) {
        cat(" [SKIP] Service contains no queryable layers.\n")
        next
      }

      # --- 3. Iterate Layers ---
      for (i in seq_len(nrow(layers_to_dl))) {
        layer_id <- layers_to_dl$id[i]
        layer_proper_name <- layers_to_dl$name[i]
        # Whitespace and dots become underscores in the GeoPackage layer name.
        safe_layer_name <- str_replace_all(layer_proper_name, "[\\s\\.]", "_")
        cat(sprintf(" [LAYER] Found ID %s: %s", layer_id, safe_layer_name))

        # A. Download Metadata (best effort, silent)
        # The file name additionally drops characters that are not valid in
        # file names, so a layer called e.g. "A/B" no longer produces a path
        # into a directory that does not exist.
        meta_file_stem <- str_replace_all(
          paste0(gpkg_name, "__", safe_layer_name),
          "[\\\\/:*?\"<>|]",
          "_"
        )
        try(
          {
            meta_url <- paste0(service_url, "/", layer_id, "/metadata")
            meta_file <- file.path(meta_dir, paste0(meta_file_stem, ".xml"))
            request(meta_url) |>
              req_perform() |>
              resp_body_raw() |>
              writeBin(meta_file)
          },
          silent = TRUE
        )

        # B. Download Data (With Forced Pagination)
        # The whole layer step sits in one handler so that a failure while
        # opening, downloading or writing a layer only skips that layer.
        tryCatch(
          {
            layer_obj <- get_layer(svc, layer_id)
            cat(" -> Downloading... \n")
            layer_data <- arc_select(
              layer_obj,
              fields = "*",
              page_size = layer_page_size
            )

            if (is.null(layer_data) || nrow(layer_data) == 0) {
              cat(" -> [EMPTY] Layer returned 0 features.\n")
            } else {
              # append = FALSE replaces a layer of the same name left over
              # from an earlier run instead of adding rows to it.
              st_write(
                layer_data,
                dsn = gpkg_path,
                layer = safe_layer_name,
                append = FALSE,
                quiet = FALSE
              )
              cat(sprintf(
                " -> [SUCCESS] Saved %d features to %s\n",
                nrow(layer_data),
                gpkg_name
              ))
            }
          },
          error = function(e) {
            cat(sprintf("\n -> [ERROR] %s\n", conditionMessage(e)))
          }
        )
      }

      # Short pause between services.
      Sys.sleep(1)
    },
    error = function(e) {
      cat(sprintf(
        " [ERROR] Critical service failure %s: %s\n",
        service_url,
        conditionMessage(e)
      ))
    }
  )
}
cat("\nMirroring complete.\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment