Last active
January 2, 2026 00:52
-
-
Save ar-puuk/d278e04d32529f8bf85cb8073578eb32 to your computer and use it in GitHub Desktop.
An R script that mirrors an ArcGIS REST service using its sitemap
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # install.packages(c("sf", "arcgislayers", "xml2", "dplyr", "stringr", "purrr", "httr2")) | |
| library(sf) | |
| library(arcgislayers) | |
| library(xml2) | |
| library(dplyr) | |
| library(stringr) | |
| library(purrr) | |
| library(httr2) | |
# Configuration ----
# `sitemap_url` is the sitemap of the ArcGIS REST services directory to mirror;
# `output_root` is the local folder that receives every download.
sitemap_url <- "https://nces.ed.gov/opengis/rest/services/?f=sitemap"
output_root <- "NCES_ArcGIS_Mirror"

# Create the output folder on first run only.
if (!dir.exists(output_root)) {
  dir.create(output_root)
}
# 1. Parse Sitemap ----
# Read the sitemap XML and reduce it to a unique vector of service base URLs.
cat("Accessing NCES Sitemap...\n")
xml_data <- read_xml(sitemap_url)
ns <- xml_ns(xml_data)

# Text of every <loc> node, addressed through the `d1` namespace prefix
# taken from the document's namespace map.
loc_urls <- xml_text(xml_find_all(xml_data, ".//d1:loc", ns))

# Keep only the part before the first "?" (drops the query string), then
# collapse duplicates.
base_urls <- unique(str_extract(loc_urls, "^[^?]*"))

# Retain only URLs that mention a MapServer or FeatureServer.
service_urls <- keep(
  base_urls,
  \(u) str_detect(u, "MapServer|FeatureServer")
)
# 2. Iterate and Download ----
# For each service URL: derive an output folder and GeoPackage name from the
# URL path, open the service, then write every Feature Layer / Table into that
# GeoPackage (one GPKG layer each) and save the layer's metadata XML next to it.
# Failures are reported with cat() and never stop the run: a service-level
# error skips that service, a layer-level error skips only that layer.
for (service_url in service_urls) {
  tryCatch(
    {
      # --- 1. Define Folder and GPKG Name ---
      # Path after "/services/", minus the trailing server type.
      clean_path <- str_extract(service_url, "(?<=/services/).*")
      logical_path <- str_remove(clean_path, "/(MapServer|FeatureServer)$")
      folder_name <- dirname(logical_path)
      gpkg_name <- basename(logical_path)
      # dirname() yields "." when the path has no folder component.
      if (folder_name == ".") folder_name <- "General"

      # Create Directories
      target_dir <- file.path(output_root, folder_name)
      if (!dir.exists(target_dir)) dir.create(target_dir, recursive = TRUE)
      meta_dir <- file.path(target_dir, "metadata")
      if (!dir.exists(meta_dir)) dir.create(meta_dir, recursive = TRUE)
      gpkg_path <- file.path(target_dir, paste0(gpkg_name, ".gpkg"))

      cat(sprintf("\n--- Service: %s/%s ---\n", folder_name, gpkg_name))

      # --- 2. Connect to Service ---
      svc <- arc_open(service_url)
      layers_tbl <- list_items(svc)

      # FILTER: Only process Feature Layers and Tables.
      # Per the original author's note, Group Layers otherwise trigger a
      # "subscript out of bounds" error.
      layers_to_dl <- layers_tbl |>
        filter(type %in% c("Feature Layer", "Table"))

      if (nrow(layers_to_dl) == 0) {
        cat(" [SKIP] Service contains no queryable layers.\n")
      } else {
        # Replace whitespace and dots with "_" for GPKG layer / file names.
        # make.unique() suffixes repeated names ("_1", "_2", ...) so two
        # layers with the same name cannot overwrite each other.
        safe_layer_names <- make.unique(
          str_replace_all(layers_to_dl$name, "[\\s\\.]", "_"),
          sep = "_"
        )

        # --- 3. Iterate Layers ---
        for (i in seq_len(nrow(layers_to_dl))) {
          layer_id <- layers_to_dl$id[i]
          safe_layer_name <- safe_layer_names[i]

          cat(sprintf(" [LAYER] Found ID %s: %s", layer_id, safe_layer_name))

          # A. Download Metadata (Silent)
          # Best effort: any failure here is deliberately ignored.
          try(
            {
              meta_url <- paste0(service_url, "/", layer_id, "/metadata")
              meta_file <- file.path(
                meta_dir,
                paste0(gpkg_name, "__", safe_layer_name, ".xml")
              )
              request(meta_url) |>
                req_perform() |>
                resp_body_raw() |>
                writeBin(meta_file)
            },
            silent = TRUE
          )

          # B. Download Data (With Forced Pagination)
          # `page_size` is forced to 1000. Per the original author's note,
          # paging makes the package issue multiple requests, which restores
          # its progress bar and prevents hanging.
          cat(" -> Downloading... \n")
          layer_data <- tryCatch(
            # get_layer() is inside the handler so that a failure to resolve
            # one layer does not abort the remaining layers of the service.
            arc_select(
              get_layer(svc, layer_id),
              fields = "*",
              page_size = 1000
            ),
            error = function(e) {
              cat(sprintf("\n -> [ERROR] %s\n", conditionMessage(e)))
              NULL
            }
          )

          # C. Write to the GeoPackage
          # A NULL result means the download failed and was already reported.
          if (!is.null(layer_data) && nrow(layer_data) > 0) {
            written <- tryCatch(
              {
                st_write(
                  layer_data,
                  dsn = gpkg_path,
                  layer = safe_layer_name,
                  append = FALSE,
                  quiet = FALSE
                )
                TRUE
              },
              error = function(e) {
                cat(sprintf(
                  " -> [ERROR] Could not write layer: %s\n",
                  conditionMessage(e)
                ))
                FALSE
              }
            )
            if (written) {
              cat(sprintf(
                " -> [SUCCESS] Saved %d features to %s\n",
                nrow(layer_data),
                gpkg_name
              ))
            }
          } else if (!is.null(layer_data)) {
            cat(" -> [EMPTY] Layer returned 0 features.\n")
          }
        }

        # Pause between services that were actually downloaded.
        Sys.sleep(1)
      }
    },
    error = function(e) {
      cat(sprintf(
        " [ERROR] Critical service failure %s: %s\n",
        service_url,
        conditionMessage(e)
      ))
    }
  )
}

cat("\nMirroring complete.\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment