#!/usr/bin/env ruby
# Scrapes the Sentry API documentation (https://docs.sentry.io/api/) into one
# markdown spec file per endpoint page.

require "fileutils"
require "nokogiri"
require "open-uri"
require "openssl"
require "set"
require "uri"

BASE_URL = "https://docs.sentry.io"
API_BASE_URL = "#{BASE_URL}/api"
OUTPUT_DIR = File.join(File.dirname(__FILE__), "api_spec")
TEMP_DIR = File.join(OUTPUT_DIR, "raw_html")

FileUtils.mkdir_p(OUTPUT_DIR)
FileUtils.mkdir_p(TEMP_DIR)

# Derive a safe, flat filename (without extension) from an API docs URL.
def sanitize_filename(url)
  parts = url.gsub("#{API_BASE_URL}/", "").gsub(/\/$/, "").split("/")
  filename = parts.join("_").gsub(/[^a-z0-9_-]/, "_")
  # Handle the root/blank page - use "index"
  filename = "index" if filename.empty? || filename == "_"
  filename
end

# Fetch a URL, preferring open-uri and falling back to the curl CLI.
# Returns the raw HTML (also saved under TEMP_DIR), or nil on failure.
def fetch_with_curl(url)
  filename = File.join(TEMP_DIR, "#{sanitize_filename(url)}.html")
  puts "Fetching: #{url}"
  begin
    html = URI.open(
      url,
      "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
      ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE
    ).read
    File.write(filename, html)
    puts "  Saved raw HTML to: #{filename}"
    html
  rescue => e
    puts "  Error fetching #{url}: #{e.message}"
    # Try curl as a fallback (with -k to ignore SSL errors)
    if system("curl", "-k", "-s", "-L", "-H", "User-Agent: Mozilla/5.0", url, "-o", filename)
      if File.exist?(filename) && File.size(filename) > 0
        puts "  Saved raw HTML to: #{filename} (via curl)"
        File.read(filename)
      end
    end
  end
end

# Pull every API doc URL out of a page's sidebar navigation.
def extract_api_urls_from_sidebar(html)
  html = html.force_encoding("UTF-8")

  # Extract the body HTML as a string and strip script/style/svg before parsing
  body_match = html.match(/<body[^>]*>([\s\S]*)<\/body>/i)
  return [] unless body_match

  body_html = body_match[1]
  body_html = body_html.gsub(/<script[^>]*>[\s\S]*?<\/script>/i, "")
  body_html = body_html.gsub(/<style[^>]*>[\s\S]*?<\/style>/i, "")
  body_html = body_html.gsub(/<svg[^>]*>[\s\S]*?<\/svg>/i, "")
  body_html = body_html.gsub(/<svg[^>]*\/>/i, "")

  doc = Nokogiri::HTML("<body>#{body_html}</body>")
  body = doc.css("body").first
  return [] unless body

  urls = []

  # Find the "Sentry API" sidebar section: a link with href="/api/" that
  # contains the "Sentry API" text
  api_section = body.xpath(".//a[contains(@href, '/api/') and contains(., 'Sentry API')]").first ||
    body.xpath(".//a[@href='/api/']").first
  return [] unless api_section

  # Find the parent <li> with data-sidebar-branch
  api_li = api_section.ancestors("li[data-sidebar-branch='true']").first || api_section.parent
  return [] unless api_li

  # Recursively collect links from the nested <ul data-sidebar-tree>
  api_ul = api_li.css("ul[data-sidebar-tree]").first
  extract_links_recursive(api_ul, urls) if api_ul

  # Also include the main /api/ link itself
  urls << "/api/" unless urls.include?("/api/")

  # Convert relative URLs to absolute and normalize: ensure each one starts
  # with /api/ and ends with /
  urls.map do |url|
    url = url.gsub(/\/$/, "") + "/" unless url.end_with?("/")
    url = "/api/" + url unless url.start_with?("/")
    if url.start_with?("/")
      "#{BASE_URL}#{url}"
    elsif url.start_with?("http")
      url
    else
      "#{API_BASE_URL}/#{url}"
    end
  end.uniq.sort
end
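# A worked example of the URL handling above (the paths are illustrative -
# actual entries depend on the current docs.sentry.io sidebar):
#
#   sidebar href "/api/organizations"  -> "https://docs.sentry.io/api/organizations/"
#   sidebar href "/api/"               -> "https://docs.sentry.io/api/"
#
# sanitize_filename then flattens the absolute URLs for use on disk:
#
#   "https://docs.sentry.io/api/organizations/" -> "organizations"
#   "https://docs.sentry.io/api/"               -> "index"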
# Depth-first collection of /api/ links from a sidebar node into urls.
def extract_links_recursive(node, urls)
  node.css("a[href]").each do |link|
    href = link["href"]
    next unless href
    # Only include API links (starting with /api/)
    if href.start_with?("/api/")
      # Normalize the trailing slash for consistency
      href = href.gsub(/\/$/, "") + "/" unless href.end_with?("/")
      urls << href unless urls.include?(href)
    end
  end
  # Recurse into nested lists (including those in CollapsibleSidebarLink components)
  node.css("ul[data-sidebar-tree], ul").each do |ul|
    extract_links_recursive(ul, urls)
  end
  # Also check branch <li> elements, which may carry nested children
  node.css("li[data-sidebar-branch]").each do |li|
    extract_links_recursive(li, urls)
  end
end

# Return the page body with script/style/svg removed (currently unused helper).
def extract_body(html)
  doc = Nokogiri::HTML(html)
  doc.css("script, style, svg").remove
  body = doc.css("body").first
  body ? body.to_html : html
end
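# purify_content below flattens one endpoint page into markdown. A sketch of
# the output shape implied by its handlers (the title, path, and parameter
# name here are hypothetical):
#
#   # List an Organization's Projects
#
#   ## Endpoint
#
#   ```
#   GET /api/0/organizations/{organization_id_or_slug}/projects/
#   ```
#
#   ### Path Parameters
#
#   - **organization_id_or_slug** (string) - REQUIRED
#     The id or slug of the organization.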
when "p" text = node.text.strip spec << "#{text}\n\n" if text.length > 10 when "pre" code_elem = node.css("code").first || node code_text = code_elem.text.strip if code_text.length > 0 lang = code_elem["class"]&.match(/language-(\w+)/)&.[](1) || "" spec << "```#{lang}\n#{code_text}\n```\n\n" end when "table" rows = node.css("tr").map do |tr| tr.css("td, th").map { |cell| cell.text.strip.gsub(/\s+/, " ") } end if rows.any? && rows.first.any? spec << "| " + rows.first.join(" | ") + " |\n" spec << "| " + rows.first.map { "---" }.join(" | ") + " |\n" rows[1..-1].each do |row| spec << "| " + row.join(" | ") + " |\n" if row.any? end spec << "\n" end when "ul", "ol" node.css("li").each do |li| text = li.text.strip spec << "- #{text}\n" if text.length > 0 end spec << "\n" if node.css("li").any? when "div" # Handle API blocks and info rows if node["class"]&.include?("api-block") # Extract HTTP method and endpoint verb = node.css(".api-request-block-verb, .api-block-header").first&.text&.strip endpoint = node.css("span").map(&:text).join(" ").strip if verb && endpoint spec << "## Endpoint\n\n```\n#{verb} #{endpoint}\n```\n\n" end elsif node["class"]&.include?("api-info-row") # Extract parameter information heading = node.css("h3").first&.text&.strip if heading spec << "### #{heading}\n\n" node.css("dl.api-params dt").each do |dt| param_name = dt.css("code").first&.text&.strip param_type = dt.css("em").first&.text&.strip required = dt.css(".required").first ? "REQUIRED" : "OPTIONAL" dd = dt.next_element description = dd&.css("p")&.first&.text&.strip || dd&.text&.strip if param_name spec << "- **#{param_name}** (#{param_type}) - #{required}\n" spec << " #{description}\n\n" if description && description.length > 0 end end end end end end # Clean up excessive newlines result = spec.join.gsub(/\n{3,}/, "\n\n").strip (result.length > 50) ? result : nil end # Step 0: Fetch main API page and extract all URLs from sidebar puts "Step 0: Fetching main API page and extracting URLs from sidebar..." main_api_html = fetch_with_curl("#{API_BASE_URL}/") unless main_api_html puts "Error: Could not fetch main API page. Exiting." exit 1 end # Extract top-level category URLs category_urls = extract_api_urls_from_sidebar(main_api_html) puts "Found #{category_urls.count} top-level category URLs" # Step 0.5: For each category page, fetch it and extract nested endpoint URLs puts "\nStep 0.5: Fetching category pages to extract nested endpoint URLs..." all_urls = Set.new(category_urls) category_urls.each do |category_url| next if category_url == "#{API_BASE_URL}/" # Skip the main page, we already have it puts " Fetching category: #{category_url}" category_html = fetch_with_curl(category_url) next unless category_html nested_urls = extract_api_urls_from_sidebar(category_html) nested_urls.each { |url| all_urls.add(url) } puts " Found #{nested_urls.count} URLs in #{category_url}" end URLS = all_urls.to_a.sort puts "\nTotal unique API URLs found: #{URLS.count}" puts "Sample URLs:" URLS.first(10).each { |url| puts " - #{url}" } puts " ..." if URLS.count > 10 if URLS.empty? puts "Warning: No URLs found in sidebar. Exiting." exit 1 end # Step 1: Fetch all HTML files with curl puts "\nStep 1: Fetching HTML files..." URLS.each do |url| fetch_with_curl(url) end # Step 2: Extract body and purify puts "\nStep 2: Extracting and purifying content..." 
# Step 0: Fetch the main API page and extract category URLs from its sidebar
puts "Step 0: Fetching main API page and extracting URLs from sidebar..."
main_api_html = fetch_with_curl("#{API_BASE_URL}/")
unless main_api_html
  puts "Error: Could not fetch main API page. Exiting."
  exit 1
end

category_urls = extract_api_urls_from_sidebar(main_api_html)
puts "Found #{category_urls.count} top-level category URLs"

# Step 0.5: Fetch each category page and extract nested endpoint URLs
puts "\nStep 0.5: Fetching category pages to extract nested endpoint URLs..."
all_urls = Set.new(category_urls)
category_urls.each do |category_url|
  next if category_url == "#{API_BASE_URL}/" # already fetched above
  puts "  Fetching category: #{category_url}"
  category_html = fetch_with_curl(category_url)
  next unless category_html
  nested_urls = extract_api_urls_from_sidebar(category_html)
  nested_urls.each { |url| all_urls.add(url) }
  puts "    Found #{nested_urls.count} URLs in #{category_url}"
end

URLS = all_urls.to_a.sort
puts "\nTotal unique API URLs found: #{URLS.count}"
puts "Sample URLs:"
URLS.first(10).each { |url| puts "  - #{url}" }
puts "  ..." if URLS.count > 10

if URLS.empty?
  puts "Warning: No URLs found in sidebar. Exiting."
  exit 1
end

# Step 1: Fetch the HTML for every discovered URL
puts "\nStep 1: Fetching HTML files..."
URLS.each { |url| fetch_with_curl(url) }

# Step 2: Extract and purify each page into a markdown spec
puts "\nStep 2: Extracting and purifying content..."
URLS.each do |url|
  html_file = File.join(TEMP_DIR, "#{sanitize_filename(url)}.html")
  next unless File.exist?(html_file)

  html = File.read(html_file)
  spec = purify_content(html, url)
  if spec && !spec.strip.empty?
    filename = File.join(OUTPUT_DIR, "#{sanitize_filename(url)}.md")
    File.write(filename, spec)
    puts "  Extracted spec: #{File.basename(filename)}"
  else
    puts "  Warning: Could not extract spec from #{url}"
  end
end

puts "\nDone! Specs saved to #{OUTPUT_DIR}"
puts "Total URLs processed: #{URLS.count}"

# Clean up: remove the raw_html folder
puts "\nCleaning up raw HTML files..."
if File.directory?(TEMP_DIR)
  FileUtils.rm_rf(TEMP_DIR)
  puts "Removed #{TEMP_DIR}"
end
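# Usage sketch (the script file name is an assumption; nokogiri must be
# installed, and curl is only exercised as the SSL-failure fallback):
#
#   gem install nokogiri
#   ruby fetch_sentry_api_specs.rb
#
# One markdown file per page lands in api_spec/ next to the script, and the
# intermediate api_spec/raw_html/ folder is deleted at the end.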