Skip to content

Instantly share code, notes, and snippets.

@amkisko
Created December 16, 2025 07:23
Show Gist options
  • Select an option

  • Save amkisko/a561aeba5fd536152ce90c88f2fc5550 to your computer and use it in GitHub Desktop.

Select an option

Save amkisko/a561aeba5fd536152ce90c88f2fc5550 to your computer and use it in GitHub Desktop.

Revisions

  1. amkisko created this gist Dec 16, 2025.
    362 changes: 362 additions & 0 deletions extract-sentry-api-specs.rb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,362 @@
    #!/usr/bin/env ruby

    require "fileutils"
    require "nokogiri"
    require "open-uri"
    require "openssl"
    require "set"
    require "uri"

    # Root of the Sentry documentation site and of its API reference section.
    BASE_URL = "https://docs.sentry.io"
    API_BASE_URL = [BASE_URL, "api"].join("/")

    # Generated Markdown lands in api_spec/ next to this script; the downloaded
    # pages are kept in api_spec/raw_html/ while the script runs.
    OUTPUT_DIR = File.join(File.dirname(__FILE__), "api_spec")
    TEMP_DIR = File.join(File.dirname(__FILE__), "api_spec", "raw_html")

    # Make sure both directories exist before anything is written to them.
    [OUTPUT_DIR, TEMP_DIR].each { |dir| FileUtils.mkdir_p(dir) }

    # Derives a filesystem-safe base name (no extension) from a docs URL.
    #
    # The "https://docs.sentry.io/api/" prefix and a trailing slash are dropped,
    # the remaining path segments are joined with "_", and every character other
    # than a-z, 0-9, "_" and "-" is replaced by "_".
    #
    # @param url [String] page URL
    # @return [String] sanitized name; "index" when nothing is left of the path
    def sanitize_filename(url)
      relative_path = url.gsub("https://docs.sentry.io/api/", "").gsub(/\/$/, "")
      slug = relative_path.split("/").join("_").gsub(/[^a-z0-9_-]/, "_")

      # The API landing page has no path of its own, so it is stored as "index".
      return "index" if slug.empty? || slug == "_"

      slug
    end

    # Downloads +url+, stores the raw response under TEMP_DIR and returns it.
    #
    # The page is requested with open-uri first. If that raises, the method
    # falls back to the curl command-line tool.
    #
    # NOTE(review): TLS certificate verification is switched off on both paths
    # (VERIFY_NONE / curl -k), so the downloaded content is not authenticated.
    # This is kept as-is because it looks deliberate; confirm whether it is
    # still needed and remove it if not.
    #
    # @param url [String] absolute URL of the page to download
    # @return [String, nil] the response body, or nil when both attempts fail
    def fetch_with_curl(url)
      filename = File.join(TEMP_DIR, "#{sanitize_filename(url)}.html")
      puts "Fetching: #{url}"

      begin
        # The block form closes the underlying IO as soon as the body is read
        # instead of leaving it open until garbage collection.
        html = URI.open(
          url,
          "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
          :ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE,
          &:read
        )
        File.write(filename, html)
        puts " Saved raw HTML to: #{filename}"
        html
      rescue => e
        puts " Error fetching #{url}: #{e.message}"
        # Try with curl as fallback (with -k to ignore SSL). -f makes curl exit
        # non-zero on HTTP errors so an error page is not saved as page content.
        curl_ok = system("curl", "-k", "-f", "-s", "-L", "-H", "User-Agent: Mozilla/5.0", url, "-o", filename)
        return nil unless curl_ok && File.exist?(filename) && File.size(filename) > 0

        puts " Saved raw HTML to: #{filename} (via curl)"
        File.read(filename)
      end
    end

    # Collects the API documentation URLs linked from the "Sentry API" section
    # of a page's sidebar.
    #
    # The <body> is cut out with a regex, <script>/<style>/<svg> markup is
    # stripped, and the remainder is parsed with Nokogiri. The sidebar section
    # is located through its "/api/" link, and every "/api/..." link in its
    # nested tree is gathered with extract_links_recursive.
    #
    # @param html [String] raw page HTML; it is not modified
    # @return [Array<String>] sorted, de-duplicated absolute URLs (always
    #   including the API landing page); empty when the page has no <body> or
    #   the sidebar section cannot be found
    def extract_api_urls_from_sidebar(html)
      # Work on a copy so the caller's string keeps its encoding (and frozen
      # strings are accepted); scrub so invalid bytes cannot break the regexes.
      html = html.dup.force_encoding("UTF-8").scrub

      body_match = html.match(/<body[^>]*>([\s\S]*)<\/body>/i)
      return [] unless body_match

      noise_patterns = [
        /<script[^>]*>[\s\S]*?<\/script>/i,
        /<style[^>]*>[\s\S]*?<\/style>/i,
        /<svg[^>]*>[\s\S]*?<\/svg>/i,
        /<svg[^>]*\/>/i
      ]
      body_html = noise_patterns.reduce(body_match[1]) { |markup, pattern| markup.gsub(pattern, "") }

      doc = Nokogiri::HTML("<body>#{body_html}</body>")
      body = doc.css("body").first
      return [] unless body

      # Prefer the link labelled "Sentry API"; fall back to a plain /api/ link.
      api_section = body.xpath(".//a[contains(@href, '/api/') and contains(., 'Sentry API')]").first ||
        body.xpath(".//a[@href='/api/']").first
      return [] unless api_section

      # The enclosing sidebar branch holds the nested tree of API pages.
      api_li = api_section.ancestors("li[data-sidebar-branch='true']").first || api_section.parent
      return [] unless api_li

      urls = []
      api_ul = api_li.css("ul[data-sidebar-tree]").first
      extract_links_recursive(api_ul, urls) if api_ul

      # Also include the main /api/ link itself
      urls << "/api/" unless urls.include?("/api/")

      # Normalize to absolute URLs with a trailing slash. Absolute URLs are
      # checked before the "/api/" prefix is applied so they pass through intact.
      urls.map do |url|
        url = "#{url}/" unless url.end_with?("/")
        next url if url.start_with?("http")

        url = "/api/#{url}" unless url.start_with?("/")
        "#{BASE_URL}#{url}"
      end.uniq.sort
    end

    # Appends every API link found under +node+ to +urls+.
    #
    # A single css("a[href]") query already returns the links of all
    # descendants, so no explicit recursion into nested lists or sidebar
    # branches is needed; repeating the query per nested list only re-visited
    # the same links. The method name is kept for existing callers.
    #
    # @param node [#css] element to search (a Nokogiri node in this script)
    # @param urls [Array<String>] accumulator, modified in place; hrefs are
    #   stored with a trailing slash and never added twice
    # @return [Array<String>] the +urls+ array
    def extract_links_recursive(node, urls)
      node.css("a[href]").each do |link|
        href = link["href"]
        # Only include API links (starting with /api/)
        next unless href&.start_with?("/api/")

        href = "#{href}/" unless href.end_with?("/")
        urls << href unless urls.include?(href)
      end

      urls
    end

    # Returns the page's <body> markup with script, style and svg elements
    # removed.
    #
    # @param html [String] raw page HTML
    # @return [String] serialized <body> element, or +html+ unchanged when the
    #   parsed document has no body
    def extract_body(html)
      document = Nokogiri::HTML(html)

      # Drop non-content elements everywhere in the document first.
      document.css("script, style, svg").remove

      body_node = document.css("body").first
      return html unless body_node

      body_node.to_html
    end

    # Converts one Sentry API documentation page into a Markdown spec.
    #
    # The <body> is cut out with a regex, <script>/<style>/<svg> markup is
    # stripped, and the remainder is parsed with Nokogiri. Headings, paragraphs,
    # code blocks, tables, lists and the API-specific blocks inside the main
    # content element are rendered to Markdown in the order the CSS query
    # returns them.
    #
    # NOTE(review): an element nested inside another selected element (for
    # example a <p> inside an <li>) is still rendered once per match.
    #
    # @param html [String] raw page HTML; it is not modified
    # @param url [String] source URL; currently unused, kept so existing
    #   callers keep working
    # @return [String, nil] Markdown text, or nil when the page has no <body>,
    #   no recognizable content container, or the result is 50 characters or
    #   shorter
    def purify_content(html, url)
      # Work on a copy so the caller's string keeps its encoding (and frozen
      # strings are accepted); scrub so invalid bytes cannot break the regexes.
      html = html.dup.force_encoding("UTF-8").scrub

      body_match = html.match(/<body[^>]*>([\s\S]*)<\/body>/i)
      return nil unless body_match

      # Remove script, style and svg markup (including self-closing svg tags)
      # before parsing.
      noise_patterns = [
        /<script[^>]*>[\s\S]*?<\/script>/i,
        /<style[^>]*>[\s\S]*?<\/style>/i,
        /<svg[^>]*>[\s\S]*?<\/svg>/i,
        /<svg[^>]*\/>/i
      ]
      body_html = noise_patterns.reduce(body_match[1]) { |markup, pattern| markup.gsub(pattern, "") }

      doc = Nokogiri::HTML("<body>#{body_html}</body>")
      body = doc.css("body").first
      return nil unless body

      # Find main content - try the most specific selector first.
      main_content = body.css("#doc-content").first ||
        body.css("main #main").first ||
        body.css("article, main, .content, #content, .documentation-content, .markdown-body").first
      return nil unless main_content

      # Remove navigation, breadcrumbs, and other non-content elements
      main_content.css(".breadcrumbs, .not-prose, nav, aside, .grid, header, footer").remove

      content_div = main_content.css("#main").first || main_content

      spec = []

      title_elem = main_content.css("h1").first
      title = title_elem&.text&.strip
      spec << "# #{title}\n\n" if title && !title.empty?

      all_elements = content_div.css("h1, h2, h3, h4, h5, h6, p, pre, table, ul, ol, div.api-block, div.api-info-row, dl.api-params")

      all_elements.each do |node|
        # The title was written above; emitting its <h1> again would duplicate it.
        next if node == title_elem
        next if node["class"]&.include?("breadcrumb") || node["class"]&.include?("not-prose")

        case node.name
        when "h1", "h2", "h3", "h4", "h5", "h6"
          level = node.name[1].to_i
          # Sub-headings may carry anchor links/icons; drop them before reading text.
          node.css("a, svg").remove if level > 1
          text = node.text.strip
          marker = "#" * level
          spec << "#{marker} #{text}\n\n" unless text.empty?
        when "p"
          text = node.text.strip
          # Paragraphs of 10 characters or fewer are skipped.
          spec << "#{text}\n\n" if text.length > 10
        when "pre"
          code_elem = node.css("code").first || node
          code_text = code_elem.text.strip
          unless code_text.empty?
            lang = code_elem["class"]&.match(/language-(\w+)/)&.[](1) || ""
            spec << "```#{lang}\n#{code_text}\n```\n\n"
          end
        when "table"
          rows = node.css("tr").map do |tr|
            tr.css("td, th").map { |cell| cell.text.strip.gsub(/\s+/, " ") }
          end
          if rows.any? && rows.first.any?
            # The first row is treated as the header row.
            spec << "| " + rows.first.join(" | ") + " |\n"
            spec << "| " + rows.first.map { "---" }.join(" | ") + " |\n"
            rows[1..-1].each do |row|
              spec << "| " + row.join(" | ") + " |\n" if row.any?
            end
            spec << "\n"
          end
        when "ul", "ol"
          items = node.css("li")
          items.each do |li|
            text = li.text.strip
            spec << "- #{text}\n" unless text.empty?
          end
          spec << "\n" if items.any?
        when "div"
          css_class = node["class"]
          if css_class&.include?("api-block")
            # Extract HTTP method and endpoint
            verb = node.css(".api-request-block-verb, .api-block-header").first&.text&.strip
            endpoint = node.css("span").map(&:text).join(" ").strip
            spec << "## Endpoint\n\n```\n#{verb} #{endpoint}\n```\n\n" if verb
          elsif css_class&.include?("api-info-row")
            # Extract parameter information
            heading = node.css("h3").first&.text&.strip
            next unless heading

            spec << "### #{heading}\n\n"
            node.css("dl.api-params dt").each do |dt|
              param_name = dt.css("code").first&.text&.strip
              next unless param_name

              param_type = dt.css("em").first&.text&.strip
              required = dt.css(".required").first ? "REQUIRED" : "OPTIONAL"
              # The element following the <dt> is read as its description.
              dd = dt.next_element
              description = dd&.css("p")&.first&.text&.strip || dd&.text&.strip
              spec << "- **#{param_name}** (#{param_type}) - #{required}\n"
              spec << " #{description}\n\n" if description && !description.empty?
            end
          end
        end
      end

      # Clean up excessive newlines
      result = spec.join.gsub(/\n{3,}/, "\n\n").strip

      result.length > 50 ? result : nil
    end

    # Step 0: Fetch main API page and extract all URLs from sidebar
    puts "Step 0: Fetching main API page and extracting URLs from sidebar..."
    main_api_url = "#{API_BASE_URL}/"
    main_api_html = fetch_with_curl(main_api_url)
    unless main_api_html
      puts "Error: Could not fetch main API page. Exiting."
      exit 1
    end

    # Extract top-level category URLs
    category_urls = extract_api_urls_from_sidebar(main_api_html)
    puts "Found #{category_urls.count} top-level category URLs"

    # Step 0.5: For each category page, fetch it and extract nested endpoint URLs
    puts "\nStep 0.5: Fetching category pages to extract nested endpoint URLs..."
    all_urls = Set.new(category_urls)
    # Pages already downloaded during this run; Step 1 skips them so that no
    # page is requested twice.
    fetched_urls = Set[main_api_url]

    category_urls.each do |category_url|
      next if category_url == main_api_url # Skip the main page, we already have it

      puts " Fetching category: #{category_url}"
      category_html = fetch_with_curl(category_url)
      next unless category_html

      fetched_urls << category_url
      nested_urls = extract_api_urls_from_sidebar(category_html)
      all_urls.merge(nested_urls)
      puts " Found #{nested_urls.count} URLs in #{category_url}"
    end

    URLS = all_urls.to_a.sort
    puts "\nTotal unique API URLs found: #{URLS.count}"
    puts "Sample URLs:"
    URLS.first(10).each { |url| puts " - #{url}" }
    puts " ..." if URLS.count > 10

    if URLS.empty?
      puts "Warning: No URLs found in sidebar. Exiting."
      exit 1
    end

    # Step 1: Fetch the HTML of every page that was not downloaded above
    puts "\nStep 1: Fetching HTML files..."
    URLS.each do |url|
      next if fetched_urls.include?(url)

      fetch_with_curl(url)
    end

    # Step 2: Extract body and purify
    puts "\nStep 2: Extracting and purifying content..."
    URLS.each do |url|
      html_file = File.join(TEMP_DIR, "#{sanitize_filename(url)}.html")
      # A missing file means the download failed; there is nothing to convert.
      next unless File.exist?(html_file)

      html = File.read(html_file)
      spec = purify_content(html, url)

      if spec && !spec.strip.empty?
        filename = File.join(OUTPUT_DIR, "#{sanitize_filename(url)}.md")
        File.write(filename, spec)
        puts " Extracted spec: #{File.basename(filename)}"
      else
        puts " Warning: Could not extract spec from #{url}"
      end
    end

    puts "\nDone! Specs saved to #{OUTPUT_DIR}"
    puts "Total URLs processed: #{URLS.count}"

    # Clean up: Remove raw_html folder
    puts "\nCleaning up raw HTML files..."
    if File.directory?(TEMP_DIR)
      FileUtils.rm_rf(TEMP_DIR)
      puts "Removed #{TEMP_DIR}"
    end