Created
December 16, 2025 07:23
-
-
Save amkisko/a561aeba5fd536152ce90c88f2fc5550 to your computer and use it in GitHub Desktop.
Revisions
-
amkisko created this gist
Dec 16, 2025 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,362 @@ #!/usr/bin/env ruby require "fileutils" require "nokogiri" require "uri" BASE_URL = "https://docs.sentry.io" API_BASE_URL = "#{BASE_URL}/api" OUTPUT_DIR = File.join(File.dirname(__FILE__), "api_spec") TEMP_DIR = File.join(OUTPUT_DIR, "raw_html") FileUtils.mkdir_p(OUTPUT_DIR) FileUtils.mkdir_p(TEMP_DIR) def sanitize_filename(url) # Extract a safe filename from URL parts = url.gsub("https://docs.sentry.io/api/", "").gsub(/\/$/, "").split("/") filename = parts.join("_").gsub(/[^a-z0-9_-]/, "_") # Handle root/blank page - use "index" filename = "index" if filename.empty? || filename == "_" filename end def fetch_with_curl(url) filename = File.join(TEMP_DIR, "#{sanitize_filename(url)}.html") puts "Fetching: #{url}" begin require "open-uri" require "openssl" html = URI.open(url, "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36", :ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE).read File.write(filename, html) puts " Saved raw HTML to: #{filename}" html rescue => e puts " Error fetching #{url}: #{e.message}" # Try with curl as fallback (with -k to ignore SSL) if system("curl", "-k", "-s", "-L", "-H", "User-Agent: Mozilla/5.0", url, "-o", filename) if File.exist?(filename) && File.size(filename) > 0 puts " Saved raw HTML to: #{filename} (via curl)" File.read(filename) end end end end def extract_api_urls_from_sidebar(html) # Force UTF-8 encoding html = html.force_encoding("UTF-8") # Extract body HTML and remove script/style/svg body_match = html.match(/<body[^>]*>([\s\S]*)<\/body>/i) return [] unless body_match body_html = body_match[1] body_html = body_html.gsub(/<script[^>]*>[\s\S]*?<\/script>/i, "") body_html = body_html.gsub(/<style[^>]*>[\s\S]*?<\/style>/i, "") body_html = body_html.gsub(/<svg[^>]*>[\s\S]*?<\/svg>/i, "") body_html = body_html.gsub(/<svg[^>]*\/>/i, "") # Parse the cleaned body HTML doc = Nokogiri::HTML("<body>#{body_html}</body>") body = doc.css("body").first return [] unless body urls = [] # Find the "Sentry API" sidebar section # Look for a link with href="/api/" that contains "Sentry API" text api_section = body.xpath(".//a[contains(@href, '/api/') and contains(., 'Sentry API')]").first || body.xpath(".//a[@href='/api/']").first return [] unless api_section # Find the parent <li> with data-sidebar-branch api_li = api_section.ancestors("li[data-sidebar-branch='true']").first || api_section.parent return [] unless api_li # Find the nested <ul> with data-sidebar-tree inside this section api_ul = api_li.css("ul[data-sidebar-tree]").first if api_ul # Recursively extract all links from this section extract_links_recursive(api_ul, urls) end # Also include the main /api/ link itself urls << "/api/" unless urls.include?("/api/") # Convert relative URLs to absolute and normalize urls.map do |url| # Normalize: ensure it starts with /api/ and ends with / url = url.gsub(/\/$/, "") + "/" unless url.end_with?("/") url = "/api/" + url unless url.start_with?("/") if url.start_with?("/") "#{BASE_URL}#{url}" elsif url.start_with?("http") url else "#{API_BASE_URL}/#{url}" end end.uniq.sort end def extract_links_recursive(node, urls) # Extract all <a> tags with href attributes in this node and its children node.css("a[href]").each do |link| href = link["href"] next unless href # Only include API links (starting with /api/) if href.start_with?("/api/") # Normalize: remove trailing slash then add it back for consistency href = href.gsub(/\/$/, "") + "/" unless href.end_with?("/") urls << href unless urls.include?(href) end end # Recursively process nested lists (including those in CollapsibleSidebarLink components) node.css("ul[data-sidebar-tree], ul").each do |ul| extract_links_recursive(ul, urls) end # Also check for links in CollapsibleSidebarLink components (they might have nested children) node.css("li[data-sidebar-branch]").each do |li| extract_links_recursive(li, urls) end end def extract_body(html) doc = Nokogiri::HTML(html) # Remove script, style, and svg tags from the entire document doc.css("script, style, svg").remove # Extract body content body = doc.css("body").first body ? body.to_html : html end def purify_content(html, url) # Force UTF-8 encoding html = html.force_encoding("UTF-8") # First, extract body HTML as string and remove script/style/svg with regex # This ensures we work with clean HTML before parsing body_match = html.match(/<body[^>]*>([\s\S]*)<\/body>/i) return nil unless body_match body_html = body_match[1] # Remove script, style, and svg tags using regex (before parsing) body_html = body_html.gsub(/<script[^>]*>[\s\S]*?<\/script>/i, "") body_html = body_html.gsub(/<style[^>]*>[\s\S]*?<\/style>/i, "") body_html = body_html.gsub(/<svg[^>]*>[\s\S]*?<\/svg>/i, "") body_html = body_html.gsub(/<svg[^>]*\/>/i, "") # Self-closing svg tags # Now parse the cleaned body HTML doc = Nokogiri::HTML("<body>#{body_html}</body>") body = doc.css("body").first return nil unless body # Find main content - try multiple selectors main_content = body.xpath(".//*[@id='doc-content']").first || body.css("#doc-content").first || body.css("main #main").first || body.css("article, main, .content, #content, .documentation-content, .markdown-body").first unless main_content return nil end # Remove navigation, breadcrumbs, and other non-content elements main_content.css(".breadcrumbs, .not-prose, nav, aside, .grid, header, footer").remove # Get the content div content_div = if main_content.css("#main").any? main_content.css("#main").first else main_content end unless content_div return nil end spec = [] # Extract title from h1 or hgroup title_elem = main_content.css("h1").first || main_content.css("hgroup h1").first if title_elem title = title_elem.text.strip spec << "# #{title}\n\n" unless title.empty? end # Get all content elements in document order from #main or content_div main_section = content_div.css("#main").first || content_div # Get all headings, paragraphs, code blocks, and tables in order all_elements = main_section.css("h1, h2, h3, h4, h5, h6, p, pre, table, ul, ol, div.api-block, div.api-info-row, dl.api-params") all_elements.each do |node| next if node["class"]&.include?("breadcrumb") || node["class"]&.include?("not-prose") case node.name when "h1" text = node.text.strip spec << "# #{text}\n\n" unless text.empty? when "h2" node.css("a, svg").remove text = node.text.strip spec << "## #{text}\n\n" unless text.empty? when "h3" node.css("a, svg").remove text = node.text.strip spec << "### #{text}\n\n" unless text.empty? when "h4" node.css("a, svg").remove text = node.text.strip spec << "#### #{text}\n\n" unless text.empty? when "p" text = node.text.strip spec << "#{text}\n\n" if text.length > 10 when "pre" code_elem = node.css("code").first || node code_text = code_elem.text.strip if code_text.length > 0 lang = code_elem["class"]&.match(/language-(\w+)/)&.[](1) || "" spec << "```#{lang}\n#{code_text}\n```\n\n" end when "table" rows = node.css("tr").map do |tr| tr.css("td, th").map { |cell| cell.text.strip.gsub(/\s+/, " ") } end if rows.any? && rows.first.any? spec << "| " + rows.first.join(" | ") + " |\n" spec << "| " + rows.first.map { "---" }.join(" | ") + " |\n" rows[1..-1].each do |row| spec << "| " + row.join(" | ") + " |\n" if row.any? end spec << "\n" end when "ul", "ol" node.css("li").each do |li| text = li.text.strip spec << "- #{text}\n" if text.length > 0 end spec << "\n" if node.css("li").any? when "div" # Handle API blocks and info rows if node["class"]&.include?("api-block") # Extract HTTP method and endpoint verb = node.css(".api-request-block-verb, .api-block-header").first&.text&.strip endpoint = node.css("span").map(&:text).join(" ").strip if verb && endpoint spec << "## Endpoint\n\n```\n#{verb} #{endpoint}\n```\n\n" end elsif node["class"]&.include?("api-info-row") # Extract parameter information heading = node.css("h3").first&.text&.strip if heading spec << "### #{heading}\n\n" node.css("dl.api-params dt").each do |dt| param_name = dt.css("code").first&.text&.strip param_type = dt.css("em").first&.text&.strip required = dt.css(".required").first ? "REQUIRED" : "OPTIONAL" dd = dt.next_element description = dd&.css("p")&.first&.text&.strip || dd&.text&.strip if param_name spec << "- **#{param_name}** (#{param_type}) - #{required}\n" spec << " #{description}\n\n" if description && description.length > 0 end end end end end end # Clean up excessive newlines result = spec.join.gsub(/\n{3,}/, "\n\n").strip (result.length > 50) ? result : nil end # Step 0: Fetch main API page and extract all URLs from sidebar puts "Step 0: Fetching main API page and extracting URLs from sidebar..." main_api_html = fetch_with_curl("#{API_BASE_URL}/") unless main_api_html puts "Error: Could not fetch main API page. Exiting." exit 1 end # Extract top-level category URLs category_urls = extract_api_urls_from_sidebar(main_api_html) puts "Found #{category_urls.count} top-level category URLs" # Step 0.5: For each category page, fetch it and extract nested endpoint URLs puts "\nStep 0.5: Fetching category pages to extract nested endpoint URLs..." all_urls = Set.new(category_urls) category_urls.each do |category_url| next if category_url == "#{API_BASE_URL}/" # Skip the main page, we already have it puts " Fetching category: #{category_url}" category_html = fetch_with_curl(category_url) next unless category_html nested_urls = extract_api_urls_from_sidebar(category_html) nested_urls.each { |url| all_urls.add(url) } puts " Found #{nested_urls.count} URLs in #{category_url}" end URLS = all_urls.to_a.sort puts "\nTotal unique API URLs found: #{URLS.count}" puts "Sample URLs:" URLS.first(10).each { |url| puts " - #{url}" } puts " ..." if URLS.count > 10 if URLS.empty? puts "Warning: No URLs found in sidebar. Exiting." exit 1 end # Step 1: Fetch all HTML files with curl puts "\nStep 1: Fetching HTML files..." URLS.each do |url| fetch_with_curl(url) end # Step 2: Extract body and purify puts "\nStep 2: Extracting and purifying content..." URLS.each do |url| html_file = File.join(TEMP_DIR, "#{sanitize_filename(url)}.html") next unless File.exist?(html_file) html = File.read(html_file) spec = purify_content(html, url) if spec && !spec.strip.empty? filename = File.join(OUTPUT_DIR, "#{sanitize_filename(url)}.md") File.write(filename, spec) puts " Extracted spec: #{File.basename(filename)}" else puts " Warning: Could not extract spec from #{url}" end end puts "\nDone! Specs saved to #{OUTPUT_DIR}" puts "Total URLs processed: #{URLS.count}" # Clean up: Remove raw_html folder puts "\nCleaning up raw HTML files..." if File.directory?(TEMP_DIR) FileUtils.rm_rf(TEMP_DIR) puts "Removed #{TEMP_DIR}" end