Skip to content

Instantly share code, notes, and snippets.

@oneryalcin
Last active March 4, 2026 18:04
Show Gist options
  • Select an option

  • Save oneryalcin/30138206a2794c978b71ab6411d557bb to your computer and use it in GitHub Desktop.

Select an option

Save oneryalcin/30138206a2794c978b71ab6411d557bb to your computer and use it in GitHub Desktop.
Crawling Claude Developer Page
#!/bin/bash
# Crawl and download Claude platform docs as markdown.
# Usage: run with --help for options. Env: CRAWL_CONCURRENCY overrides the
# parallel download count (default 10, see Step 4).
#
# -u catches typo'd/unset variables; pipefail surfaces curl failures that
# plain -e would miss inside pipelines.
set -euo pipefail

# Remote endpoints. BASE_URL is kept for reference; the actual URL list
# comes from the sitemap.
readonly BASE_URL="https://platform.claude.com/docs/en"
readonly SITEMAP_URL="https://platform.claude.com/sitemap.xml"

# Local cache layout: cached sitemap URL list + mirrored markdown tree.
readonly WORK_DIR="/tmp/claude-docs"
readonly OUT_DIR="$WORK_DIR/docs"
readonly URLS_FILE="$WORK_DIR/urls.txt"
# Print help text to stdout and exit 0.
# Globals: OUT_DIR (read, shown in the Output line); $0 (program name).
usage() {
  cat <<EOF
Usage: $(basename "$0") [--all | <filter>]

Download Claude platform docs as markdown files.

Options:
  --all        Download all docs (~530 files)
  --help, -h   Show this help
  <filter>     Filter by path prefix

Examples:
  $(basename "$0") --all        # all docs
  $(basename "$0") agent-sdk    # agent-sdk docs only
  $(basename "$0") api/python   # python API docs
  $(basename "$0") build-with-claude/prompt-engineering

Output: $OUT_DIR
EOF
  exit 0
}
# --- Argument handling -------------------------------------------------
# No argument, or an explicit help flag, prints usage (which exits 0).
if [[ $# -eq 0 || "$1" == "--help" || "$1" == "-h" ]]; then
  usage
fi

# --all means "no filter"; any other argument is a path-prefix filter.
case "$1" in
  --all) FILTER="" ;;
  *)     FILTER="$1" ;;
esac
mkdir -p "$WORK_DIR" "$OUT_DIR"

# Step 1: Fetch sitemap (cached across runs in $URLS_FILE).
if [[ ! -f "$URLS_FILE" ]]; then
  echo "Fetching sitemap..."
  # -f makes curl fail on HTTP errors so an error page is never parsed as a
  # sitemap; writing to a temp file first means a failed fetch (curl error,
  # or grep finding no doc URLs → pipeline exits non-zero under set -e)
  # never leaves a bad cache behind to poison later runs.
  tmp="$URLS_FILE.tmp"
  curl -fsS "$SITEMAP_URL" | grep -oE 'https://platform\.claude\.com/docs/en/[^<]+' > "$tmp"
  mv "$tmp" "$URLS_FILE"
fi

# Step 2: Deduplicate, then narrow to the requested path prefix (if any).
CLEAN_URLS=$(sort -u "$URLS_FILE")
if [[ -n "$FILTER" ]]; then
  # `|| true`: an empty grep result is handled explicitly just below.
  CLEAN_URLS=$(echo "$CLEAN_URLS" | grep "/docs/en/${FILTER}" || true)
  if [[ -z "$CLEAN_URLS" ]]; then
    echo "No URLs found matching filter: $FILTER"
    exit 1
  fi
  echo "Filter: $FILTER"
fi

URL_COUNT=$(echo "$CLEAN_URLS" | wc -l | tr -d ' ')
echo "Found $URL_COUNT URLs"
# Step 3: Pre-create output dirs and split URLs into cached vs. to-download.
# A URL maps to $OUT_DIR/<path-after-/docs/en/>.md; existing files are skipped.
TODO_FILE="$WORK_DIR/todo_urls.txt"
> "$TODO_FILE"
SKIPPED=0
while read -r url; do
  [[ -z "$url" ]] && continue
  path="${url#https://platform.claude.com/docs/en/}"
  dir="$OUT_DIR/$(dirname "$path")"
  file="$OUT_DIR/${path}.md"
  mkdir -p "$dir"
  if [[ -f "$file" ]]; then
    # NB: deliberately not ((SKIPPED++)) — post-increment evaluates to the
    # OLD value, so when SKIPPED is 0 the arithmetic command returns status 1
    # and `set -e` would abort the script on the first cached file.
    SKIPPED=$((SKIPPED + 1))
  else
    echo "$url" >> "$TODO_FILE"
  fi
done <<< "$CLEAN_URLS"

TODO_COUNT=$(wc -l < "$TODO_FILE" | tr -d ' ')
echo "To download: $TODO_COUNT, Already cached: $SKIPPED"
# Step 4: Parallel download (CRAWL_CONCURRENCY workers, default 10).
CONCURRENCY="${CRAWL_CONCURRENCY:-10}"

# Download one URL's markdown rendering to its mirrored path under OUT_DIR.
# Prints "OK: <path>" or "FAIL: <path>" and always returns 0 so one bad URL
# does not abort the whole xargs batch.
# Globals: OUT_DIR (read). Arguments: $1 - doc URL (without the .md suffix).
download_one() {
  local url="$1"
  local path="${url#https://platform.claude.com/docs/en/}"
  local file="$OUT_DIR/${path}.md"
  # -f treats HTTP errors (404 etc.) as failures instead of saving the error
  # body as a .md file. On failure, remove any partial output so the Step 3
  # cache check retries this URL on the next run.
  if curl -fsS "${url}.md" -o "$file"; then
    echo "OK: $path"
  else
    rm -f -- "$file"
    echo "FAIL: $path"
  fi
}
export -f download_one
export OUT_DIR

if [[ "$TODO_COUNT" -gt 0 ]]; then
  xargs -P "$CONCURRENCY" -I {} bash -c 'download_one "$@"' _ {} < "$TODO_FILE"
fi

echo ""
echo "Downloaded: $TODO_COUNT, Skipped: $SKIPPED"
echo "Saved to: $OUT_DIR"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment