Last active
March 4, 2026 18:04
-
-
Save oneryalcin/30138206a2794c978b71ab6411d557bb to your computer and use it in GitHub Desktop.
Crawling Claude Developer Page
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Crawl and download Claude platform docs as markdown files.
# Run with --help for usage. Output is cached under $WORK_DIR, so
# re-runs only fetch pages that are not already on disk.
#
# -e: abort on unhandled failure; -u: error on unset variables;
# -o pipefail: a pipeline fails if any stage fails (not just the last).
set -euo pipefail

# Doc site endpoints and local cache locations.
BASE_URL="https://platform.claude.com/docs/en"    # NOTE(review): unreferenced below; kept for documentation — the literal prefix is used instead
SITEMAP_URL="https://platform.claude.com/sitemap.xml"
WORK_DIR="/tmp/claude-docs"        # scratch/cache root
OUT_DIR="$WORK_DIR/docs"           # downloaded .md files land here
URLS_FILE="$WORK_DIR/urls.txt"     # cached list of doc URLs from the sitemap
# Print the help text and exit successfully.
# Reads OUT_DIR (for the "Output:" line) and $0 (for the program name).
usage() {
  local prog
  prog=$(basename "$0")
  cat <<EOF
Usage: $prog [--all | <filter>]
Download Claude platform docs as markdown files.
Options:
  --all       Download all docs (~530 files)
  --help, -h  Show this help
  <filter>    Filter by path prefix
Examples:
  $prog --all                    # all docs
  $prog agent-sdk                # agent-sdk docs only
  $prog api/python               # python API docs
  $prog build-with-claude/prompt-engineering
Output: $OUT_DIR
EOF
  exit 0
}
# Argument handling: no arguments or a help flag shows usage and exits;
# --all clears the filter (download everything); anything else is a
# path-prefix filter.
[[ $# -gt 0 ]] || usage
case "$1" in
  --help|-h)
    usage
    ;;
  --all)
    FILTER=""
    ;;
  *)
    FILTER="$1"
    ;;
esac
mkdir -p "$WORK_DIR" "$OUT_DIR"

# Step 1: Fetch sitemap (cached across runs in $URLS_FILE).
if [[ ! -f "$URLS_FILE" ]]; then
  echo "Fetching sitemap..."
  # Write to a temp file and move into place only on success: redirecting
  # straight to $URLS_FILE would leave an empty file behind on failure,
  # and every later run would treat that empty file as a valid cache.
  if ! curl -sf "$SITEMAP_URL" \
      | grep -oE 'https://platform\.claude\.com/docs/en/[^<]+' \
      > "${URLS_FILE}.tmp"; then
    rm -f -- "${URLS_FILE}.tmp"
    echo "Failed to fetch or parse sitemap: $SITEMAP_URL" >&2
    exit 1
  fi
  mv -- "${URLS_FILE}.tmp" "$URLS_FILE"
fi
# Step 2: De-duplicate the URL list and apply the optional path filter.
CLEAN_URLS=$(sort -u "$URLS_FILE")
if [[ -n "$FILTER" ]]; then
  # -F: match the filter as a literal string, not a regex — user-supplied
  # filters like "api/python" contain characters grep would otherwise
  # treat as metacharacters. `|| true` keeps `set -e` from aborting when
  # nothing matches; the empty result is handled explicitly below.
  CLEAN_URLS=$(grep -F -- "/docs/en/${FILTER}" <<<"$CLEAN_URLS" || true)
  if [[ -z "$CLEAN_URLS" ]]; then
    echo "No URLs found matching filter: $FILTER" >&2
    exit 1
  fi
  echo "Filter: $FILTER"
fi
URL_COUNT=$(wc -l <<<"$CLEAN_URLS" | tr -d ' ')
echo "Found $URL_COUNT URLs"
# Step 3: Pre-create output dirs and split URLs into cached vs. to-download.
TODO_FILE="$WORK_DIR/todo_urls.txt"
> "$TODO_FILE"
SKIPPED=0
while IFS= read -r url; do
  [[ -z "$url" ]] && continue
  path="${url#https://platform.claude.com/docs/en/}"
  dir="$OUT_DIR/$(dirname "$path")"
  file="$OUT_DIR/${path}.md"
  mkdir -p "$dir"
  if [[ -f "$file" ]]; then
    # NB: `((SKIPPED++))` would be wrong here — the post-increment of 0
    # makes the arithmetic command return status 1, which aborts the whole
    # script under `set -e` on the very first cached file.
    SKIPPED=$((SKIPPED + 1))
  else
    echo "$url" >> "$TODO_FILE"
  fi
done <<< "$CLEAN_URLS"
TODO_COUNT=$(wc -l < "$TODO_FILE" | tr -d ' ')
echo "To download: $TODO_COUNT, Already cached: $SKIPPED"
# Step 4: Parallel download.
CONCURRENCY="${CRAWL_CONCURRENCY:-10}"

# Download a single doc page as markdown into $OUT_DIR.
# Arguments: $1 - full page URL (without the .md suffix)
# Outputs:   "OK: <path>" or "FAIL: <path>" on stdout
# On failure the partial/empty output file is removed, so step 3's
# "already cached" check will retry it on the next run.
download_one() {
  local url="$1"
  local path="${url#https://platform.claude.com/docs/en/}"
  local file="$OUT_DIR/${path}.md"
  # -f: fail on HTTP errors — without it curl happily saves a 404 error
  # page as the .md file and it is then treated as a valid cached doc.
  if curl -sf "${url}.md" -o "$file"; then
    echo "OK: $path"
  else
    rm -f -- "$file"
    echo "FAIL: $path"
  fi
}
export -f download_one
export OUT_DIR

if [[ "$TODO_COUNT" -gt 0 ]]; then
  # -P: run up to $CONCURRENCY curl processes at once.
  xargs -P "$CONCURRENCY" -I {} bash -c 'download_one "$@"' _ {} < "$TODO_FILE"
fi
# Final summary: blank separator line, counts, and the output location.
printf '\nDownloaded: %s, Skipped: %s\nSaved to: %s\n' \
  "$TODO_COUNT" "$SKIPPED" "$OUT_DIR"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment