ynott · April 7, 2026 04:11 · ynott · Apr 7, 2026
diff --git a/test-ollama-embed-crash.sh b/test-ollama-embed-crash.sh
 #!/usr/bin/env bash
 #
 # Reproduction test script for Ollama /api/embed crash
 # with jeffh/intfloat-multilingual-e5-small:q8_0
 #
 # v2: removed jq dependency, JSON is escaped in pure bash.
 #
 # Usage:
 #   ./test_ollama_embed_crash_v2.sh [OLLAMA_HOST] [MODEL]
 #
 # Examples:
 #   ./test_ollama_embed_crash_v2.sh
 #   ./test_ollama_embed_crash_v2.sh http://172.16.203.123:11434
 #   ./test_ollama_embed_crash_v2.sh http://localhost:11434 bge-m3
 #
 # Exit code: number of crashed test cases, or 255 if the server is unreachable.

 # Treat any use of an unset variable as an error.
 set -u

 # Positional arguments, each with a default:
 #   $1 - base URL of the Ollama server
 #   $2 - embedding model to probe
 OLLAMA_HOST="${1:-http://localhost:11434}"
 MODEL="${2:-jeffh/intfloat-multilingual-e5-small:q8_0}"
 # Drop one trailing slash so the joined URL never contains "//api".
 ENDPOINT="${OLLAMA_HOST%/}/api/embed"

 # Colors (disabled if not a tty)
 if [ -t 1 ]; then
    GREEN='\033[0;32m'
    RED='\033[0;31m'
    YELLOW='\033[0;33m'
    BOLD='\033[1m'
    RESET='\033[0m'
 else
    GREEN='' RED='' YELLOW='' BOLD='' RESET=''
 fi

 echo -e "${BOLD}Ollama /api/embed crash reproduction test (v2)${RESET}"
 echo "Endpoint: $ENDPOINT"
 echo "Model:    $MODEL"
 echo

 # Pre-flight: check the endpoint is reachable.
 # --max-time bounds the check so an unresponsive host cannot hang the script.
 # The body is discarded; only the HTTP status code printed by -w is captured.
 TAGS_URL="${OLLAMA_HOST%/}/api/tags"
 HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 "$TAGS_URL")
 if [ "$HTTP_CODE" != "200" ]; then
    # Report the URL that was actually requested, plus the status received.
    echo -e "${RED}ERROR:${RESET} cannot reach $TAGS_URL (HTTP status ${HTTP_CODE:-none})" >&2
    echo "Make sure Ollama is running and reachable." >&2
    exit 255
 fi

 # JSON string escape in pure bash.
 # Arguments: $1 - raw string
 # Outputs:   the string as a double-quoted JSON string literal on stdout
 #            (no trailing newline)
 # Backslash, double quote and \b \f \n \r \t get their short escapes. Any
 # other control character (as classified by the current locale) is written
 # as \uXXXX, because JSON does not allow raw control characters in strings.
 # Other multibyte UTF-8 bytes are passed through unchanged, which is valid JSON.
 json_escape() {
    local s="$1"
    local out ch code i
    s="${s//\\/\\\\}"   # \ -> \\  (first, so later escapes are not doubled)
    s="${s//\"/\\\"}"   # " -> \"
    s="${s//$'\b'/\\b}"
    s="${s//$'\f'/\\f}"
    s="${s//$'\n'/\\n}"
    s="${s//$'\r'/\\r}"
    s="${s//$'\t'/\\t}"
    # Slow path only when some control character is still present.
    if [[ "$s" == *[[:cntrl:]]* ]]; then
        out=''
        for ((i = 0; i < ${#s}; i++)); do
            ch="${s:i:1}"
            case "$ch" in
                [[:cntrl:]])
                    # "'c" makes printf yield the character's numeric code.
                    printf -v code '%d' "'$ch"
                    printf -v ch '\\u%04x' "$code"
                    ;;
            esac
            out+="$ch"
        done
        s="$out"
    fi
    printf '"%s"' "$s"
 }

 # Test cases: label | input | expected (ok|crash)
 # Each entry is one string with three '|'-separated fields. The spaces used
 # to align the columns are stripped by the driver loop before the input is
 # sent, so only spaces inside an input (e.g. "日 本 語") are significant.
 # "expected" reflects the behavior observed with
 # jeffh/intfloat-multilingual-e5-small:q8_0 on Ollama v0.20.2.
 # For other models (e.g. bge-m3) all cases are expected to pass ("ok").
 TESTS=(
    "ASCII single char         |.                  |ok"
    "ASCII two chars           |ab                 |ok"
    "ASCII word                |est                |ok"
    "Single katakana 'te'      |テ                 |ok"
    "Single katakana 'su'      |ス                 |ok"
    "Katakana 'suta'           |スタ               |ok"
    "Katakana 'tesu'           |テス               |crash"
    "Katakana 'kana'           |カナ               |crash"
    "Single hiragana 'a'       |あ                 |ok"
    "Hiragana 'ai'             |あい               |ok"
    "Kanji two chars 'nihon'   |日本               |ok"
    "Kanji three 'nihongo'     |日本語             |ok"
    "Ideographic full stop     |。                 |crash"
    "Ideographic comma         |、                 |crash"
    "Japanese sentence         |テスト文章です。   |crash"
    "e5 prefixed sentence      |query: テスト文章です。 |crash"
    "Spaced ideographs         |日 本 語           |ok"
 )

 # Running tallies updated by the driver loop:
 #   PASS         - cases whose actual result equals the expected one
 #   FAIL         - cases whose actual result is "crash" (expected or not)
 #   UNEXPECTED   - cases whose actual result differs from the expected one
 #   CRASH_INPUTS - "label :: input" for every crashed case
 PASS=0
 FAIL=0
 UNEXPECTED=0
 CRASH_INPUTS=()

 # Send one input string to the embed endpoint and classify the reply.
 # Globals:   MODEL, ENDPOINT (read)
 # Arguments: $1 - text to embed
 # Outputs:   "ok" on stdout when the reply contains an "embeddings" key,
 #            otherwise "crash"
 probe() {
    local text="$1"
    local request response

    request="{\"model\":$(json_escape "$MODEL"),\"input\":$(json_escape "$text")}"

    # stderr is folded into the captured text, so curl's own error output
    # is classified the same way as a reply without embeddings.
    response=$(curl -sS -X POST "$ENDPOINT" \
        --max-time 30 \
        -H 'Content-Type: application/json' \
        -d "$request" 2>&1)

    if grep -q '"embeddings"' <<<"$response"; then
        echo "ok"
        return
    fi
    echo "crash"
 }

 # Result table: header row followed by a 90-dash rule.
 printf "${BOLD}%-30s | %-25s | %-8s | %s${RESET}\n" "Label" "Input" "Expected" "Actual"
 printf -- '-%.0s' {1..90}; echo

 for row in "${TESTS[@]}"; do
    # Split the row on '|'; -r keeps backslashes literal.
    IFS='|' read -r label input expected <<<"$row"
    # Strip column padding: trailing spaces only for the label, both ends
    # for the input and the expected verdict.
    label="${label%"${label##*[! ]}"}"
    input="${input#"${input%%[! ]*}"}"
    input="${input%"${input##*[! ]}"}"
    expected="${expected#"${expected%%[! ]*}"}"
    expected="${expected%"${expected##*[! ]}"}"

    actual=$(probe "$input")

    if [ "$actual" != "$expected" ]; then
        color="$YELLOW"; tag="DIFFER"
        UNEXPECTED=$((UNEXPECTED+1))
    else
        color="$GREEN"; tag="MATCH "
        PASS=$((PASS+1))
    fi

    # Crashes are tallied whether or not they were expected.
    case "$actual" in
        crash)
            FAIL=$((FAIL+1))
            CRASH_INPUTS+=("$label :: $input")
            ;;
    esac

    printf "${color}%-30s | %-25s | %-8s | %s [%s]${RESET}\n" \
        "$label" "$input" "$expected" "$actual" "$tag"
 done

 total=${#TESTS[@]}

 echo
 printf -- '-%.0s' {1..90}; echo
 echo -e "${BOLD}Summary${RESET}"
 echo "  Total tests : $total"
 echo -e "  ${GREEN}OK${RESET}          : $((total - FAIL))"
 echo -e "  ${RED}Crashed${RESET}     : $FAIL"
 echo -e "  ${YELLOW}Unexpected${RESET}  : $UNEXPECTED  (actual differs from expected)"

 if [ "$FAIL" -gt 0 ]; then
    echo
    echo -e "${BOLD}Crashed inputs:${RESET}"
    for crashed in "${CRASH_INPUTS[@]}"; do
        echo "  - $crashed"
    done
 fi

 echo
 # Verdict: no crashes at all, crashes exactly as listed, or a mixed picture.
 if [ "$FAIL" -eq 0 ]; then
    echo "All inputs returned embeddings successfully. This model is not affected."
 elif [ "$UNEXPECTED" -eq 0 ]; then
    echo "Reproduction confirmed: this matches the documented crash pattern for"
    echo "jeffh/intfloat-multilingual-e5-small:q8_0 on Ollama new-engine builds."
 else
    echo "Partial reproduction. Some cases differ from the reference pattern;"
    echo "behavior may vary by Ollama version or model build."
 fi

 # The exit status is the number of crashed cases.
 exit "$FAIL"
	#!/usr/bin/env bash
	#
	# Reproduction test script for Ollama /api/embed crash
	# with jeffh/intfloat-multilingual-e5-small:q8_0
	#
	# v2: removed jq dependency, JSON is escaped in pure bash.
	#
	# Usage:
	# ./test_ollama_embed_crash_v2.sh [OLLAMA_HOST] [MODEL]
	#
	# Examples:
	# ./test_ollama_embed_crash_v2.sh
	# ./test_ollama_embed_crash_v2.sh http://172.16.203.123:11434
	# ./test_ollama_embed_crash_v2.sh http://localhost:11434 bge-m3
	#
	# Exit code: number of crashed test cases, or 255 if the server is unreachable.

	# Treat any use of an unset variable as an error.
	set -u

	# Positional arguments, each with a default:
	#   $1 - base URL of the Ollama server
	#   $2 - embedding model to probe
	OLLAMA_HOST="${1:-http://localhost:11434}"
	MODEL="${2:-jeffh/intfloat-multilingual-e5-small:q8_0}"
	# Drop one trailing slash so the joined URL never contains "//api".
	ENDPOINT="${OLLAMA_HOST%/}/api/embed"

	# Colors (disabled if not a tty)
	if [ -t 1 ]; then
		GREEN='\033[0;32m'
		RED='\033[0;31m'
		YELLOW='\033[0;33m'
		BOLD='\033[1m'
		RESET='\033[0m'
	else
		GREEN='' RED='' YELLOW='' BOLD='' RESET=''
	fi

	echo -e "${BOLD}Ollama /api/embed crash reproduction test (v2)${RESET}"
	echo "Endpoint: $ENDPOINT"
	echo "Model: $MODEL"
	echo

	# Pre-flight: check the endpoint is reachable.
	# The status code is captured and compared directly, so no pipeline is
	# needed. --max-time keeps an unresponsive host from hanging the script.
	TAGS_URL="${OLLAMA_HOST%/}/api/tags"
	HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 "$TAGS_URL")
	if [ "$HTTP_CODE" != "200" ]; then
		# Report the URL that was actually requested, plus the status received.
		echo -e "${RED}ERROR:${RESET} cannot reach $TAGS_URL (HTTP status ${HTTP_CODE:-none})" >&2
		echo "Make sure Ollama is running and reachable." >&2
		exit 255
	fi

	# JSON string escape in pure bash.
	# Arguments: $1 - raw string
	# Outputs:   the string as a double-quoted JSON string literal on stdout
	#            (no trailing newline)
	# Backslash, double quote and \b \f \n \r \t get their short escapes. Any
	# other control character (as classified by the current locale) is written
	# as \uXXXX, because JSON does not allow raw control characters in strings.
	# Other multibyte UTF-8 bytes are passed through unchanged, which is valid JSON.
	json_escape() {
		local s="$1"
		local out ch code i
		s="${s//\\/\\\\}" # \ -> \\  (first, so later escapes are not doubled)
		s="${s//\"/\\\"}" # " -> \"
		s="${s//$'\b'/\\b}"
		s="${s//$'\f'/\\f}"
		s="${s//$'\n'/\\n}"
		s="${s//$'\r'/\\r}"
		s="${s//$'\t'/\\t}"
		# Slow path only when some control character is still present.
		if [[ "$s" == *[[:cntrl:]]* ]]; then
			out=''
			for ((i = 0; i < ${#s}; i++)); do
				ch="${s:i:1}"
				case "$ch" in
					[[:cntrl:]])
						# "'c" makes printf yield the character's numeric code.
						printf -v code '%d' "'$ch"
						printf -v ch '\\u%04x' "$code"
						;;
				esac
				out+="$ch"
			done
			s="$out"
		fi
		printf '"%s"' "$s"
	}

	# Test cases: label | input | expected (ok|crash)
	# Each entry is one string with three '|'-separated fields; the single
	# space before each '|' is padding, not part of the field.
	# "expected" reflects the behavior observed with
	# jeffh/intfloat-multilingual-e5-small:q8_0 on Ollama v0.20.2.
	# For other models (e.g. bge-m3) all cases are expected to pass ("ok").
	TESTS=(
		"ASCII single char |. |ok"
		"ASCII two chars |ab |ok"
		"ASCII word |est |ok"
		"Single katakana 'te' |テ |ok"
		"Single katakana 'su' |ス |ok"
		"Katakana 'suta' |スタ |ok"
		"Katakana 'tesu' |テス |crash"
		"Katakana 'kana' |カナ |crash"
		"Single hiragana 'a' |あ |ok"
		"Hiragana 'ai' |あい |ok"
		"Kanji two chars 'nihon' |日本 |ok"
		"Kanji three 'nihongo' |日本語 |ok"
		"Ideographic full stop |。 |crash"
		"Ideographic comma |、 |crash"
		"Japanese sentence |テスト文章です。 |crash"
		"e5 prefixed sentence |query: テスト文章です。 |crash"
		# The spaces between the ideographs are the point of this case.
		"Spaced ideographs |日 本 語 |ok"
	)

	# Running tallies for the test run:
	#   PASS         - cases whose actual result equals the expected one
	#   FAIL         - cases whose actual result is "crash"
	#   UNEXPECTED   - cases whose actual result differs from the expected one
	#   CRASH_INPUTS - "label :: input" for every crashed case
	PASS=0
	FAIL=0
	UNEXPECTED=0
	CRASH_INPUTS=()

	# Send one input string to the embed endpoint and classify the reply.
	# Globals:   MODEL, ENDPOINT (read)
	# Arguments: $1 - text to embed
	# Outputs:   exactly one word on stdout: "ok" when the reply contains an
	#            "embeddings" key, otherwise "crash"
	probe() {
		local input="$1"
		local model_json input_json payload body

		model_json=$(json_escape "$MODEL")
		input_json=$(json_escape "$input")
		payload="{\"model\":${model_json},\"input\":${input_json}}"

		# stderr is captured too, so curl's own error output is classified
		# the same way as a reply without embeddings.
		body=$(curl -sS -X POST "$ENDPOINT" \
			-H 'Content-Type: application/json' \
			--max-time 30 \
			-d "$payload" 2>&1)

		# Match inside the shell: nothing but the verdict reaches stdout.
		case "$body" in
			*'"embeddings"'*) echo "ok" ;;
			*) echo "crash" ;;
		esac
	}

	# Result table: header row followed by a 90-dash rule.
	printf "${BOLD}%-30s | %-25s | %-8s | %s${RESET}\n" "Label" "Input" "Expected" "Actual"
	printf -- '-%.0s' {1..90}; echo

	for entry in "${TESTS[@]}"; do
		# Split the entry on '|'; -r keeps backslashes literal.
		IFS='|' read -r label input expected <<<"$entry"
		# Strip all column padding: trailing spaces for the label, both ends
		# for the input and the expected verdict.
		label="${label%"${label##*[! ]}"}"
		input="${input#"${input%%[! ]*}"}"
		input="${input%"${input##*[! ]}"}"
		expected="${expected#"${expected%%[! ]*}"}"
		expected="${expected%"${expected##*[! ]}"}"

		actual=$(probe "$input")

		if [ "$actual" = "$expected" ]; then
			color="$GREEN"; tag="MATCH "
			PASS=$((PASS+1))
		else
			color="$YELLOW"; tag="DIFFER"
			UNEXPECTED=$((UNEXPECTED+1))
		fi

		# Crashes are tallied whether or not they were expected.
		if [ "$actual" = "crash" ]; then
			FAIL=$((FAIL+1))
			CRASH_INPUTS+=("$label :: $input")
		fi

		printf "${color}%-30s | %-25s | %-8s | %s [%s]${RESET}\n" \
			"$label" "$input" "$expected" "$actual" "$tag"
	done

	echo
	printf -- '-%.0s' {1..90}; echo
	echo -e "${BOLD}Summary${RESET}"
	echo " Total tests : ${#TESTS[@]}"
	echo -e " ${GREEN}OK${RESET} : $((${#TESTS[@]} - FAIL))"
	echo -e " ${RED}Crashed${RESET} : $FAIL"
	echo -e " ${YELLOW}Unexpected${RESET} : $UNEXPECTED (actual differs from expected)"

	if [ "$FAIL" -gt 0 ]; then
		echo
		echo -e "${BOLD}Crashed inputs:${RESET}"
		for item in "${CRASH_INPUTS[@]}"; do
			echo " - $item"
		done
	fi

	echo
	# Verdict: crashes exactly as listed, no crashes at all, or a mixed picture.
	if [ "$UNEXPECTED" -eq 0 ] && [ "$FAIL" -gt 0 ]; then
		echo "Reproduction confirmed: this matches the documented crash pattern for"
		echo "jeffh/intfloat-multilingual-e5-small:q8_0 on Ollama new-engine builds."
	elif [ "$FAIL" -eq 0 ]; then
		echo "All inputs returned embeddings successfully. This model is not affected."
	else
		echo "Partial reproduction. Some cases differ from the reference pattern;"
		echo "behavior may vary by Ollama version or model build."
	fi

	# The exit status is the number of crashed cases.
	exit "$FAIL"
No results found