#!/usr/bin/env bash
# ──────────────────────────────────────────────────────────────────────────────
# Vast.ai PROVISIONING_SCRIPT for the watermark-remover service.
#
# Host this file at a public URL (GitHub Gist recommended) and set:
#   PROVISIONING_SCRIPT=https://gist.githubusercontent.com/.../provisioning.sh
#
# Required env vars (set in Vast.ai instance template):
#   HF_TOKEN       — HuggingFace access token (for FLUX.1-Kontext-dev)
#   GITHUB_TOKEN   — GitHub PAT with repo read access (to fetch main.py)
#
# Optional env vars:
#   GITHUB_REPO    — owner/repo (default: tendailuke/huren)
#   BRANCH         — git branch (default: main)
#   KONTEXT_MODEL  — HF model ID (default: black-forest-labs/FLUX.1-Kontext-dev)
#   KONTEXT_STEPS  — diffusion steps (default: 28)
#   KONTEXT_MAX_PX — max input image side in px (default: 1024)
#   PORT           — service port (default: 8081)
# ──────────────────────────────────────────────────────────────────────────────
set -euo pipefail

WORKDIR="/workspace/watermark-remover"
GITHUB_REPO="${GITHUB_REPO:-tendailuke/huren}"
BRANCH="${BRANCH:-main}"
KONTEXT_MODEL="${KONTEXT_MODEL:-black-forest-labs/FLUX.1-Kontext-dev}"
PORT="${PORT:-8081}"
HOST="${HOST:-0.0.0.0}"
LOG_FILE="/workspace/watermark-remover.log"
LOG="$LOG_FILE"

mkdir -p /workspace "$WORKDIR"

STATUS_FILE="/tmp/provision-status.json"

# Write a status JSON visible to vastai-watch via the status-server.
set_status() {
    local stage="$1" progress="$2" detail="${3:-}"
    printf '{"status":"loading","stage":"%s","progress":%s,"detail":"%s"}\n' \
        "$stage" "$progress" "$detail" > "$STATUS_FILE"
    echo "[provision] [$stage $progress%] $detail" | tee -a "$LOG_FILE"
}

set_status "starting" 0 "Provisioning started"

# ── Geo detection — enable CN mirrors if running in China ─────────────────────
COUNTRY=$(curl -fsSL --max-time 5 "https://ipinfo.io/country" 2>/dev/null | tr -d '[:space:]' || echo "")
CN_MODE=0
if [[ "$COUNTRY" == "CN" ]]; then
    CN_MODE=1
    echo "[provision] CN instance detected — will use CN mirrors for PyTorch and HuggingFace." | tee -a "$LOG_FILE"
fi

# ── Fetch status-server.py and start it immediately ──────────────────────────
# This occupies PORT so vastai-watch sees "loading" instead of connection refused.
echo "[provision] Fetching status-server.py..." | tee -a "$LOG_FILE"
curl -fsSL \
    -H "Authorization: Bearer ${GITHUB_TOKEN}" \
    -H "Accept: application/vnd.github.raw" \
    "https://api.github.com/repos/${GITHUB_REPO}/contents/services/watermark-remover/status-server.py?ref=${BRANCH}" \
    -o "${WORKDIR}/status-server.py"

STATUS_FILE="$STATUS_FILE" PORT="$PORT" python3 "${WORKDIR}/status-server.py" >> "${LOG%.log}-status-server.log" 2>&1 &
STATUS_SERVER_PID=$!
echo "[provision] Status server PID ${STATUS_SERVER_PID} on :${PORT}" | tee -a "$LOG_FILE"

# ── Explicit venv binaries (never rely on PATH order) ─────────────────────────
VENV="/venv/main"
PIP="$VENV/bin/pip"
PYTHON="$VENV/bin/python3"
UVICORN="$VENV/bin/uvicorn"

# ── Activate venv ─────────────────────────────────────────────────────────────
set_status "venv" 5 "Activating virtual environment"
if . "$VENV/bin/activate"; then
    echo "Virtual environment activated: $("$PYTHON" --version)" | tee -a "$LOG_FILE"
else
    echo "Failed to activate virtual environment" | tee -a "$LOG_FILE"
    exit 1
fi

# ── Change to workspace ────────────────────────────────────────────────────────
if cd /workspace/; then
    pwd | tee -a "$LOG_FILE"
else
    echo "Failed to change directory to /workspace/" | tee -a "$LOG_FILE"
    exit 1
fi

cd "$WORKDIR"

# ── Fetch service code from private repo ──────────────────────────────────────
set_status "fetch-code" 8 "Fetching service code from ${GITHUB_REPO}@${BRANCH}"
if [[ -z "${GITHUB_TOKEN:-}" ]]; then
    echo "[provision] ERROR: GITHUB_TOKEN not set. Cannot fetch service code."
    exit 1
fi

for file in main.py coordinator.py requirements.txt; do
    curl -fsSL \
        -H "Authorization: Bearer ${GITHUB_TOKEN}" \
        -H "Accept: application/vnd.github.raw" \
        "https://api.github.com/repos/${GITHUB_REPO}/contents/services/watermark-remover/${file}?ref=${BRANCH}" \
        -o "${WORKDIR}/${file}"
    echo "[provision] Downloaded ${file}."
done

# ── Install Python deps ───────────────────────────────────────────────────────
# Install torch first with the wheel URL that matches the installed CUDA driver.
# Default PyPI torch wheels target CUDA 12.1+ and fail on older drivers.
echo "[provision] Detecting CUDA driver version..."
CUDA_DRIVER=$(nvidia-smi 2>/dev/null | grep -oP "CUDA Version: \K[0-9]+\.[0-9]+" | head -1 || echo "0.0")
CUDA_MAJOR=$(echo "$CUDA_DRIVER" | cut -d. -f1)
CUDA_MINOR=$(echo "$CUDA_DRIVER" | cut -d. -f2)
CUDA_NUM=$((CUDA_MAJOR * 10 + CUDA_MINOR))  # e.g. 12.1 → 121, 12.0 → 120

if   [ "$CUDA_NUM" -ge 124 ]; then TORCH_CU="cu124"
elif [ "$CUDA_NUM" -ge 121 ]; then TORCH_CU="cu121"
elif [ "$CUDA_NUM" -ge 120 ]; then TORCH_CU="cu121"   # cu120 wheels not published; cu121 works on 12.0 driver with minor mismatch—use cu118 for safety
elif [ "$CUDA_NUM" -ge 118 ]; then TORCH_CU="cu118"
else                                TORCH_CU="cu118"
fi

# CUDA 12.0 driver (520.x) cannot run cu121 wheels — use cu118 which is compatible.
if [ "$CUDA_MAJOR" -eq 12 ] && [ "$CUDA_MINOR" -eq 0 ]; then
    TORCH_CU="cu118"
fi

if [ "$CN_MODE" -eq 1 ]; then
    TORCH_INDEX="https://mirrors.nju.edu.cn/pytorch/whl/${TORCH_CU}"
else
    TORCH_INDEX="https://download.pytorch.org/whl/${TORCH_CU}"
fi

# Check if torch is already installed and CUDA works — skip reinstall if so.
install_torch() {
    set_status "torch" 12 "Installing torch ${TORCH_CU} from ${TORCH_INDEX}"
    "$PIP" install --quiet --no-cache-dir --force-reinstall \
        "torch>=2.3.0" "torchvision" "torchaudio" \
        --index-url "$TORCH_INDEX"
    echo "[provision] torch installed." | tee -a "$LOG_FILE"
}

echo "[provision] Checking existing torch/CUDA..." | tee -a "$LOG_FILE"
if "$PYTHON" -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'" 2>/dev/null; then
    TORCH_VER=$("$PYTHON" -c "import torch; print(torch.__version__)" 2>/dev/null)
    echo "[provision] torch ${TORCH_VER} with CUDA already working — skipping reinstall." | tee -a "$LOG_FILE"
    set_status "torch" 12 "torch ${TORCH_VER} already installed"
else
    echo "[provision] torch/CUDA not working — installing from ${TORCH_INDEX}" | tee -a "$LOG_FILE"
    install_torch
fi

set_status "deps" 20 "Installing Python dependencies"
grep -vE '^torch' requirements.txt | "$PIP" install --quiet --no-cache-dir -r /dev/stdin
echo "[provision] Deps installed."

# ── HuggingFace login ─────────────────────────────────────────────────────────
if [[ -z "${HF_TOKEN:-}" ]]; then
    echo "[provision] ERROR: HF_TOKEN not set. Cannot download ${KONTEXT_MODEL}."
    exit 1
fi
if [ "$CN_MODE" -eq 1 ]; then
    export HF_ENDPOINT="https://hf-mirror.com"
    echo "[provision] CN mode: HF_ENDPOINT=${HF_ENDPOINT}" | tee -a "$LOG_FILE"
fi

set_status "hf-login" 25 "Logging in to HuggingFace"
"$PYTHON" -c "
import os
from huggingface_hub import login
login(token=os.environ['HF_TOKEN'], add_to_git_credential=True)
print('[provision] HuggingFace login OK.')
"

# ── Pre-download model weights (with progress reporting to status-server) ─────
set_status "model-download" 30 "Starting model download ${KONTEXT_MODEL} (~34 GB)"
"$PYTHON" - <<PYEOF
import json, os, time
from pathlib import Path
from huggingface_hub import snapshot_download
from huggingface_hub.utils import tqdm as hf_tqdm

STATUS_FILE = Path(os.getenv("STATUS_FILE", "/tmp/provision-status.json"))
model = os.environ.get("KONTEXT_MODEL", "black-forest-labs/FLUX.1-Kontext-dev")

def write_status(progress: int, detail: str) -> None:
    STATUS_FILE.write_text(json.dumps({
        "status": "loading",
        "stage": "model-download",
        "progress": progress,
        "detail": detail,
    }))

# huggingface_hub fires tqdm callbacks — patch them to also write status.
_orig_init = hf_tqdm.__init__
_total_bytes = [0]
_done_bytes  = [0]

def _patched_init(self, *args, **kwargs):
    _orig_init(self, *args, **kwargs)
    if self.total:
        _total_bytes[0] = max(_total_bytes[0], self.total)

_orig_update = hf_tqdm.update
_last_write = [0.0]

def _patched_update(self, n=1):
    _orig_update(self, n)
    _done_bytes[0] += n or 0
    now = time.monotonic()
    if now - _last_write[0] >= 5 and _total_bytes[0] > 0:
        pct = min(30 + int(60 * _done_bytes[0] / _total_bytes[0]), 89)
        done_gb = _done_bytes[0] / 1024**3
        total_gb = _total_bytes[0] / 1024**3
        write_status(pct, f"Downloading model: {done_gb:.1f}/{total_gb:.1f} GB")
        _last_write[0] = now

hf_tqdm.__init__ = _patched_init
hf_tqdm.update  = _patched_update

write_status(30, f"Downloading {model}...")
snapshot_download(model, ignore_patterns=["*.bin"])
write_status(90, "Model download complete")
print("[provision] Model download complete.")
PYEOF

# ── Start workers (one per GPU) + coordinator ─────────────────────────────────
# Each worker is pinned to one GPU via CUDA_VISIBLE_DEVICES.
# The coordinator on PORT handles routing and exposes the aggregate /health.

GPU_COUNT=$(nvidia-smi --list-gpus 2>/dev/null | wc -l)
GPU_COUNT=$(( GPU_COUNT > 0 ? GPU_COUNT : 1 ))
echo "[provision] GPUs detected: ${GPU_COUNT}"

set_status "vram-check" 91 "Checking per-GPU VRAM"
# ── VRAM check — abort early if any GPU has < 12 GB ──────────────────────────
# Flux.1-Kontext requires ≥12 GB per GPU (4-bit path). vast.ai reports gpu_ram
# as the *total* across all cards, so a 2x RTX 2080 Ti (2×11GB=22GB) would pass
# a gpu_ram>=20 filter but OOM on every request.
MIN_VRAM_GB="${MIN_VRAM_GB:-12}"
echo "[provision] Checking per-GPU VRAM (minimum ${MIN_VRAM_GB} GB)..."
VRAM_FAIL=0
for i in $(seq 0 $((GPU_COUNT - 1))); do
    VRAM_MIB=$(CUDA_VISIBLE_DEVICES=$i nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d ' ')
    VRAM_GB=$(( VRAM_MIB / 1024 ))
    echo "[provision] GPU ${i}: ${VRAM_GB} GB VRAM"
    if [ "$VRAM_GB" -lt "$MIN_VRAM_GB" ]; then
        echo "[provision] ERROR: GPU ${i} has only ${VRAM_GB} GB VRAM — need ≥${MIN_VRAM_GB} GB for Flux.1-Kontext. Aborting."
        VRAM_FAIL=1
    fi
done
if [ "$VRAM_FAIL" -eq 1 ]; then
    exit 1
fi

set_status "starting-workers" 93 "Starting GPU workers"
WORKER_PORTS_LIST=""
for i in $(seq 0 $((GPU_COUNT - 1))); do
    WORKER_PORT=$((PORT + 10 + i))   # e.g. PORT=8081 → workers on 8091, 8092, ...
    WORKER_LOG="${LOG%.log}-gpu${i}.log"

    CUDA_VISIBLE_DEVICES=$i nohup "$UVICORN" main:app \
        --app-dir "$WORKDIR" \
        --host 127.0.0.1 \
        --port "$WORKER_PORT" \
        --workers 1 \
        --log-level info \
        >> "$WORKER_LOG" 2>&1 &

    echo "[provision] GPU ${i} → worker on 127.0.0.1:${WORKER_PORT} (PID $!). Logs: ${WORKER_LOG}"

    [[ -n "$WORKER_PORTS_LIST" ]] && WORKER_PORTS_LIST="${WORKER_PORTS_LIST},"
    WORKER_PORTS_LIST="${WORKER_PORTS_LIST}${WORKER_PORT}"
done

# ── CUDA smoke-test: wait for workers to init, check logs for torch errors ────
# Workers import torch + load the model on startup. If the pre-installed torch
# has a CUDA ABI mismatch, it surfaces as an error within the first ~30s.
# Patterns: "CUDA driver version is insufficient", "no kernel image",
#           "libcuda.so", "CUDA error", "torch.cuda.is_available() returned False"
CUDA_ERROR_PATTERNS="CUDA driver version is insufficient|no kernel image|libcuda\.so|CUDA error|cuda\.is_available.*False|RuntimeError.*CUDA|AssertionError.*CUDA"

echo "[provision] Waiting 30s for workers to initialise (CUDA smoke-test)..." | tee -a "$LOG_FILE"
sleep 30

CUDA_BAD=0
for i in $(seq 0 $((GPU_COUNT - 1))); do
    WORKER_LOG="${LOG%.log}-gpu${i}.log"
    if grep -qE "$CUDA_ERROR_PATTERNS" "$WORKER_LOG" 2>/dev/null; then
        echo "[provision] GPU ${i} worker CUDA error detected in ${WORKER_LOG} — will reinstall torch." | tee -a "$LOG_FILE"
        grep -E "$CUDA_ERROR_PATTERNS" "$WORKER_LOG" | head -3 | tee -a "$LOG_FILE"
        CUDA_BAD=1
    fi
done

if [ "$CUDA_BAD" -eq 1 ]; then
    set_status "torch-retry" 94 "CUDA error detected — reinstalling torch"
    echo "[provision] Killing workers for torch reinstall..." | tee -a "$LOG_FILE"
    # Kill all uvicorn worker processes
    pkill -f "uvicorn main:app" 2>/dev/null || true
    sleep 2

    # Force reinstall torch regardless of CN_MODE (already set above)
    install_torch

    set_status "starting-workers" 95 "Restarting GPU workers after torch reinstall"
    WORKER_PORTS_LIST=""
    for i in $(seq 0 $((GPU_COUNT - 1))); do
        WORKER_PORT=$((PORT + 10 + i))
        WORKER_LOG="${LOG%.log}-gpu${i}.log"
        CUDA_VISIBLE_DEVICES=$i nohup "$UVICORN" main:app \
            --app-dir "$WORKDIR" \
            --host 127.0.0.1 \
            --port "$WORKER_PORT" \
            --workers 1 \
            --log-level info \
            >> "$WORKER_LOG" 2>&1 &
        echo "[provision] GPU ${i} → worker restarted on 127.0.0.1:${WORKER_PORT} (PID $!)" | tee -a "$LOG_FILE"
        [[ -n "$WORKER_PORTS_LIST" ]] && WORKER_PORTS_LIST="${WORKER_PORTS_LIST},"
        WORKER_PORTS_LIST="${WORKER_PORTS_LIST}${WORKER_PORT}"
    done
    echo "[provision] Torch reinstalled and workers restarted." | tee -a "$LOG_FILE"
fi

# ── Hand off PORT from status-server to coordinator ───────────────────────────
set_status "starting-coordinator" 97 "Starting coordinator on :${PORT}"
if kill "$STATUS_SERVER_PID" 2>/dev/null; then
    echo "[provision] Status server (PID ${STATUS_SERVER_PID}) stopped." | tee -a "$LOG_FILE"
    # Brief pause so the OS releases the port before coordinator binds it
    sleep 1
fi

# Coordinator: routes /remove-watermark to idle workers, aggregates /health
WORKER_PORTS="$WORKER_PORTS_LIST" nohup "$UVICORN" coordinator:app \
    --app-dir "$WORKDIR" \
    --host "$HOST" \
    --port "$PORT" \
    --workers 1 \
    --log-level info \
    >> "${LOG%.log}-coordinator.log" 2>&1 &

echo "[provision] Coordinator on ${HOST}:${PORT} → workers [${WORKER_PORTS_LIST}] (PID $!)"
echo "[provision] Logs: tail -f ${LOG%.log}*.log"
echo "[provision] Done."