cfregly · April 30, 2026 11:07
diff --git a/0001-runner-GAMBLE_SKIP_K8S-1-no-k8s-context-escape-hatch.patch b/0001-runner-GAMBLE_SKIP_K8S-1-no-k8s-context-escape-hatch.patch
 From 33f13f5c6cf75f8afa1d6a79983ab96d3eeb419d Mon Sep 17 00:00:00 2001
 From: Chris Fregly <cfregly@coreweave.com>
 Date: Thu, 30 Apr 2026 04:06:52 -0700
 Subject: [PATCH] runner: GAMBLE_SKIP_K8S=1 / --no-k8s-context escape hatch for
 Slurm-side runs (#276)

 gamble/runner.py hardcodes a `kubectl get node` probe in build_node_context().
 On Slurm-side runs (where no Kubernetes apiserver is reachable from the
 compute node), this fails with one of the canonical errors:

    [ERROR] [1107] Failed to load Kubernetes context: NODE_NAME environment
    variable not set
    -- or --
    couldn't get current server API group list: Get "http://localhost:8080/...

 and aborts the entire suite. Operators currently work around this by
 mounting a kubectl shim that lies about labels.

 This change adds a first-class escape hatch:

  - GAMBLE_SKIP_K8S=1 (or true/yes, case-insensitive) env var
  - --no-k8s-context CLI flag (threads through RunOptions.no_k8s_context)
  - build_node_context(skip_k8s=True) Python API

 When the escape hatch is active:
  - The kubectl probe is bypassed entirely
  - node_name is sourced from NODE_NAME, falling back to socket.gethostname()
  - context is marked with k8s_context_skipped=True
  - Per-check filters that gate on K8s labels (match_labels /
    matching_labels / dismatch_labels) are short-circuit-skipped with
    a structured INFO log line, so the operator can audit which checks
    were dropped instead of having every gated test silently dismissed
    as "label not found"

 Tests: 6 new cases in tests/test_runner_skip_k8s.py covering the env var
 (including the hostname fallback), the explicit skip_k8s argument,
 default-preserved behavior, the helper truth table, and an end-to-end
 run_suite (RunOptions.no_k8s_context=True) with a match_labels-gated
 check that gets skipped.

 The existing kubectl-probe failure path (BootstrapError -> ERR_GAMBLE_CORE_RUNNER_K8S_UNREACHABLE)
 is preserved unchanged for K8s-side runs that actually need K8s context.

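 validate_suite/render_suite are intentionally not threaded with skip_k8s
 in this change to keep the diff small. GAMBLE_SKIP_K8S is read inside
 build_node_context() itself, so the env var still applies wherever that
 function runs; without it, these paths fall back to the existing
 BootstrapError path if the kubectl probe fails.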

 Closes #276
 ---
 gamble/cli.py                 |  11 +++
 gamble/models/run_options.py  |   1 +
 gamble/runner.py              |  61 +++++++++++--
 tests/test_runner_skip_k8s.py | 162 ++++++++++++++++++++++++++++++++++
 4 files changed, 230 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_runner_skip_k8s.py

 diff --git a/gamble/cli.py b/gamble/cli.py
 index 8d3c85b..869b02d 100644
 --- a/gamble/cli.py
 +++ b/gamble/cli.py
 @@ -164,6 +164,17 @@ def _parse_label_filters(label_filter_args: tuple) -> dict:
 @click.option("--patch", is_flag=True, help="Patch Kubernetes node status (Fail on first failure, Pass on success)")
 @click.option("--label-filter", "-l", multiple=True, help="Filter tests by label (key=value). Multiple filters use AND logic.")
 @click.option("--run-id", type=click.UUID, default=generate_run_id, help="UUID run identifier. A UUIDv7 is auto-generated if not provided.")
 +@click.option(
 +    "--no-k8s-context",
 +    is_flag=True,
 +    default=False,
 +    help=(
 +        "Skip the kubectl node-context probe. Use on Slurm-side runs where "
 +        "no Kubernetes apiserver is reachable from the compute node. Tests "
 +        "that gate on K8s labels (match_labels/matching_labels/dismatch_labels) "
 +        "are skipped with a structured log line. Equivalent to GAMBLE_SKIP_K8S=1."
 +    ),
 +)
 @click.pass_context
 def run(ctx, output, context, label_filter, **kwargs):
     """Run a test suite from a configuration file.
 diff --git a/gamble/models/run_options.py b/gamble/models/run_options.py
 index 3c4f1c0..2176ce7 100644
 --- a/gamble/models/run_options.py
 +++ b/gamble/models/run_options.py
 @@ -29,3 +29,4 @@ class RunOptions:
     format: str = "text"
     label_filters: Dict[str, str] = field(default_factory=dict)
     run_id: UUID = field(default_factory=generate_run_id)
 +    no_k8s_context: bool = False
 diff --git a/gamble/runner.py b/gamble/runner.py
 index f36c480..83b8938 100644
 --- a/gamble/runner.py
 +++ b/gamble/runner.py
 @@ -101,9 +101,30 @@ def load_plugins() -> Dict[str, Any]:
     return plugins
 
 
 +def _is_k8s_context_skipped(skip_k8s: bool) -> bool:
 +    """Resolve whether the Kubernetes context probe should be skipped.
 +
 +    Honors GAMBLE_SKIP_K8S=1/true/yes (case-insensitive) and the explicit
 +    ``skip_k8s`` argument (set via ``--no-k8s-context`` / ``RunOptions``).
 +    """
 +    if skip_k8s:
 +        return True
 +    return os.environ.get("GAMBLE_SKIP_K8S", "").strip().lower() in {"1", "true", "yes"}
 +
 +
 @trace
 -def build_node_context() -> Dict[str, Any]:
 -    """Build node context information from Kubernetes node labels and environment."""
 +def build_node_context(skip_k8s: bool = False) -> Dict[str, Any]:
 +    """Build node context information from Kubernetes node labels and environment.
 +
 +    When ``skip_k8s`` is True or ``GAMBLE_SKIP_K8S`` is truthy in the
 +    environment, the kubectl probe is bypassed entirely. This unblocks
 +    Slurm-side runs where no Kubernetes apiserver is reachable from the
 +    compute node. ``NODE_CONTEXT_FILE`` still takes precedence: when it is
 +    set, that context is returned before the skip check. Otherwise the skip
 +    path returns only ``node_name`` (from ``NODE_NAME`` or, as a fallback,
 +    the hostname) and ``k8s_context_skipped=True``; no K8s labels are
 +    injected, and ``run_suite`` skips label-gated checks up front.
 +    """
     # Check if we should use a context file instead of Kubernetes
     context_file = os.environ.get("NODE_CONTEXT_FILE")
     if context_file:
 @@ -112,6 +133,19 @@ def build_node_context() -> Dict[str, Any]:
         logger.info("Successfully loaded context from file")
         return context
 
 +    if _is_k8s_context_skipped(skip_k8s):
 +        node_name = os.environ.get("NODE_NAME", "").strip()
 +        if not node_name:
 +            try:
 +                import socket
 +                node_name = socket.gethostname()
 +            except Exception:
 +                node_name = "unknown"
 +        logger.info(
 +            f"Skipping Kubernetes context probe (GAMBLE_SKIP_K8S=1 / --no-k8s-context); node_name={node_name}"
 +        )
 +        return {"node_name": node_name, "k8s_context_skipped": True}
 +
     try:
         # Get node name from environment variable
         node_name = os.environ.get("NODE_NAME", "")
 @@ -552,11 +586,14 @@ class BootstrapError(Exception):
         super().__init__(result.get("error_message", "bootstrap failed"))
 
 
 -def _bootstrap(config_path: Path) -> tuple[ValidationSuite, Dict, Dict]:
 +def _bootstrap(config_path: Path, skip_k8s: bool = False) -> tuple[ValidationSuite, Dict, Dict]:
     """Common setup method for both run_suite() and validate_suite()
 
     Args:
         config_path: Path to the YAML configuration file containing test definitions.
 +        skip_k8s: When True (or GAMBLE_SKIP_K8S is truthy), bypass the
 +            kubectl probe in build_node_context(). Slurm-side compatibility
 +            knob -- see #276.
 
     Returns:
         Tuple containing config, node_context, and plugins
 @@ -568,7 +605,7 @@ def _bootstrap(config_path: Path) -> tuple[ValidationSuite, Dict, Dict]:
     """
     # Build execution context first (needed for match_labels evaluation)
     try:
 -        node_context = build_node_context()
 +        node_context = build_node_context(skip_k8s=skip_k8s)
     except RuntimeError as e:
         error_code = ErrorCode.ERR_GAMBLE_CORE_RUNNER_K8S_UNREACHABLE
         error_message = _format_error_message(error_code, str(e))
 @@ -719,7 +756,7 @@ def run_suite(opts: RunOptions) -> Dict[str, Any]:
     setup_logging(opts.verbose)
     # Prepare runtime objects
     try:
 -        suite, node_context, plugins = _bootstrap(opts.config_path)
 +        suite, node_context, plugins = _bootstrap(opts.config_path, skip_k8s=opts.no_k8s_context)
     except BootstrapError as e:
         if opts.patch:
             patch_node(
 @@ -804,6 +841,20 @@ def run_suite(opts: RunOptions) -> Dict[str, Any]:
                 logger.info(f"Skipping disabled test: {check.name}")
                 continue
 
 +            # When the kubectl probe was skipped (Slurm-side runs / #276),
 +            # checks that gate on K8s labels can't be evaluated. Skip them
 +            # explicitly with a structured log line so the operator can audit
 +            # which checks were dropped, instead of evaluating every label
 +            # filter against an empty dict.
 +            k8s_skipped = bool(node_context.get("k8s_context_skipped"))
 +            if k8s_skipped and (check.match_labels or check.matching_labels or check.dismatch_labels):
 +                logger.info(
 +                    f"Skipping test due to k8s_context_skipped: {check.name} "
 +                    f"(label filters cannot be evaluated without K8s labels; "
 +                    f"GAMBLE_SKIP_K8S=1 / --no-k8s-context)"
 +                )
 +                continue
 +
             # Check test-level match_labels
             if check.match_labels:
                 if not _matches_labels(node_context, check.match_labels):
 diff --git a/tests/test_runner_skip_k8s.py b/tests/test_runner_skip_k8s.py
 new file mode 100644
 index 0000000..1393ed2
 --- /dev/null
 +++ b/tests/test_runner_skip_k8s.py
 @@ -0,0 +1,162 @@
 +"""Tests for the GAMBLE_SKIP_K8S=1 / --no-k8s-context escape hatch (#276).
 +
 +The escape hatch unblocks Slurm-side runs by bypassing the kubectl node-context
 +probe in build_node_context(). Per-check filters that gate on K8s labels
 +(match_labels / matching_labels / dismatch_labels) are short-circuit-skipped
 +with a structured log line instead of being evaluated against an empty label
 +dict.
 +"""
 +
 +import os
 +import tempfile
 +from pathlib import Path
 +from unittest.mock import Mock, patch
 +
 +import pytest
 +import yaml
 +
 +from gamble.models import EvaluationResult, EvaluationStatus, PluginResult, RunOptions
 +from gamble.runner import _is_k8s_context_skipped, build_node_context, run_suite
 +
 +
 +class TestSkipK8sEscapeHatch:
 +    """Verify GAMBLE_SKIP_K8S=1 + --no-k8s-context bypass the kubectl probe."""
 +
 +    def test_env_var_truthy_skips_k8s_probe(self):
 +        """Case 1: GAMBLE_SKIP_K8S=1 -> _fetch_k8s_node_labels is never called."""
 +        with patch("gamble.runner._fetch_k8s_node_labels") as mock_fetch:
 +            with patch.dict(
 +                os.environ,
 +                {"GAMBLE_SKIP_K8S": "1", "NODE_NAME": "slurm-host-001"},
 +                clear=True,
 +            ):
 +                ctx = build_node_context()
 +        mock_fetch.assert_not_called()
 +        assert ctx["node_name"] == "slurm-host-001"
 +        assert ctx["k8s_context_skipped"] is True
 +
 +    def test_env_var_truthy_falls_back_to_hostname(self):
 +        """When GAMBLE_SKIP_K8S=1 and NODE_NAME is unset, hostname fills in."""
 +        with patch("gamble.runner._fetch_k8s_node_labels") as mock_fetch:
 +            with patch.dict(os.environ, {"GAMBLE_SKIP_K8S": "true"}, clear=True):
 +                with patch("socket.gethostname", return_value="hostname-shim"):
 +                    ctx = build_node_context()
 +        mock_fetch.assert_not_called()
 +        assert ctx["node_name"] == "hostname-shim"
 +        assert ctx["k8s_context_skipped"] is True
 +
 +    def test_explicit_skip_k8s_arg_skips_k8s_probe(self):
 +        """Case 2: build_node_context(skip_k8s=True) -> kubectl probe bypassed."""
 +        with patch("gamble.runner._fetch_k8s_node_labels") as mock_fetch:
 +            with patch.dict(os.environ, {"NODE_NAME": "slurm-host-002"}, clear=True):
 +                ctx = build_node_context(skip_k8s=True)
 +        mock_fetch.assert_not_called()
 +        assert ctx["node_name"] == "slurm-host-002"
 +        assert ctx["k8s_context_skipped"] is True
 +
 +    def test_default_path_preserves_existing_k8s_behavior(self):
 +        """Case 3: both unset -> kubectl probe runs (existing behavior)."""
 +        with patch("gamble.runner._fetch_k8s_node_labels") as mock_fetch:
 +            mock_fetch.return_value = {
 +                "gpu.class": "GB300_NVL72",
 +                "zone": "us-central1-a",
 +            }
 +            with patch.dict(os.environ, {"NODE_NAME": "k8s-node-001"}, clear=True):
 +                ctx = build_node_context()
 +        mock_fetch.assert_called_once_with("k8s-node-001")
 +        assert ctx["node_name"] == "k8s-node-001"
 +        assert ctx["gpu.class"] == "GB300_NVL72"
 +        assert "k8s_context_skipped" not in ctx
 +
 +    def test_is_k8s_context_skipped_helper_truth_table(self):
 +        """The _is_k8s_context_skipped helper accepts 1/true/yes (case-insensitive)."""
 +        for truthy in ["1", "true", "yes", "TRUE", "Yes"]:
 +            with patch.dict(os.environ, {"GAMBLE_SKIP_K8S": truthy}, clear=True):
 +                assert _is_k8s_context_skipped(False) is True
 +        for falsy in ["", "0", "false", "no", "anything-else"]:
 +            with patch.dict(os.environ, {"GAMBLE_SKIP_K8S": falsy}, clear=True):
 +                assert _is_k8s_context_skipped(False) is False
 +        with patch.dict(os.environ, {}, clear=True):
 +            assert _is_k8s_context_skipped(False) is False
 +            assert _is_k8s_context_skipped(True) is True
 +
 +    @patch("gamble.runner.check_gpu_throttling", return_value=None)
 +    @patch("gamble.runner.get_gpu_utilization", return_value={"status": True, "data": []})
 +    @patch("gamble.runner.get_cpu_utilization", return_value=0.0)
 +    @patch("gamble.runner.load_plugins")
 +    @patch("gamble.runner._fetch_k8s_node_labels")
 +    def test_match_labels_check_skipped_when_k8s_disabled(
 +        self,
 +        mock_fetch_labels,
 +        mock_load_plugins,
 +        _mock_cpu,
 +        _mock_gpu,
 +        _mock_throttle,
 +        caplog,
 +    ):
 +        """Case 4: a check with match_labels + skip_k8s=True is skipped from test_results.
 +
 +        The whole point of the escape hatch is that we don't try to evaluate
 +        K8s-label filters against an empty dict (which would silently dismiss
 +        every gated test as "label not found"); instead the check is dropped
 +        with a structured log line that operators can audit.
 +        """
 +        plugin_instance = Mock()
 +        plugin_instance.validate.return_value = None
 +        plugin_instance.run.return_value = PluginResult()
 +        plugin_instance.evaluate.return_value = EvaluationResult(
 +            status=EvaluationStatus.PASS, message="ungated check passed"
 +        )
 +        plugin_instance.metrics.return_value = []
 +        plugin_instance.artifacts.return_value = []
 +        plugin_class = Mock(return_value=plugin_instance)
 +        mock_load_plugins.return_value = {"test_plugin": plugin_class}
 +
 +        config = {
 +            "suite_name": "skip_k8s_suite",
 +            "checks": [
 +                {
 +                    "name": "ungated_check",
 +                    "plugin": "test_plugin",
 +                    "params": {},
 +                },
 +                {
 +                    "name": "k8s_label_gated_check",
 +                    "plugin": "test_plugin",
 +                    "params": {},
 +                    "match_labels": {
 +                        "backend.coreweave.cloud/flavor": "infiniband",
 +                    },
 +                },
 +            ],
 +        }
 +        with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
 +            yaml.dump(config, f)
 +            config_path = Path(f.name)
 +
 +        try:
 +            with patch.dict(
 +                os.environ,
 +                {"GAMBLE_SKIP_K8S": "1", "NODE_NAME": "slurm-host-003"},
 +                clear=True,
 +            ):
 +                import logging
 +                with caplog.at_level(logging.INFO, logger="gamble.runner"):
 +                    result = run_suite(
 +                        RunOptions(config_path=config_path, no_k8s_context=True)
 +                    )
 +
 +            mock_fetch_labels.assert_not_called()
 +            assert "ungated_check" in result["test_results"]
 +            assert "k8s_label_gated_check" not in result["test_results"]
 +            assert any(
 +                "k8s_context_skipped" in record.getMessage()
 +                and "k8s_label_gated_check" in record.getMessage()
 +                for record in caplog.records
 +            ), (
 +                "Expected a structured 'k8s_context_skipped' INFO log line "
 +                "naming the gated check; got: "
 +                + repr([r.getMessage() for r in caplog.records])
 +            )
 +        finally:
 +            os.unlink(config_path)
 -- 
 2.53.0
No results found