Created
April 30, 2026 11:07
-
-
Save cfregly/3be90c1b72ff13224837f24e7f93cad5 to your computer and use it in GitHub Desktop.
gamble #276 fix: GAMBLE_SKIP_K8S=1 / --no-k8s-context escape hatch (cfregly)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| From 33f13f5c6cf75f8afa1d6a79983ab96d3eeb419d Mon Sep 17 00:00:00 2001 | |
| From: Chris Fregly <cfregly@coreweave.com> | |
| Date: Thu, 30 Apr 2026 04:06:52 -0700 | |
| Subject: [PATCH] runner: GAMBLE_SKIP_K8S=1 / --no-k8s-context escape hatch for | |
| Slurm-side runs (#276) | |
| gamble/runner.py hardcodes a `kubectl get node` probe in build_node_context(). | |
| On Slurm-side runs (where no Kubernetes apiserver is reachable from the | |
| compute node), this fails with the canonical: | |
| [ERROR] [1107] Failed to load Kubernetes context: NODE_NAME environment | |
| variable not set | |
| -- or -- | |
| couldn't get current server API group list: Get "http://localhost:8080/... | |
| and aborts the entire suite. Operators currently work around this by | |
| mounting a kubectl shim that lies about labels. | |
| This change adds a first-class escape hatch: | |
| - GAMBLE_SKIP_K8S=1 (or true/yes, case-insensitive) env var | |
| - --no-k8s-context CLI flag (threads through RunOptions.no_k8s_context) | |
| - build_node_context(skip_k8s=True) Python API | |
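| When the escape hatch is active (NODE_CONTEXT_FILE is still checked first | |
| and, when it loads successfully, takes precedence over the behavior below): | |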
| - The kubectl probe is bypassed entirely | |
| - node_name is sourced from NODE_NAME, falling back to socket.gethostname() | |
| - context is marked with k8s_context_skipped=True | |
| - Checks that gate on K8s labels (match_labels / matching_labels / | |
| dismatch_labels) are skipped up front, each with a structured INFO | |
| log line naming the check, so the operator can audit which checks | |
| were dropped instead of having every gated test silently dismissed | |
| as "label not found" | |
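| Tests: 6 new cases in tests/test_runner_skip_k8s.py covering the env var | |
| (including the hostname fallback when NODE_NAME is unset), the explicit | |
| skip_k8s argument that --no-k8s-context threads through, default-preserved | |
| behavior, the helper truth table, and an end-to-end run_suite with a | |
| match_labels-gated check that gets skipped. | |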
| The existing kubectl-probe failure path (BootstrapError -> ERR_GAMBLE_CORE_RUNNER_K8S_UNREACHABLE) | |
| is preserved unchanged for K8s-side runs that actually need K8s context. | |
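| validate_suite/render_suite are intentionally not threaded with the explicit | |
| skip_k8s argument in this change to keep the diff small. GAMBLE_SKIP_K8S is | |
| read inside build_node_context() itself, so the env var still applies to any | |
| caller that reaches it; without it, they fall back to the existing | |
| BootstrapError path if the kubectl probe fails. | |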
| Closes #276 | |
| --- | |
| gamble/cli.py | 11 +++ | |
| gamble/models/run_options.py | 1 + | |
| gamble/runner.py | 61 +++++++++++-- | |
| tests/test_runner_skip_k8s.py | 162 ++++++++++++++++++++++++++++++++++ | |
| 4 files changed, 230 insertions(+), 5 deletions(-) | |
| create mode 100644 tests/test_runner_skip_k8s.py | |
| diff --git a/gamble/cli.py b/gamble/cli.py | |
| index 8d3c85b..869b02d 100644 | |
| --- a/gamble/cli.py | |
| +++ b/gamble/cli.py | |
| @@ -164,6 +164,17 @@ def _parse_label_filters(label_filter_args: tuple) -> dict: | |
| @click.option("--patch", is_flag=True, help="Patch Kubernetes node status (Fail on first failure, Pass on success)") | |
| @click.option("--label-filter", "-l", multiple=True, help="Filter tests by label (key=value). Multiple filters use AND logic.") | |
| @click.option("--run-id", type=click.UUID, default=generate_run_id, help="UUID run identifier. A UUIDv7 is auto-generated if not provided.") | |
| +@click.option( | |
| + "--no-k8s-context", | |
| + is_flag=True, | |
| + default=False, | |
| + help=( | |
| + "Skip the kubectl node-context probe. Use on Slurm-side runs where " | |
| + "no Kubernetes apiserver is reachable from the compute node. Tests " | |
| + "that gate on K8s labels (match_labels/matching_labels/dismatch_labels) " | |
| + "are skipped with a structured log line. Equivalent to GAMBLE_SKIP_K8S=1." | |
| + ), | |
| +) | |
| @click.pass_context | |
| def run(ctx, output, context, label_filter, **kwargs): | |
| """Run a test suite from a configuration file. | |
| diff --git a/gamble/models/run_options.py b/gamble/models/run_options.py | |
| index 3c4f1c0..2176ce7 100644 | |
| --- a/gamble/models/run_options.py | |
| +++ b/gamble/models/run_options.py | |
| @@ -29,3 +29,4 @@ class RunOptions: | |
| format: str = "text" | |
| label_filters: Dict[str, str] = field(default_factory=dict) | |
| run_id: UUID = field(default_factory=generate_run_id) | |
| + no_k8s_context: bool = False | |
| diff --git a/gamble/runner.py b/gamble/runner.py | |
| index f36c480..83b8938 100644 | |
| --- a/gamble/runner.py | |
| +++ b/gamble/runner.py | |
| @@ -101,9 +101,30 @@ def load_plugins() -> Dict[str, Any]: | |
| return plugins | |
| +def _is_k8s_context_skipped(skip_k8s: bool) -> bool: | |
| + """Resolve whether the Kubernetes context probe should be skipped. | |
| + | |
| + Honors GAMBLE_SKIP_K8S=1/true/yes (case-insensitive) and the explicit | |
| + ``skip_k8s`` argument (set via ``--no-k8s-context`` / ``RunOptions``). | |
| + """ | |
| + if skip_k8s: | |
| + return True | |
| + return os.environ.get("GAMBLE_SKIP_K8S", "").strip().lower() in {"1", "true", "yes"} | |
| + | |
| + | |
| @trace | |
| -def build_node_context() -> Dict[str, Any]: | |
| - """Build node context information from Kubernetes node labels and environment.""" | |
| +def build_node_context(skip_k8s: bool = False) -> Dict[str, Any]: | |
| + """Build node context information from Kubernetes node labels and environment. | |
| + | |
| + When ``skip_k8s`` is True or ``GAMBLE_SKIP_K8S`` is truthy in the | |
| + environment, the kubectl probe is bypassed entirely. This unblocks | |
| + Slurm-side runs where no Kubernetes apiserver is reachable from the | |
| + compute node. Plugins still receive a ``node_name`` (from ``NODE_NAME`` | |
| + or, as a fallback, the hostname) and any context loaded via | |
| + ``NODE_CONTEXT_FILE``, but no K8s labels are injected. Per-check label | |
| + filters are short-circuit-skipped instead of being evaluated against an | |
| + empty label dict (see the per-check filter block in ``run_suite``). | |
| + """ | |
| # Check if we should use a context file instead of Kubernetes | |
| context_file = os.environ.get("NODE_CONTEXT_FILE") | |
| if context_file: | |
| @@ -112,6 +133,19 @@ def build_node_context() -> Dict[str, Any]: | |
| logger.info("Successfully loaded context from file") | |
| return context | |
| + if _is_k8s_context_skipped(skip_k8s): | |
| + node_name = os.environ.get("NODE_NAME", "").strip() | |
| + if not node_name: | |
| + try: | |
| + import socket | |
| + node_name = socket.gethostname() | |
| + except Exception: | |
| + node_name = "unknown" | |
| + logger.info( | |
| + f"Skipping Kubernetes context probe (GAMBLE_SKIP_K8S=1 / --no-k8s-context); node_name={node_name}" | |
| + ) | |
| + return {"node_name": node_name, "k8s_context_skipped": True} | |
| + | |
| try: | |
| # Get node name from environment variable | |
| node_name = os.environ.get("NODE_NAME", "") | |
| @@ -552,11 +586,14 @@ class BootstrapError(Exception): | |
| super().__init__(result.get("error_message", "bootstrap failed")) | |
| -def _bootstrap(config_path: Path) -> tuple[ValidationSuite, Dict, Dict]: | |
| +def _bootstrap(config_path: Path, skip_k8s: bool = False) -> tuple[ValidationSuite, Dict, Dict]: | |
| """Common setup method for both run_suite() and validate_suite() | |
| Args: | |
| config_path: Path to the YAML configuration file containing test definitions. | |
| + skip_k8s: When True (or GAMBLE_SKIP_K8S is truthy), bypass the | |
| + kubectl probe in build_node_context(). Slurm-side compatibility | |
| + knob -- see #276. | |
| Returns: | |
| Tuple containing config, node_context, and plugins | |
| @@ -568,7 +605,7 @@ def _bootstrap(config_path: Path) -> tuple[ValidationSuite, Dict, Dict]: | |
| """ | |
| # Build execution context first (needed for match_labels evaluation) | |
| try: | |
| - node_context = build_node_context() | |
| + node_context = build_node_context(skip_k8s=skip_k8s) | |
| except RuntimeError as e: | |
| error_code = ErrorCode.ERR_GAMBLE_CORE_RUNNER_K8S_UNREACHABLE | |
| error_message = _format_error_message(error_code, str(e)) | |
| @@ -719,7 +756,7 @@ def run_suite(opts: RunOptions) -> Dict[str, Any]: | |
| setup_logging(opts.verbose) | |
| # Prepare runtime objects | |
| try: | |
| - suite, node_context, plugins = _bootstrap(opts.config_path) | |
| + suite, node_context, plugins = _bootstrap(opts.config_path, skip_k8s=opts.no_k8s_context) | |
| except BootstrapError as e: | |
| if opts.patch: | |
| patch_node( | |
| @@ -804,6 +841,20 @@ def run_suite(opts: RunOptions) -> Dict[str, Any]: | |
| logger.info(f"Skipping disabled test: {check.name}") | |
| continue | |
| + # When the kubectl probe was skipped (Slurm-side runs / #276), | |
| + # checks that gate on K8s labels can't be evaluated. Skip them | |
| + # explicitly with a structured log line so the operator can audit | |
| + # which checks were dropped, instead of evaluating every label | |
| + # filter against an empty dict. | |
| + k8s_skipped = bool(node_context.get("k8s_context_skipped")) | |
| + if k8s_skipped and (check.match_labels or check.matching_labels or check.dismatch_labels): | |
| + logger.info( | |
| + f"Skipping test due to k8s_context_skipped: {check.name} " | |
| + f"(label filters cannot be evaluated without K8s labels; " | |
| + f"GAMBLE_SKIP_K8S=1 / --no-k8s-context)" | |
| + ) | |
| + continue | |
| + | |
| # Check test-level match_labels | |
| if check.match_labels: | |
| if not _matches_labels(node_context, check.match_labels): | |
| diff --git a/tests/test_runner_skip_k8s.py b/tests/test_runner_skip_k8s.py | |
| new file mode 100644 | |
| index 0000000..1393ed2 | |
| --- /dev/null | |
| +++ b/tests/test_runner_skip_k8s.py | |
| @@ -0,0 +1,162 @@ | |
| +"""Tests for the GAMBLE_SKIP_K8S=1 / --no-k8s-context escape hatch (#276). | |
| + | |
| +The escape hatch unblocks Slurm-side runs by bypassing the kubectl node-context | |
| +probe in build_node_context(). Per-check filters that gate on K8s labels | |
| +(match_labels / matching_labels / dismatch_labels) are short-circuit-skipped | |
| +with a structured log line instead of being evaluated against an empty label | |
| +dict. | |
| +""" | |
| + | |
| +import os | |
| +import tempfile | |
| +from pathlib import Path | |
| +from unittest.mock import Mock, patch | |
| + | |
| +import pytest | |
| +import yaml | |
| + | |
| +from gamble.models import EvaluationResult, EvaluationStatus, PluginResult, RunOptions | |
| +from gamble.runner import _is_k8s_context_skipped, build_node_context, run_suite | |
| + | |
| + | |
| +class TestSkipK8sEscapeHatch: | |
| + """Verify GAMBLE_SKIP_K8S=1 + --no-k8s-context bypass the kubectl probe.""" | |
| + | |
| + def test_env_var_truthy_skips_k8s_probe(self): | |
| + """Case 1: GAMBLE_SKIP_K8S=1 -> _fetch_k8s_node_labels is never called.""" | |
| + with patch("gamble.runner._fetch_k8s_node_labels") as mock_fetch: | |
| + with patch.dict( | |
| + os.environ, | |
| + {"GAMBLE_SKIP_K8S": "1", "NODE_NAME": "slurm-host-001"}, | |
| + clear=True, | |
| + ): | |
| + ctx = build_node_context() | |
| + mock_fetch.assert_not_called() | |
| + assert ctx["node_name"] == "slurm-host-001" | |
| + assert ctx["k8s_context_skipped"] is True | |
| + | |
| + def test_env_var_truthy_falls_back_to_hostname(self): | |
| + """When GAMBLE_SKIP_K8S=1 and NODE_NAME is unset, hostname fills in.""" | |
| + with patch("gamble.runner._fetch_k8s_node_labels") as mock_fetch: | |
| + with patch.dict(os.environ, {"GAMBLE_SKIP_K8S": "true"}, clear=True): | |
| + with patch("socket.gethostname", return_value="hostname-shim"): | |
| + ctx = build_node_context() | |
| + mock_fetch.assert_not_called() | |
| + assert ctx["node_name"] == "hostname-shim" | |
| + assert ctx["k8s_context_skipped"] is True | |
| + | |
| + def test_explicit_skip_k8s_arg_skips_k8s_probe(self): | |
| + """Case 2: build_node_context(skip_k8s=True) -> kubectl probe bypassed.""" | |
| + with patch("gamble.runner._fetch_k8s_node_labels") as mock_fetch: | |
| + with patch.dict(os.environ, {"NODE_NAME": "slurm-host-002"}, clear=True): | |
| + ctx = build_node_context(skip_k8s=True) | |
| + mock_fetch.assert_not_called() | |
| + assert ctx["node_name"] == "slurm-host-002" | |
| + assert ctx["k8s_context_skipped"] is True | |
| + | |
| + def test_default_path_preserves_existing_k8s_behavior(self): | |
| + """Case 3: both unset -> kubectl probe runs (existing behavior).""" | |
| + with patch("gamble.runner._fetch_k8s_node_labels") as mock_fetch: | |
| + mock_fetch.return_value = { | |
| + "gpu.class": "GB300_NVL72", | |
| + "zone": "us-central1-a", | |
| + } | |
| + with patch.dict(os.environ, {"NODE_NAME": "k8s-node-001"}, clear=True): | |
| + ctx = build_node_context() | |
| + mock_fetch.assert_called_once_with("k8s-node-001") | |
| + assert ctx["node_name"] == "k8s-node-001" | |
| + assert ctx["gpu.class"] == "GB300_NVL72" | |
| + assert "k8s_context_skipped" not in ctx | |
| + | |
| + def test_is_k8s_context_skipped_helper_truth_table(self): | |
| + """The _is_k8s_context_skipped helper accepts 1/true/yes (case-insensitive).""" | |
| + for truthy in ["1", "true", "yes", "TRUE", "Yes"]: | |
| + with patch.dict(os.environ, {"GAMBLE_SKIP_K8S": truthy}, clear=True): | |
| + assert _is_k8s_context_skipped(False) is True | |
| + for falsy in ["", "0", "false", "no", "anything-else"]: | |
| + with patch.dict(os.environ, {"GAMBLE_SKIP_K8S": falsy}, clear=True): | |
| + assert _is_k8s_context_skipped(False) is False | |
| + with patch.dict(os.environ, {}, clear=True): | |
| + assert _is_k8s_context_skipped(False) is False | |
| + assert _is_k8s_context_skipped(True) is True | |
| + | |
| + @patch("gamble.runner.check_gpu_throttling", return_value=None) | |
| + @patch("gamble.runner.get_gpu_utilization", return_value={"status": True, "data": []}) | |
| + @patch("gamble.runner.get_cpu_utilization", return_value=0.0) | |
| + @patch("gamble.runner.load_plugins") | |
| + @patch("gamble.runner._fetch_k8s_node_labels") | |
| + def test_match_labels_check_skipped_when_k8s_disabled( | |
| + self, | |
| + mock_fetch_labels, | |
| + mock_load_plugins, | |
| + _mock_cpu, | |
| + _mock_gpu, | |
| + _mock_throttle, | |
| + caplog, | |
| + ): | |
| + """Case 4: a check with match_labels + skip_k8s=True is skipped from test_results. | |
| + | |
| + The whole point of the escape hatch is that we don't try to evaluate | |
| + K8s-label filters against an empty dict (which would silently dismiss | |
| + every gated test as "label not found"); instead the check is dropped | |
| + with a structured log line that operators can audit. | |
| + """ | |
| + plugin_instance = Mock() | |
| + plugin_instance.validate.return_value = None | |
| + plugin_instance.run.return_value = PluginResult() | |
| + plugin_instance.evaluate.return_value = EvaluationResult( | |
| + status=EvaluationStatus.PASS, message="ungated check passed" | |
| + ) | |
| + plugin_instance.metrics.return_value = [] | |
| + plugin_instance.artifacts.return_value = [] | |
| + plugin_class = Mock(return_value=plugin_instance) | |
| + mock_load_plugins.return_value = {"test_plugin": plugin_class} | |
| + | |
| + config = { | |
| + "suite_name": "skip_k8s_suite", | |
| + "checks": [ | |
| + { | |
| + "name": "ungated_check", | |
| + "plugin": "test_plugin", | |
| + "params": {}, | |
| + }, | |
| + { | |
| + "name": "k8s_label_gated_check", | |
| + "plugin": "test_plugin", | |
| + "params": {}, | |
| + "match_labels": { | |
| + "backend.coreweave.cloud/flavor": "infiniband", | |
| + }, | |
| + }, | |
| + ], | |
| + } | |
| + with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f: | |
| + yaml.dump(config, f) | |
| + config_path = Path(f.name) | |
| + | |
| + try: | |
| + with patch.dict( | |
| + os.environ, | |
| + {"GAMBLE_SKIP_K8S": "1", "NODE_NAME": "slurm-host-003"}, | |
| + clear=True, | |
| + ): | |
| + import logging | |
| + with caplog.at_level(logging.INFO, logger="gamble.runner"): | |
| + result = run_suite( | |
| + RunOptions(config_path=config_path, no_k8s_context=True) | |
| + ) | |
| + | |
| + mock_fetch_labels.assert_not_called() | |
| + assert "ungated_check" in result["test_results"] | |
| + assert "k8s_label_gated_check" not in result["test_results"] | |
| + assert any( | |
| + "k8s_context_skipped" in record.getMessage() | |
| + and "k8s_label_gated_check" in record.getMessage() | |
| + for record in caplog.records | |
| + ), ( | |
| + "Expected a structured 'k8s_context_skipped' INFO log line " | |
| + "naming the gated check; got: " | |
| + + repr([r.getMessage() for r in caplog.records]) | |
| + ) | |
| + finally: | |
| + os.unlink(config_path) | |
| -- | |
| 2.53.0 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment