Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save cfregly/3be90c1b72ff13224837f24e7f93cad5 to your computer and use it in GitHub Desktop.

Select an option

Save cfregly/3be90c1b72ff13224837f24e7f93cad5 to your computer and use it in GitHub Desktop.
gamble #276 fix: GAMBLE_SKIP_K8S=1 / --no-k8s-context escape hatch (cfregly)
From 33f13f5c6cf75f8afa1d6a79983ab96d3eeb419d Mon Sep 17 00:00:00 2001
From: Chris Fregly <cfregly@coreweave.com>
Date: Thu, 30 Apr 2026 04:06:52 -0700
Subject: [PATCH] runner: GAMBLE_SKIP_K8S=1 / --no-k8s-context escape hatch for
Slurm-side runs (#276)
gamble/runner.py hardcodes a `kubectl get node` probe in build_node_context().
On Slurm-side runs (where no Kubernetes apiserver is reachable from the
compute node), this fails with one of the canonical errors:
[ERROR] [1107] Failed to load Kubernetes context: NODE_NAME environment
variable not set
-- or --
couldn't get current server API group list: Get "http://localhost:8080/...
and aborts the entire suite. Operators currently work around this by
mounting a kubectl shim that lies about labels.
This change adds a first-class escape hatch:
- GAMBLE_SKIP_K8S=1 (or true/yes, case-insensitive) env var
- --no-k8s-context CLI flag (threads through RunOptions.no_k8s_context)
- build_node_context(skip_k8s=True) Python API
When the escape hatch is active:
- The kubectl probe is bypassed entirely
- node_name is sourced from NODE_NAME, falling back to socket.gethostname()
- context is marked with k8s_context_skipped=True
- Checks that gate on K8s labels (match_labels / matching_labels /
dismatch_labels) are skipped outright rather than having their filters
evaluated; each skip emits an INFO log line that names the check and
contains the k8s_context_skipped marker, so the operator can audit
which checks were dropped instead of having every gated test silently
dismissed as "label not found"
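Tests: 6 new cases in tests/test_runner_skip_k8s.py covering the env var
(including the hostname fallback when NODE_NAME is unset), the explicit
build_node_context(skip_k8s=True) argument, default-preserved behavior,
the helper truth table, and an end-to-end run_suite with
RunOptions(no_k8s_context=True) and a match_labels-gated check that gets
skipped. The click --no-k8s-context option itself is not exercised
directly by these tests.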
The existing kubectl-probe failure path (BootstrapError -> ERR_GAMBLE_CORE_RUNNER_K8S_UNREACHABLE)
is preserved unchanged for K8s-side runs that actually need K8s context.
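validate_suite/render_suite are intentionally not threaded with the
skip_k8s argument in this change to keep the diff small. Because
build_node_context() checks GAMBLE_SKIP_K8S itself, the env var still
takes effect on any path that calls it; without the env var those paths
fall back to the existing BootstrapError path if the kubectl probe fails.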
Closes #276
---
gamble/cli.py | 11 +++
gamble/models/run_options.py | 1 +
gamble/runner.py | 61 +++++++++++--
tests/test_runner_skip_k8s.py | 162 ++++++++++++++++++++++++++++++++++
4 files changed, 230 insertions(+), 5 deletions(-)
create mode 100644 tests/test_runner_skip_k8s.py
diff --git a/gamble/cli.py b/gamble/cli.py
index 8d3c85b..869b02d 100644
--- a/gamble/cli.py
+++ b/gamble/cli.py
@@ -164,6 +164,17 @@ def _parse_label_filters(label_filter_args: tuple) -> dict:
@click.option("--patch", is_flag=True, help="Patch Kubernetes node status (Fail on first failure, Pass on success)")
@click.option("--label-filter", "-l", multiple=True, help="Filter tests by label (key=value). Multiple filters use AND logic.")
@click.option("--run-id", type=click.UUID, default=generate_run_id, help="UUID run identifier. A UUIDv7 is auto-generated if not provided.")
+@click.option(
+ "--no-k8s-context",
+ is_flag=True,
+ default=False,
+ help=(
+ "Skip the kubectl node-context probe. Use on Slurm-side runs where "
+ "no Kubernetes apiserver is reachable from the compute node. Tests "
+ "that gate on K8s labels (match_labels/matching_labels/dismatch_labels) "
+ "are skipped with a structured log line. Equivalent to GAMBLE_SKIP_K8S=1."
+ ),
+)
@click.pass_context
def run(ctx, output, context, label_filter, **kwargs):
"""Run a test suite from a configuration file.
diff --git a/gamble/models/run_options.py b/gamble/models/run_options.py
index 3c4f1c0..2176ce7 100644
--- a/gamble/models/run_options.py
+++ b/gamble/models/run_options.py
@@ -29,3 +29,4 @@ class RunOptions:
format: str = "text"
label_filters: Dict[str, str] = field(default_factory=dict)
run_id: UUID = field(default_factory=generate_run_id)
+ no_k8s_context: bool = False
diff --git a/gamble/runner.py b/gamble/runner.py
index f36c480..83b8938 100644
--- a/gamble/runner.py
+++ b/gamble/runner.py
@@ -101,9 +101,30 @@ def load_plugins() -> Dict[str, Any]:
return plugins
+def _is_k8s_context_skipped(skip_k8s: bool) -> bool:
+ """Resolve whether the Kubernetes context probe should be skipped.
+
+ Honors GAMBLE_SKIP_K8S=1/true/yes (case-insensitive) and the explicit
+ ``skip_k8s`` argument (set via ``--no-k8s-context`` / ``RunOptions``).
+ """
+ if skip_k8s:
+ return True
+ return os.environ.get("GAMBLE_SKIP_K8S", "").strip().lower() in {"1", "true", "yes"}
+
+
@trace
-def build_node_context() -> Dict[str, Any]:
- """Build node context information from Kubernetes node labels and environment."""
+def build_node_context(skip_k8s: bool = False) -> Dict[str, Any]:
+ """Build node context information from Kubernetes node labels and environment.
+
+ When ``skip_k8s`` is True or ``GAMBLE_SKIP_K8S`` is truthy in the
+ environment, the kubectl probe is bypassed entirely. This unblocks
+ Slurm-side runs where no Kubernetes apiserver is reachable from the
+ compute node. Plugins still receive a ``node_name`` (from ``NODE_NAME``
+ or, as a fallback, the hostname) and any context loaded via
+ ``NODE_CONTEXT_FILE``, but no K8s labels are injected. Per-check label
+ filters are short-circuit-skipped instead of being evaluated against an
+ empty label dict (see the per-check filter block in ``run_suite``).
+ """
# Check if we should use a context file instead of Kubernetes
context_file = os.environ.get("NODE_CONTEXT_FILE")
if context_file:
@@ -112,6 +133,19 @@ def build_node_context() -> Dict[str, Any]:
logger.info("Successfully loaded context from file")
return context
+ if _is_k8s_context_skipped(skip_k8s):
+ node_name = os.environ.get("NODE_NAME", "").strip()
+ if not node_name:
+ try:
+ import socket
+ node_name = socket.gethostname()
+ except Exception:
+ node_name = "unknown"
+ logger.info(
+ f"Skipping Kubernetes context probe (GAMBLE_SKIP_K8S=1 / --no-k8s-context); node_name={node_name}"
+ )
+ return {"node_name": node_name, "k8s_context_skipped": True}
+
try:
# Get node name from environment variable
node_name = os.environ.get("NODE_NAME", "")
@@ -552,11 +586,14 @@ class BootstrapError(Exception):
super().__init__(result.get("error_message", "bootstrap failed"))
-def _bootstrap(config_path: Path) -> tuple[ValidationSuite, Dict, Dict]:
+def _bootstrap(config_path: Path, skip_k8s: bool = False) -> tuple[ValidationSuite, Dict, Dict]:
"""Common setup method for both run_suite() and validate_suite()
Args:
config_path: Path to the YAML configuration file containing test definitions.
+ skip_k8s: When True (or GAMBLE_SKIP_K8S is truthy), bypass the
+ kubectl probe in build_node_context(). Slurm-side compatibility
+ knob -- see #276.
Returns:
Tuple containing config, node_context, and plugins
@@ -568,7 +605,7 @@ def _bootstrap(config_path: Path) -> tuple[ValidationSuite, Dict, Dict]:
"""
# Build execution context first (needed for match_labels evaluation)
try:
- node_context = build_node_context()
+ node_context = build_node_context(skip_k8s=skip_k8s)
except RuntimeError as e:
error_code = ErrorCode.ERR_GAMBLE_CORE_RUNNER_K8S_UNREACHABLE
error_message = _format_error_message(error_code, str(e))
@@ -719,7 +756,7 @@ def run_suite(opts: RunOptions) -> Dict[str, Any]:
setup_logging(opts.verbose)
# Prepare runtime objects
try:
- suite, node_context, plugins = _bootstrap(opts.config_path)
+ suite, node_context, plugins = _bootstrap(opts.config_path, skip_k8s=opts.no_k8s_context)
except BootstrapError as e:
if opts.patch:
patch_node(
@@ -804,6 +841,20 @@ def run_suite(opts: RunOptions) -> Dict[str, Any]:
logger.info(f"Skipping disabled test: {check.name}")
continue
+ # When the kubectl probe was skipped (Slurm-side runs / #276),
+ # checks that gate on K8s labels can't be evaluated. Skip them
+ # explicitly with a structured log line so the operator can audit
+ # which checks were dropped, instead of evaluating every label
+ # filter against an empty dict.
+ k8s_skipped = bool(node_context.get("k8s_context_skipped"))
+ if k8s_skipped and (check.match_labels or check.matching_labels or check.dismatch_labels):
+ logger.info(
+ f"Skipping test due to k8s_context_skipped: {check.name} "
+ f"(label filters cannot be evaluated without K8s labels; "
+ f"GAMBLE_SKIP_K8S=1 / --no-k8s-context)"
+ )
+ continue
+
# Check test-level match_labels
if check.match_labels:
if not _matches_labels(node_context, check.match_labels):
diff --git a/tests/test_runner_skip_k8s.py b/tests/test_runner_skip_k8s.py
new file mode 100644
index 0000000..1393ed2
--- /dev/null
+++ b/tests/test_runner_skip_k8s.py
@@ -0,0 +1,162 @@
+"""Tests for the GAMBLE_SKIP_K8S=1 / --no-k8s-context escape hatch (#276).
+
+The escape hatch unblocks Slurm-side runs by bypassing the kubectl node-context
+probe in build_node_context(). Per-check filters that gate on K8s labels
+(match_labels / matching_labels / dismatch_labels) are short-circuit-skipped
+with a structured log line instead of being evaluated against an empty label
+dict.
+"""
+
+import os
+import tempfile
+from pathlib import Path
+from unittest.mock import Mock, patch
+
+import pytest
+import yaml
+
+from gamble.models import EvaluationResult, EvaluationStatus, PluginResult, RunOptions
+from gamble.runner import _is_k8s_context_skipped, build_node_context, run_suite
+
+
+class TestSkipK8sEscapeHatch:
+ """Verify GAMBLE_SKIP_K8S=1 + --no-k8s-context bypass the kubectl probe."""
+
+ def test_env_var_truthy_skips_k8s_probe(self):
+ """Case 1: GAMBLE_SKIP_K8S=1 -> _fetch_k8s_node_labels is never called."""
+ with patch("gamble.runner._fetch_k8s_node_labels") as mock_fetch:
+ with patch.dict(
+ os.environ,
+ {"GAMBLE_SKIP_K8S": "1", "NODE_NAME": "slurm-host-001"},
+ clear=True,
+ ):
+ ctx = build_node_context()
+ mock_fetch.assert_not_called()
+ assert ctx["node_name"] == "slurm-host-001"
+ assert ctx["k8s_context_skipped"] is True
+
+ def test_env_var_truthy_falls_back_to_hostname(self):
+ """When GAMBLE_SKIP_K8S=1 and NODE_NAME is unset, hostname fills in."""
+ with patch("gamble.runner._fetch_k8s_node_labels") as mock_fetch:
+ with patch.dict(os.environ, {"GAMBLE_SKIP_K8S": "true"}, clear=True):
+ with patch("socket.gethostname", return_value="hostname-shim"):
+ ctx = build_node_context()
+ mock_fetch.assert_not_called()
+ assert ctx["node_name"] == "hostname-shim"
+ assert ctx["k8s_context_skipped"] is True
+
+ def test_explicit_skip_k8s_arg_skips_k8s_probe(self):
+ """Case 2: build_node_context(skip_k8s=True) -> kubectl probe bypassed."""
+ with patch("gamble.runner._fetch_k8s_node_labels") as mock_fetch:
+ with patch.dict(os.environ, {"NODE_NAME": "slurm-host-002"}, clear=True):
+ ctx = build_node_context(skip_k8s=True)
+ mock_fetch.assert_not_called()
+ assert ctx["node_name"] == "slurm-host-002"
+ assert ctx["k8s_context_skipped"] is True
+
+ def test_default_path_preserves_existing_k8s_behavior(self):
+ """Case 3: both unset -> kubectl probe runs (existing behavior)."""
+ with patch("gamble.runner._fetch_k8s_node_labels") as mock_fetch:
+ mock_fetch.return_value = {
+ "gpu.class": "GB300_NVL72",
+ "zone": "us-central1-a",
+ }
+ with patch.dict(os.environ, {"NODE_NAME": "k8s-node-001"}, clear=True):
+ ctx = build_node_context()
+ mock_fetch.assert_called_once_with("k8s-node-001")
+ assert ctx["node_name"] == "k8s-node-001"
+ assert ctx["gpu.class"] == "GB300_NVL72"
+ assert "k8s_context_skipped" not in ctx
+
+ def test_is_k8s_context_skipped_helper_truth_table(self):
+ """The _is_k8s_context_skipped helper accepts 1/true/yes (case-insensitive)."""
+ for truthy in ["1", "true", "yes", "TRUE", "Yes"]:
+ with patch.dict(os.environ, {"GAMBLE_SKIP_K8S": truthy}, clear=True):
+ assert _is_k8s_context_skipped(False) is True
+ for falsy in ["", "0", "false", "no", "anything-else"]:
+ with patch.dict(os.environ, {"GAMBLE_SKIP_K8S": falsy}, clear=True):
+ assert _is_k8s_context_skipped(False) is False
+ with patch.dict(os.environ, {}, clear=True):
+ assert _is_k8s_context_skipped(False) is False
+ assert _is_k8s_context_skipped(True) is True
+
+ @patch("gamble.runner.check_gpu_throttling", return_value=None)
+ @patch("gamble.runner.get_gpu_utilization", return_value={"status": True, "data": []})
+ @patch("gamble.runner.get_cpu_utilization", return_value=0.0)
+ @patch("gamble.runner.load_plugins")
+ @patch("gamble.runner._fetch_k8s_node_labels")
+ def test_match_labels_check_skipped_when_k8s_disabled(
+ self,
+ mock_fetch_labels,
+ mock_load_plugins,
+ _mock_cpu,
+ _mock_gpu,
+ _mock_throttle,
+ caplog,
+ ):
+ """Case 4: a check with match_labels + skip_k8s=True is skipped from test_results.
+
+ The whole point of the escape hatch is that we don't try to evaluate
+ K8s-label filters against an empty dict (which would silently dismiss
+ every gated test as "label not found"); instead the check is dropped
+ with a structured log line that operators can audit.
+ """
+ plugin_instance = Mock()
+ plugin_instance.validate.return_value = None
+ plugin_instance.run.return_value = PluginResult()
+ plugin_instance.evaluate.return_value = EvaluationResult(
+ status=EvaluationStatus.PASS, message="ungated check passed"
+ )
+ plugin_instance.metrics.return_value = []
+ plugin_instance.artifacts.return_value = []
+ plugin_class = Mock(return_value=plugin_instance)
+ mock_load_plugins.return_value = {"test_plugin": plugin_class}
+
+ config = {
+ "suite_name": "skip_k8s_suite",
+ "checks": [
+ {
+ "name": "ungated_check",
+ "plugin": "test_plugin",
+ "params": {},
+ },
+ {
+ "name": "k8s_label_gated_check",
+ "plugin": "test_plugin",
+ "params": {},
+ "match_labels": {
+ "backend.coreweave.cloud/flavor": "infiniband",
+ },
+ },
+ ],
+ }
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+ yaml.dump(config, f)
+ config_path = Path(f.name)
+
+ try:
+ with patch.dict(
+ os.environ,
+ {"GAMBLE_SKIP_K8S": "1", "NODE_NAME": "slurm-host-003"},
+ clear=True,
+ ):
+ import logging
+ with caplog.at_level(logging.INFO, logger="gamble.runner"):
+ result = run_suite(
+ RunOptions(config_path=config_path, no_k8s_context=True)
+ )
+
+ mock_fetch_labels.assert_not_called()
+ assert "ungated_check" in result["test_results"]
+ assert "k8s_label_gated_check" not in result["test_results"]
+ assert any(
+ "k8s_context_skipped" in record.getMessage()
+ and "k8s_label_gated_check" in record.getMessage()
+ for record in caplog.records
+ ), (
+ "Expected a structured 'k8s_context_skipped' INFO log line "
+ "naming the gated check; got: "
+ + repr([r.getMessage() for r in caplog.records])
+ )
+ finally:
+ os.unlink(config_path)
--
2.53.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment