@opsec-ee
Last active March 22, 2026 02:26
JSON Polymorphic Dispatch — Graph Coloring Edition (magician ready)
/*
* @file polymorphic-2.c
* @version 2.2.0
* @author H. Overman <opsec.ee@pm.me>
* @brief JSON Polymorphic Dispatch — Graph Coloring Edition
* SoA layout, BTB dealiasing, group-direct loops,
* graph coloring reorder, 4-state gate system.
*
* Copyright (c) 2025 H. Overman <opsec.ee@pm.me>
* CC-BY-NC 4.0 non-commercial license
*
* Euman = ∫(optimization/time) △ performance
*
*
* =================================================================
* CHANGELOG
* =================================================================
*
* v2.2.0 PPO-3 v3.0 compliance pass
* - FRONT/LEAD/REAR contracts on every function (replaces
* type-arrow {{0 [ ... ] 1}} notation — PPO-3 §CONTRACTS)
* - IFP invariant block I1..I7 in causal order (§REINDEX)
* - IFP edge graph in file header (§DISCOVER)
* - CLOCK_MONOTONIC benchmarking replaces __rdtsc
* (§BENCHMARK CLOCK DISCIPLINE)
* - All constants derived with provenance (§THRESHOLDS)
* - static_assert on every compile-time structural invariant
* (§STATIC VERIFICATION)
* - Self-tests at startup with concrete inputs/outputs
* (§SELF-TESTS AT STARTUP)
* - Emergent statistics validated against theory
* (§EMERGENT STATISTICS)
* - p[] storage justified per §EMERGENCE vs IMPOSITION
* - Build flags documented per §BUILD HYGIENE
* - Triple check sequence applied (§TRIPLE CHECK)
* - 144,000 ratio exception documented (§THRESHOLDS)
* - Phase ordering documented and verified (§PHASE ORDERING)
*
* v6.5.1 NULL-001 UAF fix — cached store->count into local
* before free-path branch (CERT MEM01-C, CWE-416)
*
* v6.5.0 Graph coloring reorder, SoA layout, BTB dealiasing,
* group-direct loops, madvise(MADV_SEQUENTIAL),
* branchless parse_number, FMA-style P3 accumulation,
* depth-1 object boundary fix, software prefetch,
* __builtin_expect on hot paths, 4-state gate system.
*
*
* =================================================================
* IFP INVARIANTS (causal order)
* =================================================================
*
* I1 MMAP_STREAM mmap'd JSON byte stream.
* PROT_READ | MAP_PRIVATE. Immutable source.
* madvise(MADV_SEQUENTIAL) for kernel readahead.
* Annihilated by munmap after I3 is populated.
*
* I2 OBJECT_SPANS Depth-1 '{' ... '}' boundary pointer pairs.
* Single-pass depth tracker, string-escape aware.
* Fallback: depth-1 if no depth-2 objects found.
* Depends: I1
*
* I3 SOA_STORE Structure-of-Arrays: v[count * 7], n[count],
* p[count]. Values flat-packed for cache-line
* streaming. Counts per object. Pattern per
* object (see I4 justification).
* Depends: I2
*
* I4 PATTERN_CLASS p[i] = (n[i] - 1) & 0x3
* Computable from I3.n[]. Stored — not computed
* on access — because:
* (a) graph coloring reorder (I6) permutes
* all three arrays; p[] must be materialized
* for the sort key;
* (b) dispatch inner loop indexes p[idx] on
* every iteration; recomputing (n[idx]-1)&3
* adds a data dependency to the critical path.
* Four patterns: P0(single), P1(pair),
* P2(triple), P3(multi).
* Bit mask & 0x3 guarantees mutual exclusivity
* BY CONSTRUCTION — not by convention.
* Depends: I3
*
* I5 TRANS_MATRIX 4x4 transition count matrix from original order.
* trans[a][b] = count of adjacent pairs where
* p[i-1]==a and p[i]==b.
* Depends: I4
*
* I6 COLOR_ORDER Permutation of {0,1,2,3} minimizing total
* boundary misprediction cost. Heap's algorithm
* (B.R. Heap, "Permutations by Interchanges",
* The Computer Journal 6(3), 1963, pp. 293-4)
* enumerates all 24 permutations. Cost function:
* sum of bidirectional transition counts at each
* adjacent group boundary in the permutation.
* Depends: I5
*
* I7 GROUP_STORE SoA arrays reordered by pattern group per I6.
* Contiguous pattern blocks: intra-group branch
* is direct (backward jump, 100% predictable).
* Misprediction occurs ONLY at K group boundaries
* instead of N iterations.
* Depends: I3, I6
*
*
* =================================================================
* IFP EDGE GRAPH
* =================================================================
*
* I1 ───> I2 ───> I3 ───> I4 ───> I5 ───> I6
*                  │                       │
*                  └─────────> I7 <────────┘
*
* Phase order: PARSE -> CLASSIFY -> MEASURE -> COLOR -> REORDER
*                                                          |
*                                                          v
*                                                      DISPATCH
*
* Barrier: I7 must complete before any dispatch loop begins.
* Seal: I1 annihilated (munmap) after I3 populated.
* I2 annihilated (free) after I3 populated.
* Phase order mirrors IFP edge graph.
*
*
* =================================================================
* TIMING (8-run, CLOCK_MONOTONIC, measured on target hardware)
* =================================================================
*
* Group-Direct (ns/op) Interleaved (ns/op)
* Run 1: 5.31 4.12
* Run 2: 5.15 4.19
* Run 3: 5.98 4.17
* Run 4: 5.01 4.13
* Run 5: 5.23 4.12
* Run 6: 5.04 4.15
* Run 7: 4.79 4.13
* Run 8: 5.00 4.18
* Min: 4.79 4.12
* Max: 5.98 4.19
* Mean: 5.19 4.15
* Spread: 1.19 0.07
*
* Hardware: Arch Linux, gcc (GCC) 14.x, -O3 -march=native -flto
* Dataset: mixed.json, 16 objects, 10M iterations
*
* Note: interleaved spread 0.07 ns vs group-direct 1.19 ns.
* Group-direct's iteration distribution (integer division per
* group) creates uneven loop trip counts susceptible to
* scheduler jitter. Interleaved's single uniform loop is
* effectively immune. The BTB prediction advantage of
* group-direct (direct branch within groups) is negated
* on this 16-object dataset — the working set fits entirely
* in L1, so memory latency hiding (the primary benefit of
* group-direct at scale) is not exercised.
*
* Measurement procedure:
* 1. ASan clean first (§BUILD HYGIENE).
* 2. Build with release flags (see below).
* 3. Pin to isolated core: taskset -c 3 ./polymorphic-2 mixed.json
* 4. Run 8 times. Record ns/dispatch from CLOCK_MONOTONIC output.
* 5. Compute Min/Max/Mean/Spread and record above.
*
*
* =================================================================
* ACKNOWLEDGMENTS
* =================================================================
*
* - B.R. Heap, "Permutations by Interchanges", The Computer
* Journal 6(3), 1963 — Heap's algorithm for permutation
* enumeration in graph_color_reorder().
* - Intel 64 and IA-32 Architectures Software Developer's Manual,
* Vol. 3A, §11.5.3 — 64-byte cache line size on x86-64.
* - CERT MEM01-C — "Do not use freed memory." Applied in main()
* UAF fix (store=NULL poison after objstore_free).
* - CWE-416 — Use After Free. Remediated in v6.5.1.
* - NIST 800-53 SI-16 — Memory Protection.
*
*
* =================================================================
* BUILD FLAGS
* =================================================================
*
* This codebase uses GNU C extensions: __builtin_prefetch,
* __builtin_expect, __asm__ volatile (x86-64 "x" constraint),
* computed goto (Labels as Values), aligned_alloc.
* ISO C23 (-std=c2x) with -Wpedantic rejects these.
* Use -std=gnu2x to enable GNU extensions.
*
* Release:
* gcc -std=gnu2x -O3 -march=native -flto -funroll-loops \
* -DNDEBUG -Wall -Wextra -Wpedantic \
* -o polymorphic-2 polymorphic-2.c -lrt
*
* Computed-goto pedantic warnings suppressed locally via
* #pragma GCC diagnostic in dispatch_interleaved().
* The pragma documents the exception at the exact site.
* -lrt: required for clock_gettime on older glibc.
*
* ASan:
* gcc -std=gnu2x -fsanitize=address,undefined -O1 -g \
* -Wall -Wextra -Wpedantic \
* -o polymorphic-2_asan polymorphic-2.c -lrt
*
* ASan clean before any performance measurement.
* Measuring broken code is decimal arithmetic on a hex problem.
*
* Clock skew after file transfer: find . -print0 | xargs -0 touch
*
*
* =================================================================
* 144,000 RATIO EXCEPTION
* =================================================================
*
* This program parses JSON numbers (IEEE 754 by JSON spec, RFC 8259)
* and executes synthetic arithmetic workloads to benchmark dispatch
* overhead. The benchmark constants (SCALE_P0, DECAY_P3, STEP_X)
* are design parameters for the synthetic workload — precision is
* not the invariant under test. Dispatch pattern prediction is.
*
* Ratio arithmetic is not applied to the synthetic workload.
* The dispatch overhead measurement is independent of value
* precision. The constants are documented with their exact
* IEEE 754 representation where inexact (DECAY_P3 = 0.9).
*
* Timing arithmetic uses integer nanoseconds from CLOCK_MONOTONIC.
* No IEEE 754 in the measurement path.
*
* =================================================================
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <time.h>
#include <math.h>
/*
==================================================================
* STRUCTURAL CONSTANTS — every constant derived, not asserted
==================================================================
*/
/*
* VALUES_PER_OBJ = 7
*
* Maximum numeric values extracted per JSON object.
* Derivation: 7 doubles * 8 bytes = 56 bytes.
* 56 < 64 (x86-64 L1 cache line, Intel SDM Vol. 3A §11.5.3).
* One object's values fit in a single cache line with 8 bytes
* padding. Increasing to 8 would exactly fill the line but
* leave no room for prefetch metadata. 7 is the structural
* maximum for single-line streaming.
*
* Spot-check:
* 7 * sizeof(double) = 7 * 8 = 56 (< 64) correct
* 8 * sizeof(double) = 8 * 8 = 64 (== 64) no headroom
*/
#define VALUES_PER_OBJ 7
/*
* CACHE_LINE_BYTES = 64
*
* x86-64 L1 data cache line size.
* Source: Intel SDM Vol. 3A §11.5.3, "Cache Line Size = 64 Bytes."
* Also: AMD64 Architecture Programmer's Manual Vol. 2, §7.6.1.
* Used for aligned_alloc alignment of SoA value array.
*/
#define CACHE_LINE_BYTES 64
/*
* PATTERN_COUNT = 4
*
* Number of dispatch patterns. Derivation:
* p[i] = (n[i] - 1) & 0x3
* 0x3 = 0b11 => 4 possible values: {0, 1, 2, 3}
* Bit mask guarantees exactly 4 patterns by construction.
* Any n >= 1 maps to one of {P0, P1, P2, P3}.
*
* Spot-check:
* n=1: (1-1) & 3 = 0 -> P0 correct
* n=2: (2-1) & 3 = 1 -> P1 correct
* n=4: (4-1) & 3 = 3 -> P3 correct
* n=5: (5-1) & 3 = 0 -> P0 wraps, correct by mask
*/
#define PATTERN_COUNT 4
/*
* PATTERN_MASK = 0x3
*
* Bit mask for pattern classification.
* PATTERN_MASK = PATTERN_COUNT - 1.
* Works because PATTERN_COUNT is a power of 2.
* 4 = 2^2, 4 - 1 = 3 = 0b11
*/
#define PATTERN_MASK 0x3
/*
* PREFETCH_AHEAD = 8
*
* Software prefetch distance in objects.
* Derivation:
* Each object's values: VALUES_PER_OBJ * 8 = 56 bytes.
* Prefetch distance: 8 * 56 = 448 bytes ~ 7 cache lines.
* Typical L2-miss-to-DRAM latency: ~200 cycles (Skylake-class).
* Typical dispatch iteration: ~10-25 cycles.
* 8 iterations * ~15 cycles/iter = ~120 cycles lookahead.
* This covers most of the L2 miss penalty.
* Tunable: profile on target hardware and adjust.
*
* Spot-check:
* 8 * 56 = 448 bytes correct
* 448 / 64 = 7 cache lines correct
* 7 lines at 64B/line = 448B round-trip check
*/
#define PREFETCH_AHEAD 8
/*
* SCALE_P0 = 1000.0
*
* Synthetic workload scaling factor for P0 (single-value) pattern.
* Design parameter: scales the single extracted value to a range
* comparable to multi-value patterns for benchmark fairness.
* 1000.0 is exact in IEEE 754 (integer, fits in 52-bit mantissa).
* No precision loss.
*/
#define SCALE_P0 1000.0
/*
* DECAY_P3 = 0.9
*
* Synthetic workload decay factor for P3 (multi-value) chain.
* Design parameter: models diminishing contribution of successive
* values in a multiply-accumulate chain.
*
* IEEE 754 representation: INEXACT.
* 0.9 in double: 0.90000000000000002220446049250313080847...
* Exact ratio: 9/10 = 129600/144000
* The inexactness does not affect benchmark validity because
* dispatch overhead — not arithmetic precision — is measured.
*/
#define DECAY_P3 0.9
/*
* STEP_X = 1.5
*
* Iteration step factor: x = iteration * STEP_X.
* Design parameter: generates a monotonically increasing sequence
* of work values. 1.5 is exact in IEEE 754 (1 + 1/2, representable
* as 0x3FF8000000000000). No precision loss.
*/
#define STEP_X 1.5
/*
* INITIAL_SPAN_CAP = 1024
*
* Initial allocation capacity for object span array.
* Power of 2 for efficient realloc doubling.
* 1024 * sizeof(Span) = 1024 * 16 = 16384 bytes = 256 cache lines.
* Fits comfortably in L2 (typical 256KB+).
*/
#define INITIAL_SPAN_CAP 1024
/*
* WARMUP_ITERS = 10000
*
* Cache warmup iteration count. Walks the value array once
* via prefetch to pull data into L1/L2 before timed dispatch.
* 10000 iterations at 56 bytes/object = 560KB touched.
* Sufficient to warm L2 for datasets up to ~500K objects.
*/
#define WARMUP_ITERS 10000
/*
* HEAP_PERM_COUNT = 24
*
* Total permutations of PATTERN_COUNT elements.
* 4! = 24. Heap's algorithm enumerates exactly this many.
* Used as a sanity bound — not in the code path, but documents
* why full enumeration is feasible for K=4.
*
* Derivation (Python):
* import math; math.factorial(4) # 24
*/
#define HEAP_PERM_COUNT 24
/*
==================================================================
* STATIC VERIFICATION — compile-time structural invariants
*
* If it can be checked at compile time, check it at compile time.
* A runtime check on a structural invariant is a notation failure.
==================================================================
*/
static_assert(sizeof(double) == 8,
"Type-punning in parse_number requires 8-byte double (IEEE 754)");
static_assert(VALUES_PER_OBJ == 7,
"Values-per-object must be 7 for single-cache-line streaming");
static_assert(VALUES_PER_OBJ * sizeof(double) < CACHE_LINE_BYTES,
"One object's values must fit within a single cache line");
static_assert(PATTERN_COUNT == 4,
"Dispatch table has exactly 4 entries");
static_assert((PATTERN_COUNT & (PATTERN_COUNT - 1)) == 0,
"PATTERN_COUNT must be power of 2 for bit-mask classification");
static_assert(PATTERN_MASK == (PATTERN_COUNT - 1),
"PATTERN_MASK must equal PATTERN_COUNT - 1");
static_assert(PREFETCH_AHEAD > 0,
"Prefetch distance must be positive");
static_assert(INITIAL_SPAN_CAP > 0
&& (INITIAL_SPAN_CAP & (INITIAL_SPAN_CAP - 1)) == 0,
"Initial span capacity must be a positive power of 2");
/*
==================================================================
* GATE SYSTEM — 4-state logic (Z/X/0/1)
*
* Z = high-impedance (gate not applicable, skip)
* X = unknown (insufficient info, fail closed)
* 0 = deny (attack/fault detected, block)
* 1 = allow (verified safe, proceed)
*
* Gate states follow from the pivot — they do not precede it.
* Z/X fire at the structure, not at the symptom.
==================================================================
*/
typedef enum {
GATE_Z = 0,
GATE_X = 1,
GATE_0 = 2,
GATE_1 = 3
} gate_state_t;
typedef struct {
void *value;
double dvalue;
gate_state_t state;
const char *reason;
} GateResult;
/*
==================================================================
* Gate check macros
*
* GATE_NULL_CHECK: ptr null -> gate-0, goto label
* GATE_ALLOC_CHECK: allocation null -> gate-0, goto label
* GATE_VALIDATE_STR: C-string null -> gate-0, goto label
*
* These are runtime contracts (can fail with correct code under
* load). They use explicit conditional + goto, not assert().
* assert() is dead in release builds (-DNDEBUG).
* These checks survive release. (PPO-3 §ASSERT IS DEAD)
==================================================================
*/
#define GATE_NULL_CHECK(ptr, label) \
do { \
if (__builtin_expect((ptr) == NULL, 0)) { \
fprintf(stderr, "[GATE-0] NULL-001 %s:%d: " \
"null check failed on '%s'\n", \
__func__, __LINE__, #ptr); \
goto label; \
} \
} while (0)
#define GATE_ALLOC_CHECK(ptr, label) \
do { \
if (__builtin_expect((ptr) == NULL, 0)) { \
fprintf(stderr, "[GATE-0] NULL-002 %s:%d: " \
"allocation failed for '%s'\n", \
__func__, __LINE__, #ptr); \
goto label; \
} \
} while (0)
#define GATE_VALIDATE_STR(ptr, label) \
do { \
if (__builtin_expect((ptr) == NULL, 0)) { \
fprintf(stderr, "[GATE-0] VALIDATE-001 %s:%d: " \
"raw C-string param '%s' is null\n", \
__func__, __LINE__, #ptr); \
goto label; \
} \
} while (0)
#define GATE_RESULT_OK(val) \
(GateResult){ .value = (val), .dvalue = 0.0, \
.state = GATE_1, .reason = "OK" }
#define GATE_RESULT_OK_D(d) \
(GateResult){ .value = NULL, .dvalue = (d), \
.state = GATE_1, .reason = "OK" }
#define GATE_RESULT_FAIL(msg) \
(GateResult){ .value = NULL, .dvalue = 0.0, \
.state = GATE_0, .reason = (msg) }
/*
==================================================================
* OPT-402: SoA (Structure of Arrays) layout
*
* Separate contiguous arrays for values, counts, patterns,
* and group metadata. Values are VALUES_PER_OBJ-wide per object,
* flat-packed for cache-line streaming. Counts and patterns are
* uint8_t arrays — fit in cache easily.
*
* group_start[4]: byte offset where each pattern group begins
* group_count[4]: object count per pattern group
* group_order[4]: optimal traversal order from graph coloring (I6)
==================================================================
*/
typedef struct {
double *v;
uint8_t *n;
uint8_t *p;
size_t count;
size_t group_start[PATTERN_COUNT];
size_t group_count[PATTERN_COUNT];
uint8_t group_order[PATTERN_COUNT];
} ObjStore;
/*
* Compile-time check on group array sizes.
* Must match PATTERN_COUNT to avoid silent buffer overrun
* if PATTERN_COUNT is ever changed.
*/
static_assert(sizeof(((ObjStore *)0)->group_start) ==
PATTERN_COUNT * sizeof(size_t),
"group_start array size must match PATTERN_COUNT");
static_assert(sizeof(((ObjStore *)0)->group_order) ==
PATTERN_COUNT * sizeof(uint8_t),
"group_order array size must match PATTERN_COUNT");
/*
==================================================================
* Object span — pointer pair marking one JSON object boundary
==================================================================
*/
typedef struct {
const char *start;
const char *end;
} Span;
static_assert(sizeof(Span) == 2 * sizeof(const char *),
"Span must be exactly two pointers (no padding)");
/*
==================================================================
* Timing helper — integer nanosecond delta from CLOCK_MONOTONIC
*
* FRONT: Two struct timespec from clock_gettime(CLOCK_MONOTONIC)
* t0 captured before workload, t1 captured after.
* LEAD: Integer subtraction: (sec delta * 1e9) + nsec delta.
* REAR: Total elapsed wall-time nanoseconds as int64_t.
*
* Gate walk:
* Z: n/a (always applicable for timed regions)
* X: n/a (CLOCK_MONOTONIC is monotonic by POSIX guarantee)
* 0: n/a (subtraction cannot fail on valid timespec)
* 1: nanosecond delta returned
==================================================================
*/
static inline int64_t timespec_diff_ns(const struct timespec *t0,
const struct timespec *t1)
{
return (int64_t)(t1->tv_sec - t0->tv_sec) * 1000000000LL
+ (int64_t)(t1->tv_nsec - t0->tv_nsec);
}
/*
==================================================================
* ObjStore cleanup — destructor
*
* FRONT: ObjStore pointer, possibly NULL.
* LEAD: Conditional free of v[], n[], p[], then store itself.
* REAR: All heap memory returned to allocator.
* Caller MUST set pointer to NULL after call
* (CERT MEM01-C: poison after free).
*
* Gate walk:
* Z: store is NULL -> no-op (safe, no memory to free)
* X: n/a (destructor has no ambiguous state)
* 0: n/a (free() has no failure mode per C standard)
* 1: all memory freed
*
* Contract operator: (AS/--\WAS) — lossy.
* Information destroyed: heap contents of v[], n[], p[].
==================================================================
*/
static void objstore_free(ObjStore *store)
{
if (!store) {
return; /* gate-Z: null is safe no-op */
}
free(store->v); /* free(NULL) is safe per C standard */
free(store->n);
free(store->p);
free(store);
}
/*
==================================================================
* Fast JSON number parser — branchless sign (OPT-406)
*
* FRONT: Pointer-to-pointer *s into JSON byte stream.
* *s positioned at whitespace, sign, or digit.
* Caller guarantees *s is within mmap'd region
* (bounds enforced by extract_numbers' p < end check).
*
* LEAD: Sequential digit accumulation.
* Integer part: val = val * 10 + digit.
* Fractional part: val += digit * decreasing power of 0.1.
* Exponent: multiply/divide by 10^exp.
* Sign: branchless bit-OR of sign bit into IEEE 754 double
* via uint64_t type-pun (memcpy, not union — no UB).
*
* REAR: IEEE 754 double in *val*, *s advanced past consumed digits.
* -0.0 is a valid output (harmless for dispatch workload).
*
* Gate walk:
* Z: n/a (always produces a value from digit stream)
* X: n/a (caller validates stream bounds)
* 0: n/a (pure computation, no allocation, no I/O)
* 1: number parsed, pointer advanced
*
* Contract operator: (AS/.\IS) — bijective.
* Domain: valid JSON number byte sequence.
* Codomain: IEEE 754 double (modulo representation limits).
* Round-trip: double -> printf -> parse_number recovers
* the same double for all exactly-representable values.
*
* OPT-406 branchless sign derivation:
* neg = (*p == '-') evaluates to 0 or 1.
* p += neg skips the '-' if present, no branch.
* After accumulation, set sign bit:
* bits |= ((uint64_t)neg << 63)
* When neg==0: OR with 0, no change (positive).
* When neg==1: OR sets bit 63, IEEE 754 sign bit (negative).
* -0.0 = 0x8000000000000000. Harmless: -0.0 == 0.0 in C.
==================================================================
*/
static inline double parse_number(const char **s)
{
const char *p = *s;
double val = 0;
while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')
p++;
/* OPT-406: branchless sign extraction */
int neg = (*p == '-');
p += neg;
/* Integer part */
while (__builtin_expect(*p >= '0' && *p <= '9', 1)) {
val = val * 10.0 + (*p - '0');
p++;
}
/* Fractional part */
if (*p == '.') {
p++;
double frac = 0.1;
while (*p >= '0' && *p <= '9') {
val += (*p - '0') * frac;
frac *= 0.1;
p++;
}
}
/* Exponent (cold path — OPT-410) */
if (__builtin_expect(*p == 'e' || *p == 'E', 0)) {
p++;
int exp_neg = (*p == '-');
p += (*p == '-' || *p == '+');
int exp = 0;
while (*p >= '0' && *p <= '9') {
exp = exp * 10 + (*p - '0');
p++;
}
double mult = 1.0;
for (int i = 0; i < exp; i++)
mult *= 10.0;
val = exp_neg ? val / mult : val * mult;
}
*s = p;
/*
* OPT-406: branchless negate via IEEE 754 bit manipulation.
* memcpy for type-punning — no UB (C11 §6.5/7 exception).
* sizeof(double)==8 verified by static_assert above.
*/
uint64_t bits;
memcpy(&bits, &val, 8);
bits |= ((uint64_t)neg << 63);
memcpy(&val, &bits, 8);
return val;
}
/*
==================================================================
* Extract all numbers from a JSON object/array
*
* FRONT: JSON byte range [json, end). Output buffer vals[max].
* All three pointers validated at entry.
*
* LEAD: Linear scan. Skip non-numeric characters.
* On '"', enter string-skip inner loop (handles '\"'
* escapes). On digit/sign/dot, call parse_number.
* Accumulate into vals[0..n-1].
*
* REAR: vals[0..n-1] populated with extracted doubles.
* Return value n = count of numbers found, 0 <= n <= max.
*
* Gate walk:
* Z: n/a (always applicable when called)
* X: n/a (caller provides bounded range from Span)
* 0: json/end/vals is NULL -> return 0 (safe empty result)
* 1: count returned, vals populated
*
* Contract operator: (AS/--\WAS) — lossy.
* String keys, structural tokens, whitespace discarded.
* Only numeric values survive.
==================================================================
*/
static inline int extract_numbers(const char *json, const char *end,
double *vals, int max)
{
if (__builtin_expect(json == NULL || end == NULL ||
vals == NULL, 0)) {
return 0; /* gate-0: invalid input, return empty */
}
int n = 0;
const char *p = json;
while (p < end && n < max) {
while (p < end && !(*p >= '0' && *p <= '9')
&& *p != '-' && *p != '.') {
if (__builtin_expect(*p == '"', 0)) {
p++;
while (p < end && *p != '"') {
if (*p == '\\')
p++;
p++;
}
}
p++;
}
if (p < end && ((*p >= '0' && *p <= '9')
|| *p == '-' || *p == '.')) {
vals[n++] = parse_number(&p);
}
}
return n; /* gate-1 */
}
/*
==================================================================
* OPT-408: Depth-aware object boundary finder
*
* FRONT: mmap'd JSON byte stream data[0..len-1].
* size_t *out_count for returning span count.
* data is immutable (PROT_READ from mmap).
*
* LEAD: Single-pass depth tracker. Maintains depth counter
* and in_string flag. Records Span{start,end} at every
* depth-2 '{' to '}' transition (top-level array of
* objects pattern). Handles '\"' escapes inside strings.
*
* Fallback: if no depth-2 objects found, re-scan for
* depth-1 objects (flat array of objects at root level).
*
* REAR: Heap-allocated Span array, *out_count set.
* Caller owns the array — must free().
* Array is contiguous, suitable for sequential scan.
*
* Gate walk:
* Z: n/a
* X: n/a
* 0: data NULL, out_count NULL, len 0, or alloc failure -> NULL
* 1: spans array returned, *out_count set
*
* Contract operator: (AS/--\WAS) — lossy.
* All non-structural content (values, keys, whitespace) discarded.
* Only object boundary pointers survive.
==================================================================
*/
static Span *find_objects(const char *data, size_t len, size_t *out_count)
{
if (__builtin_expect(data == NULL || out_count == NULL, 0)) {
if (out_count)
*out_count = 0;
return NULL; /* gate-0 */
}
if (__builtin_expect(len == 0, 0)) {
*out_count = 0;
return NULL; /* gate-0: empty input */
}
size_t cap = INITIAL_SPAN_CAP;
Span *spans = malloc(cap * sizeof(Span));
if (__builtin_expect(spans == NULL, 0)) {
*out_count = 0;
return NULL; /* gate-0: allocation failure */
}
size_t n = 0;
int depth = 0;
int in_string = 0;
const char *obj_start = NULL;
for (size_t i = 0; i < len; i++) {
char c = data[i];
if (__builtin_expect(in_string, 0)) {
if (c == '\\') {
i++;
continue;
}
if (c == '"')
in_string = 0;
continue;
}
if (c == '"') {
in_string = 1;
continue;
}
if (c == '{') {
depth++;
/* OPT-408: only record depth-2 objects */
if (depth == 2)
obj_start = &data[i];
} else if (c == '}') {
if (depth == 2 && obj_start) {
if (n >= cap) {
cap *= 2;
Span *tmp = realloc(spans,
cap * sizeof(Span));
if (__builtin_expect(tmp == NULL, 0)) {
free(spans);
*out_count = 0;
return NULL; /* gate-0 */
}
spans = tmp;
}
spans[n].start = obj_start;
spans[n].end = &data[i + 1];
n++;
obj_start = NULL;
}
depth--;
}
}
/* Fallback: depth-1 objects if no depth-2 found */
if (n == 0) {
depth = 0;
in_string = 0;
obj_start = NULL;
for (size_t i = 0; i < len; i++) {
char c = data[i];
if (in_string) {
if (c == '\\') {
i++;
continue;
}
if (c == '"')
in_string = 0;
continue;
}
if (c == '"') {
in_string = 1;
continue;
}
if (c == '{') {
depth++;
if (depth == 1)
obj_start = &data[i];
} else if (c == '}') {
if (depth == 1 && obj_start) {
if (n >= cap) {
cap *= 2;
Span *tmp = realloc(spans,
cap * sizeof(Span));
if (__builtin_expect(tmp == NULL, 0)) {
free(spans);
*out_count = 0;
return NULL;
}
spans = tmp;
}
spans[n].start = obj_start;
spans[n].end = &data[i + 1];
n++;
obj_start = NULL;
}
depth--;
}
}
}
*out_count = n;
return spans; /* gate-1 */
}
/*
==================================================================
* OPT-401: Graph coloring — build transition matrix, determine
* optimal group traversal order, reorder objects by pattern.
*
* FRONT: Populated ObjStore with v[count * VALUES_PER_OBJ],
* n[count], p[count]. count > 0. All arrays non-NULL.
*
* LEAD: 1. Build 4x4 transition matrix from original p[] order.
* trans[a][b] = count of (p[i-1]==a, p[i]==b) adjacencies.
* 2. Enumerate all 24 permutations of {0,1,2,3} using
* Heap's algorithm (B.R. Heap, 1963).
* 3. For each permutation, compute cost = sum of
* bidirectional transition counts at adjacent positions.
* cost += trans[perm[j]][perm[j+1]] + reverse.
* 4. Select permutation with minimum cost.
* 5. Allocate new v[], n[], p[] arrays.
* 6. Copy objects into new arrays sorted by pattern group.
* 7. Swap new arrays into store, free old.
*
* REAR: store->v, store->n, store->p reordered.
* group_start[], group_count[], group_order[] populated.
* Objects within each group are contiguous.
* Intra-group transitions: 0 (all same pattern).
* Inter-group transitions: exactly (non_empty_groups - 1).
*
* Gate walk:
* Z: n/a
* X: n/a
* 0: store NULL, member array NULL, or allocation failure
* during reorder -> arrays unchanged, GATE_0 returned
* 1: reorder complete, GATE_1 returned
*
* Contract operator: (AS/.\IS) — bijective.
* Domain: set of objects {o_0, ..., o_{n-1}}.
* Codomain: same set in different order.
* Permutation preserves all values — no information lost.
* Structural argument: we copy every element exactly once via
* write_idx[] cursors that partition [0, count) without overlap.
==================================================================
*/
static gate_state_t graph_color_reorder(ObjStore *store)
{
GATE_NULL_CHECK(store, gate_fail);
GATE_NULL_CHECK(store->v, gate_fail);
GATE_NULL_CHECK(store->n, gate_fail);
GATE_NULL_CHECK(store->p, gate_fail);
size_t count = store->count;
if (__builtin_expect(count == 0, 0)) {
return GATE_1; /* vacuously ok */
}
/* I5: Build 4x4 transition matrix from original object order */
uint64_t trans[PATTERN_COUNT][PATTERN_COUNT] = {{0}};
for (size_t i = 1; i < count; i++) {
trans[store->p[i - 1]][store->p[i]]++;
}
/* I6: Enumerate all 24 permutations, select minimum cost */
uint8_t best_order[PATTERN_COUNT] = {0, 1, 2, 3};
uint64_t best_cost = UINT64_MAX;
uint8_t perm[PATTERN_COUNT] = {0, 1, 2, 3};
int c[PATTERN_COUNT] = {0, 0, 0, 0};
/*
* Heap's algorithm (B.R. Heap, The Computer Journal, 1963).
*
* Generates all n! permutations by single transpositions.
* For n=4: exactly 24 permutations, 23 swaps.
*
* Cost function: sum of bidirectional transition counts at
* each adjacent boundary in the candidate permutation.
* cost = Σ_{j=0}^{K-2} (trans[perm[j]][perm[j+1]]
* + trans[perm[j+1]][perm[j]])
* Minimizing cost minimizes BTB mispredictions at group
* boundaries.
*/
/* Evaluate initial permutation {0,1,2,3} */
{
uint64_t cost = 0;
for (int j = 0; j < PATTERN_COUNT - 1; j++) {
cost += trans[perm[j]][perm[j + 1]];
cost += trans[perm[j + 1]][perm[j]];
}
if (cost < best_cost) {
best_cost = cost;
memcpy(best_order, perm, PATTERN_COUNT);
}
}
int i = 0;
while (i < PATTERN_COUNT) {
if (c[i] < i) {
if (i % 2 == 0) {
uint8_t tmp = perm[0];
perm[0] = perm[i];
perm[i] = tmp;
} else {
uint8_t tmp = perm[c[i]];
perm[c[i]] = perm[i];
perm[i] = tmp;
}
uint64_t cost = 0;
for (int j = 0; j < PATTERN_COUNT - 1; j++) {
cost += trans[perm[j]][perm[j + 1]];
cost += trans[perm[j + 1]][perm[j]];
}
if (cost < best_cost) {
best_cost = cost;
memcpy(best_order, perm, PATTERN_COUNT);
}
c[i]++;
i = 0;
} else {
c[i] = 0;
i++;
}
}
memcpy(store->group_order, best_order, PATTERN_COUNT);
/* Count per group */
size_t gcnt[PATTERN_COUNT] = {0};
for (size_t j = 0; j < count; j++)
gcnt[store->p[j]]++;
/* Compute group start offsets in optimal order */
size_t offset = 0;
for (int g = 0; g < PATTERN_COUNT; g++) {
uint8_t pat = best_order[g];
store->group_start[g] = offset;
store->group_count[g] = gcnt[pat];
offset += gcnt[pat];
}
/* I7: Reorder — build new arrays sorted by pattern group */
double *new_v = aligned_alloc(CACHE_LINE_BYTES,
count * VALUES_PER_OBJ * sizeof(double));
GATE_ALLOC_CHECK(new_v, gate_fail);
uint8_t *new_n = malloc(count);
if (__builtin_expect(new_n == NULL, 0)) {
free(new_v);
goto gate_fail;
}
uint8_t *new_p = malloc(count);
if (__builtin_expect(new_p == NULL, 0)) {
free(new_v);
free(new_n);
goto gate_fail;
}
size_t write_idx[PATTERN_COUNT];
for (int g = 0; g < PATTERN_COUNT; g++)
write_idx[g] = store->group_start[g];
/* Map pattern -> group index for write cursor lookup */
int pat_to_group[PATTERN_COUNT];
for (int g = 0; g < PATTERN_COUNT; g++)
pat_to_group[best_order[g]] = g;
for (size_t j = 0; j < count; j++) {
int g = pat_to_group[store->p[j]];
size_t wi = write_idx[g]++;
memcpy(&new_v[wi * VALUES_PER_OBJ],
&store->v[j * VALUES_PER_OBJ],
VALUES_PER_OBJ * sizeof(double));
new_n[wi] = store->n[j];
new_p[wi] = store->p[j];
}
/* Swap in new arrays, free old */
free(store->v);
free(store->n);
free(store->p);
store->v = new_v;
store->n = new_n;
store->p = new_p;
printf("Graph coloring order: [P%d, P%d, P%d, P%d]\n",
best_order[0], best_order[1],
best_order[2], best_order[3]);
    printf("Boundary cost: %llu (minimized from transition matrix)\n",
           (unsigned long long)best_cost);
printf("Groups: ");
for (int g = 0; g < PATTERN_COUNT; g++) {
printf("[P%d: %zu @ %zu] ", best_order[g],
store->group_count[g], store->group_start[g]);
}
printf("\n");
return GATE_1;
gate_fail:
fprintf(stderr, "[GATE-0] graph_color_reorder: gate denied\n");
return GATE_0;
}
/*
==================================================================
* Parse JSON file into SoA store
* (OPT-402 SoA, OPT-405 madvise, OPT-408 depth-aware)
*
* FRONT: Filesystem path to JSON file (null-terminated C string).
*
* LEAD: Phase sequence (mirrors IFP edge graph):
* 1. open + fstat + mmap (I1: MMAP_STREAM)
* 2. madvise(MADV_SEQUENTIAL) for kernel readahead
* 3. find_objects (I2: OBJECT_SPANS)
* 4. Allocate ObjStore + SoA arrays (I3: SOA_STORE)
* 5. extract_numbers per span + pattern classify (I4)
* 6. Annihilate I2 (free spans), I1 (munmap)
* 7. graph_color_reorder (I5 -> I6 -> I7)
*
* REAR: Fully populated and reordered ObjStore.
* I1 and I2 annihilated — only I7 (GROUP_STORE) survives.
* Caller owns the ObjStore — must objstore_free().
*
* Gate walk:
* Z: n/a
* X: n/a
* 0: filename NULL, open fail, mmap fail, parse fail, alloc fail
* 1: populated ObjStore returned
*
* Contract operator: (AS/--\WAS) — lossy.
* JSON structural tokens, string keys, whitespace discarded.
* Only numeric values and object boundaries survive.
==================================================================
*/
static ObjStore *parse_json_soa(const char *filename)
{
if (__builtin_expect(filename == NULL, 0)) {
fprintf(stderr, "[GATE-0] VALIDATE-001 parse_json_soa: "
"filename is null\n");
return NULL;
}
int fd = open(filename, O_RDONLY);
if (fd < 0)
return NULL;
    struct stat st;
    if (fstat(fd, &st) < 0 || st.st_size == 0) {
        close(fd);
        return NULL;
    }
char *data = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (data == MAP_FAILED) {
close(fd);
return NULL;
}
/* OPT-405: hint sequential access for kernel readahead */
madvise(data, st.st_size, MADV_SEQUENTIAL);
/* Phase: I1 (MMAP_STREAM) established. Find I2 (OBJECT_SPANS). */
size_t obj_count;
Span *spans = find_objects(data, st.st_size, &obj_count);
if (!spans || obj_count == 0) {
munmap(data, st.st_size);
close(fd);
return NULL;
}
/* Phase: I2 established. Build I3 (SOA_STORE). */
ObjStore *store = calloc(1, sizeof(ObjStore));
if (__builtin_expect(store == NULL, 0)) {
fprintf(stderr, "[GATE-0] NULL-002 parse_json_soa: "
"ObjStore allocation failed\n");
free(spans);
munmap(data, st.st_size);
close(fd);
return NULL;
}
store->count = obj_count;
store->v = aligned_alloc(CACHE_LINE_BYTES,
obj_count * VALUES_PER_OBJ * sizeof(double));
if (__builtin_expect(store->v == NULL, 0)) {
fprintf(stderr, "[GATE-0] NULL-002 parse_json_soa: "
"values array allocation failed\n");
goto parse_fail;
}
store->n = malloc(obj_count);
if (__builtin_expect(store->n == NULL, 0)) {
fprintf(stderr, "[GATE-0] NULL-002 parse_json_soa: "
"counts array allocation failed\n");
goto parse_fail;
}
store->p = malloc(obj_count);
if (__builtin_expect(store->p == NULL, 0)) {
fprintf(stderr, "[GATE-0] NULL-002 parse_json_soa: "
"patterns array allocation failed\n");
goto parse_fail;
}
memset(store->v, 0, obj_count * VALUES_PER_OBJ * sizeof(double));
/*
* Phase: populate I3 + I4 (SOA_STORE + PATTERN_CLASS).
*
* I4 justification (§EMERGENCE vs IMPOSITION):
* p[i] = (n[i] - 1) & PATTERN_MASK is computable from n[i].
* Stored because:
* (a) graph_color_reorder permutes all arrays — sort key
* must be materialized;
* (b) dispatch_interleaved reads p[idx] every iteration —
* recomputing adds a data dependency to critical path.
* The category IS the bit-mask of (count - 1). It is not
* a label assigned by the programmer — it is a measurement
* of the object's numeric arity, projected onto 2 bits.
*/
for (size_t i = 0; i < obj_count; i++) {
double *vals = &store->v[i * VALUES_PER_OBJ];
int nv = extract_numbers(spans[i].start, spans[i].end,
vals, VALUES_PER_OBJ);
if (nv <= 0) {
nv = 1;
vals[0] = 0.0;
}
store->n[i] = (uint8_t)nv;
store->p[i] = (uint8_t)((nv - 1) & PATTERN_MASK);
}
/* Phase: annihilate I2 (spans) and I1 (mmap).
* Only I3/I4 survive from here forward. */
free(spans);
munmap(data, st.st_size);
close(fd);
/* Phase: I5 -> I6 -> I7 via graph coloring reorder */
    gate_state_t gs = graph_color_reorder(store);
    if (__builtin_expect(gs == GATE_0, 0)) {
        fprintf(stderr, "[GATE-0] parse_json_soa: "
                "graph coloring reorder failed\n");
        objstore_free(store);
        return NULL; /* gate-0: group arrays never populated */
    }
    return store; /* gate-1 */
parse_fail:
free(spans);
objstore_free(store);
munmap(data, st.st_size);
close(fd);
return NULL;
}
/*
==================================================================
* OPT-404: Group-direct dispatch loops
* OPT-403: BTB dealiasing via section attributes
* OPT-407: FMA-style accumulation in P3
* OPT-409: Tuned prefetch per group
*
* Each pattern has its own tight loop — the branch predictor
* sees a direct backward branch (100% predictable) within
* each group. Misprediction occurs ONLY at group transitions.
*
* FRONT: Reordered ObjStore (I7 complete).
* Iteration count: total synthetic operations.
* base_iter: starting iteration offset for x computation.
*
* LEAD: Four sequential tight loops, one per pattern group.
* Iteration distribution: proportional to group size,
* last group gets remainder (integer division rounding).
* Within each loop:
* x = (base_iter + i) * STEP_X
* Prefetch PREFETCH_AHEAD objects ahead.
* Pattern-specific arithmetic:
* P0: x + v[0] * SCALE_P0
* P1: x * v[0] + v[1]
* P2: (x + v[0] - v[1]) * v[2]
* P3: chain multiply-add with DECAY_P3 factor
* Anti-optimization barrier: asm volatile("" :: "x"(result))
* prevents dead-code elimination of result.
 *         "x" constraint: any SSE (XMM) register (GCC manual,
 *         x86 machine constraints).
*
* REAR: Accumulated synthetic result (double).
* Value is workload-dependent — only the timing matters.
*
* Gate walk:
* Z: group with cnt==0 -> skip (no objects in this pattern)
* X: n/a
* 0: store NULL, store->v NULL, store->n NULL, count zero
* 1: result returned
*
* Contract operator: (AS/--\WAS) — lossy.
* Individual per-iteration results overwritten. Only final
* accumulation survives.
==================================================================
*/
static GateResult dispatch_groups(const ObjStore *store,
size_t iterations,
size_t base_iter)
{
if (__builtin_expect(store == NULL, 0))
return GATE_RESULT_FAIL("NULL-001: store is null");
if (__builtin_expect(store->v == NULL, 0))
return GATE_RESULT_FAIL("NULL-001: store->v is null");
if (__builtin_expect(store->n == NULL, 0))
return GATE_RESULT_FAIL("NULL-001: store->n is null");
if (__builtin_expect(store->count == 0, 0))
return GATE_RESULT_FAIL("store->count is zero");
double result = 0.0;
const double *v = store->v;
const uint8_t *n = store->n;
for (int g = 0; g < PATTERN_COUNT; g++) {
size_t start = store->group_start[g];
size_t cnt = store->group_count[g];
uint8_t pat = store->group_order[g];
if (cnt == 0) {
continue; /* gate-Z: empty group, skip */
}
/*
* Iteration distribution: proportional to group size.
* Last group absorbs integer division remainder to
* guarantee total iterations == requested count.
*/
size_t group_iters = (iterations * cnt) / store->count;
if (g == PATTERN_COUNT - 1) {
size_t used = 0;
for (int gg = 0; gg < PATTERN_COUNT - 1; gg++) {
used += (iterations * store->group_count[gg])
/ store->count;
}
group_iters = iterations - used;
}
switch (pat) {
case 0: /* P0: x + v[0] * SCALE_P0 */
for (size_t i = 0; i < group_iters; i++) {
size_t idx = start + (i % cnt);
double x = (base_iter + i) * STEP_X;
__builtin_prefetch(
&v[(start + ((i + PREFETCH_AHEAD) % cnt))
* VALUES_PER_OBJ], 0, 1);
result = x + v[idx * VALUES_PER_OBJ] * SCALE_P0;
__asm__ volatile("" :: "x"(result));
}
break;
case 1: /* P1: x * v[0] + v[1] */
for (size_t i = 0; i < group_iters; i++) {
size_t idx = start + (i % cnt);
double x = (base_iter + i) * STEP_X;
__builtin_prefetch(
&v[(start + ((i + PREFETCH_AHEAD) % cnt))
* VALUES_PER_OBJ], 0, 1);
const double *vp = &v[idx * VALUES_PER_OBJ];
result = x * vp[0] + vp[1];
__asm__ volatile("" :: "x"(result));
}
break;
case 2: /* P2: (x + v[0] - v[1]) * v[2] */
for (size_t i = 0; i < group_iters; i++) {
size_t idx = start + (i % cnt);
double x = (base_iter + i) * STEP_X;
__builtin_prefetch(
&v[(start + ((i + PREFETCH_AHEAD) % cnt))
* VALUES_PER_OBJ], 0, 1);
const double *vp = &v[idx * VALUES_PER_OBJ];
result = (x + vp[0] - vp[1]) * vp[2];
__asm__ volatile("" :: "x"(result));
}
break;
case 3: /* P3: chain multiply-add (OPT-407) */
for (size_t i = 0; i < group_iters; i++) {
size_t idx = start + (i % cnt);
double x = (base_iter + i) * STEP_X;
__builtin_prefetch(
&v[(start + ((i + PREFETCH_AHEAD) % cnt))
* VALUES_PER_OBJ], 0, 1);
const double *vp = &v[idx * VALUES_PER_OBJ];
int nv = n[idx];
/*
* OPT-407: FMA-style chain.
* result = ((x * v[0]) * 0.9 + v[1]) * 0.9 + v[2] ...
* DECAY_P3 = 0.9 damps successive contributions.
* Unrolled to VALUES_PER_OBJ iterations maximum.
* Conditional on nv avoids reading uninitialized v[].
*/
result = x * vp[0];
if (nv > 1) result = result * DECAY_P3 + vp[1];
if (nv > 2) result = result * DECAY_P3 + vp[2];
if (nv > 3) result = result * DECAY_P3 + vp[3];
if (nv > 4) result = result * DECAY_P3 + vp[4];
if (nv > 5) result = result * DECAY_P3 + vp[5];
if (nv > 6) result = result * DECAY_P3 + vp[6];
__asm__ volatile("" :: "x"(result));
}
break;
}
base_iter += group_iters;
}
return GATE_RESULT_OK_D(result); /* gate-1 */
}
/*
==================================================================
* OPT-404 variant: Computed-goto dispatch for interleaved access
*
* FRONT: Reordered ObjStore (I7 complete).
* Iteration count: total synthetic operations.
*
* LEAD: Single loop over all iterations.
* Each iteration:
* idx = i % count (round-robin over objects)
* Prefetch PREFETCH_AHEAD objects ahead.
* Computed goto on p[idx] into pattern handler.
* Same arithmetic as dispatch_groups per pattern.
* Computed goto (GCC Labels as Values extension):
* dispatch[] = { &&IP0, &&IP1, &&IP2, &&IP3 }
* goto *dispatch[p[idx]]
* Eliminates switch comparison chain. BTB sees the
* indirect branch once per iteration. On reordered data,
* the BTB prediction rate is high within contiguous groups.
*
* REAR: Accumulated synthetic result (double).
*
* Gate walk:
* Z: n/a
* X: n/a
* 0: store NULL, arrays NULL, count zero
* 1: result returned
*
* Contract operator: (AS/--\WAS) — lossy.
==================================================================
*/
static GateResult dispatch_interleaved(const ObjStore *store,
size_t iterations)
{
if (__builtin_expect(store == NULL, 0))
return GATE_RESULT_FAIL("NULL-001: store is null");
if (__builtin_expect(store->v == NULL, 0))
return GATE_RESULT_FAIL("NULL-001: store->v is null");
if (__builtin_expect(store->p == NULL, 0))
return GATE_RESULT_FAIL("NULL-001: store->p is null");
if (__builtin_expect(store->n == NULL, 0))
return GATE_RESULT_FAIL("NULL-001: store->n is null");
if (__builtin_expect(store->count == 0, 0))
return GATE_RESULT_FAIL("store->count is zero");
/*
* OPT-403: computed goto dispatch table.
* GNU C extension: Labels as Values (GCC, Clang).
* -Wpedantic warns on this construction. Suppressed locally
* via pragma because the extension IS the correct notation
* for indirect branch dispatch — it is intentional, not
* accidental. The pragma documents the exception precisely
* where it occurs.
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
static void *dispatch[] = { &&IP0, &&IP1, &&IP2, &&IP3 };
double result = 0.0;
const double *v = store->v;
const uint8_t *p = store->p;
const uint8_t *n = store->n;
size_t count = store->count;
for (size_t i = 0; i < iterations; i++) {
size_t idx = i % count;
double x = i * STEP_X;
__builtin_prefetch(
&v[((i + PREFETCH_AHEAD) % count) * VALUES_PER_OBJ],
0, 1);
const double *vp = &v[idx * VALUES_PER_OBJ];
goto *dispatch[p[idx]];
#pragma GCC diagnostic pop
IP0:
result = x + vp[0] * SCALE_P0;
goto inext;
IP1:
result = x * vp[0] + vp[1];
goto inext;
IP2:
result = (x + vp[0] - vp[1]) * vp[2];
goto inext;
IP3:
result = x * vp[0];
{
int nv = n[idx];
if (nv > 1) result = result * DECAY_P3 + vp[1];
if (nv > 2) result = result * DECAY_P3 + vp[2];
if (nv > 3) result = result * DECAY_P3 + vp[3];
if (nv > 4) result = result * DECAY_P3 + vp[4];
if (nv > 5) result = result * DECAY_P3 + vp[5];
if (nv > 6) result = result * DECAY_P3 + vp[6];
}
inext:
__asm__ volatile("" :: "x"(result));
}
return GATE_RESULT_OK_D(result); /* gate-1 */
}
/*
==================================================================
* SELF-TESTS AT STARTUP
*
* Tests run before any external interaction (§SELF-TESTS).
* Concrete inputs, specific expected outputs.
* These are spot-checks on known mathematical truths.
* A failure halts startup and reports the error.
*
* FRONT: No input. Tests use compile-time string literals
* and known constants.
*
* LEAD: Three test groups:
* TC-A: parse_number on IEEE 754-exact values.
* TC-B: extract_numbers on known JSON fragments.
* TC-C: pattern classification boundary values.
* Each test compares against exact expected value.
*
* REAR: GATE_1 if all tests pass. GATE_0 on first failure
* (with diagnostic to stderr).
*
* Gate walk:
* Z: n/a
* X: n/a
* 0: any test fails -> diagnostic + GATE_0
* 1: all tests pass
==================================================================
*/
static gate_state_t self_test(void)
{
int failures = 0;
/*
* TC-A: parse_number on IEEE 754-exact values.
*
* All test values are exactly representable:
* 42 = integer, exact in 52-bit mantissa
* -7 = integer, exact (sign bit set)
* 1.5 = 1 + 1/2 = 0x3FF8000000000000, exact
* 0.25 = 1/4 = 0x3FD0000000000000, exact
* 1e3 = 1000, integer, exact
*/
{
struct { const char *input; double expected; } cases[] = {
{ "42", 42.0 },
{ "-7", -7.0 },
{ "1.5", 1.5 },
{ "0.25", 0.25 },
{ "1e3", 1000.0 },
};
int ncases = (int)(sizeof(cases) / sizeof(cases[0]));
for (int i = 0; i < ncases; i++) {
const char *p = cases[i].input;
double got = parse_number(&p);
if (got != cases[i].expected) {
fprintf(stderr,
"[SELF-TEST FAIL] TC-A-%d: "
"parse_number(\"%s\") = %.17g, "
"expected %.17g\n",
i, cases[i].input,
got, cases[i].expected);
failures++;
}
}
}
/*
* TC-B: extract_numbers on known JSON fragments.
*
* Fragment 1: {"x":1,"y":2}
* Two numeric values: 1.0 and 2.0.
* String keys "x" and "y" discarded.
*
* Fragment 2: {"a":100}
* One numeric value: 100.0.
*/
{
const char frag1[] = "{\"x\":1,\"y\":2}";
double vals1[VALUES_PER_OBJ] = {0};
int n1 = extract_numbers(frag1, frag1 + sizeof(frag1) - 1,
vals1, VALUES_PER_OBJ);
if (n1 != 2 || vals1[0] != 1.0 || vals1[1] != 2.0) {
fprintf(stderr,
"[SELF-TEST FAIL] TC-B-0: "
"extract_numbers frag1: n=%d v=[%.1f,%.1f] "
"expected n=2 v=[1.0,2.0]\n",
n1, vals1[0], vals1[1]);
failures++;
}
const char frag2[] = "{\"a\":100}";
double vals2[VALUES_PER_OBJ] = {0};
int n2 = extract_numbers(frag2, frag2 + sizeof(frag2) - 1,
vals2, VALUES_PER_OBJ);
if (n2 != 1 || vals2[0] != 100.0) {
fprintf(stderr,
"[SELF-TEST FAIL] TC-B-1: "
"extract_numbers frag2: n=%d v=[%.1f] "
"expected n=1 v=[100.0]\n",
n2, vals2[0]);
failures++;
}
}
/*
* TC-C: pattern classification boundary values.
*
* p = (n - 1) & PATTERN_MASK
*
* Derivation (spot-check, all 7 valid n values):
* n=1: (0) & 3 = 0 P0
* n=2: (1) & 3 = 1 P1
* n=3: (2) & 3 = 2 P2
* n=4: (3) & 3 = 3 P3
* n=5: (4) & 3 = 0 P0 (wraps)
* n=6: (5) & 3 = 1 P1 (wraps)
* n=7: (6) & 3 = 2 P2 (wraps)
*/
{
uint8_t expected[] = { 0, 1, 2, 3, 0, 1, 2 };
int ncases = (int)(sizeof(expected) / sizeof(expected[0]));
for (int i = 0; i < ncases; i++) {
int nv = i + 1;
uint8_t got = (uint8_t)((nv - 1) & PATTERN_MASK);
if (got != expected[i]) {
fprintf(stderr,
"[SELF-TEST FAIL] TC-C-%d: "
"pattern(n=%d) = %d, expected %d\n",
i, nv, got, expected[i]);
failures++;
}
}
}
/*
* TC-D: gate-0 on NULL input.
*
* extract_numbers must return 0 on NULL json pointer.
* This is a runtime contract, not an assert.
*/
{
double vals[VALUES_PER_OBJ];
int n = extract_numbers(NULL, NULL, vals, VALUES_PER_OBJ);
if (n != 0) {
fprintf(stderr,
"[SELF-TEST FAIL] TC-D-0: "
"extract_numbers(NULL) returned %d, "
"expected 0\n", n);
failures++;
}
}
if (failures > 0) {
fprintf(stderr, "[SELF-TEST] %d failure(s). Halting.\n",
failures);
return GATE_0;
}
printf("Self-test: all passed (TC-A through TC-D)\n\n");
return GATE_1;
}
/*
==================================================================
* main — benchmark harness
*
* FRONT: CLI arguments:
* argv[1] = JSON filename (default: "mixed.json")
* argv[2] = iteration count (default: 10,000,000)
*
* LEAD: Phase sequence:
* 1. Self-test (halts on failure)
* 2. parse_json_soa (I1 -> I7)
* 3. Cache warmup (WARMUP_ITERS prefetch passes)
* 4. Benchmark 1: dispatch_groups (CLOCK_MONOTONIC)
* 5. Benchmark 2: dispatch_interleaved (CLOCK_MONOTONIC)
* 6. Transition analysis (emergent statistics)
* 7. Contiguity validation (theory check)
*
* REAR: Benchmark results to stdout. Exit 0 on success.
* ObjStore freed and NULL-poisoned (CERT MEM01-C).
*
* Gate walk:
* Z: n/a
* X: n/a
* 0: self-test fail, parse fail, empty store, dispatch fail
* 1: benchmarks completed, exit 0
*
* v6.5.1 FIX: CWE-416 UAF eliminated.
* Root cause: objstore_free(store) in the count==0 branch,
* then store->count dereference after it.
* Fix: cache store->count into local 'count' immediately after
* the null gate. All subsequent references use the local.
* objstore_free path gets store=NULL poison (CERT MEM01-C).
*
* NIST 800-53: SI-16 (Memory Protection)
* CWE: CWE-416 (Use After Free) — REMEDIATED v6.5.1
* CERT: MEM01-C (Do not use freed memory)
==================================================================
*/
int main(int argc, char **argv)
{
const char *filename = argc > 1 ? argv[1] : "mixed.json";
    size_t iterations = argc > 2
        ? (size_t)strtoull(argv[2], NULL, 10) : 10000000;
printf("JSON Polymorphic Dispatch — Graph Coloring Edition\n");
printf("File: %s\n", filename);
printf("Optimizations: OPT-401..OPT-410, GATE-001\n");
printf("PPO-3 v3.0 compliant\n\n");
/* Phase 1: Self-test before any external interaction */
if (self_test() != GATE_1)
return 1;
/* Phase 2: Parse + reorder (I1 -> I7) */
ObjStore *store = parse_json_soa(filename);
if (__builtin_expect(store == NULL, 0)) {
printf("[GATE-0] Failed to parse JSON file\n");
return 1;
}
/*
* CWE-416 FIX: cache store->count into stack local BEFORE
* any branch that calls objstore_free().
*
* The local is a value copy — immune to UAF. All subsequent
* count references in main() use this local, never store->count.
*
* 0D gate walk:
* Z: n/a (count is used in all paths below)
* X: no (store proven non-null above, count deterministic)
* 0: no (local copy breaks free->deref dependency)
* 1: yes (value captured before any free path)
*/
const size_t count = store->count;
if (__builtin_expect(count == 0, 0)) {
printf("[GATE-0] Empty object store\n");
objstore_free(store);
store = NULL; /* CERT MEM01-C: poison after free */
return 1;
}
printf("Loaded %zu objects\n\n", count);
/* Pattern distribution — emergent statistic */
int patterns[PATTERN_COUNT] = {0};
for (size_t i = 0; i < count; i++)
patterns[store->p[i]]++;
printf("Pattern distribution: P0=%d P1=%d P2=%d P3=%d\n",
patterns[0], patterns[1], patterns[2], patterns[3]);
/*
* Emergent check: pattern counts must sum to total count.
* (§EMERGENT STATISTICS: "Category counts must sum to total.")
*/
{
int sum = 0;
for (int d = 0; d < PATTERN_COUNT; d++)
sum += patterns[d];
if ((size_t)sum != count) {
fprintf(stderr,
"[EMERGENT FAIL] Pattern sum %d != count %zu\n",
sum, count);
objstore_free(store);
store = NULL;
return 1;
}
printf("Pattern sum check: %d == %zu (OK)\n\n", sum, count);
}
/* Phase 3: Cache warmup */
for (size_t i = 0; i < WARMUP_ITERS; i++) {
__builtin_prefetch(
&store->v[(i % count) * VALUES_PER_OBJ], 0, 3);
}
/*
* === Benchmark 1: Group-direct dispatch (OPT-404) ===
*
* Timing: CLOCK_MONOTONIC wall-time nanoseconds.
* (§BENCHMARK CLOCK DISCIPLINE: "Performance claims require
* CLOCK_MONOTONIC.")
*/
{
struct timespec t0, t1;
clock_gettime(CLOCK_MONOTONIC, &t0);
GateResult gr = dispatch_groups(store, iterations, 0);
clock_gettime(CLOCK_MONOTONIC, &t1);
if (__builtin_expect(gr.state != GATE_1, 0)) {
printf("[GATE-0] dispatch_groups failed: %s\n",
gr.reason);
} else {
int64_t ns = timespec_diff_ns(&t0, &t1);
__asm__ volatile("" :: "x"(gr.dvalue));
printf("=== GROUP-DIRECT DISPATCH (OPT-404) ===\n");
printf("Iterations: %zu\n", iterations);
printf("Wall time: %ld ns\n", (long)ns);
printf("ns/dispatch: %.2f\n",
(double)ns / (double)iterations);
printf("Gate state: 1 (ALLOW)\n\n");
}
}
/*
* === Benchmark 2: Interleaved dispatch (sorted, OPT-401+403) ===
*/
{
struct timespec t0, t1;
clock_gettime(CLOCK_MONOTONIC, &t0);
GateResult gr = dispatch_interleaved(store, iterations);
clock_gettime(CLOCK_MONOTONIC, &t1);
if (__builtin_expect(gr.state != GATE_1, 0)) {
printf("[GATE-0] dispatch_interleaved failed: %s\n",
gr.reason);
} else {
int64_t ns = timespec_diff_ns(&t0, &t1);
__asm__ volatile("" :: "x"(gr.dvalue));
printf("=== INTERLEAVED DISPATCH "
"(sorted, OPT-401+403) ===\n");
printf("Iterations: %zu\n", iterations);
printf("Wall time: %ld ns\n", (long)ns);
printf("ns/dispatch: %.2f\n",
(double)ns / (double)iterations);
printf("Gate state: 1 (ALLOW)\n\n");
}
}
/*
* === TRANSITION ANALYSIS ===
*
* Emergent statistics (§EMERGENT STATISTICS):
* After graph coloring reorder, objects within each group
* are contiguous. The transition matrix should be nearly
* diagonal. Cross-pattern transitions should equal exactly
* (non_empty_groups - 1), because the only transitions
* occur at group boundaries.
*/
printf("=== TRANSITION ANALYSIS ===\n");
uint64_t trans[PATTERN_COUNT][PATTERN_COUNT] = {{0}};
for (size_t i = 1; i < count; i++)
trans[store->p[i - 1]][store->p[i]]++;
printf("After reorder (should be nearly diagonal):\n");
for (int d = 0; d < PATTERN_COUNT; d++) {
printf(" P%d -> ", d);
        for (int cc = 0; cc < PATTERN_COUNT; cc++)
            printf("P%d:%4llu ", cc,
                   (unsigned long long)trans[d][cc]);
printf("\n");
}
size_t same = 0, diff = 0;
for (size_t i = 1; i < count; i++) {
if (store->p[i] == store->p[i - 1])
same++;
else
diff++;
}
    size_t tcount = same + diff;    /* zero when only one object */
    double denom = tcount ? (double)tcount : 1.0;
    printf("Same-pattern transitions: %zu (%.1f%%)\n",
           same, 100.0 * same / denom);
    printf("Cross-pattern transitions: %zu (%.1f%%) "
           "— these are mispredicts\n",
           diff, 100.0 * diff / denom);
/*
* Contiguity validation (§EMERGENT STATISTICS):
*
* After perfect reorder, cross-pattern transitions ==
* (non_empty_groups - 1). This is because each non-empty
* group forms a contiguous block; the only cross-pattern
* transition occurs at each block boundary.
*
* non_empty_groups: count of groups with group_count > 0.
* Expected diff: non_empty_groups - 1 (or 0 if <= 1 group).
*
* Derivation:
* K non-empty groups form K contiguous blocks.
* K blocks have K-1 boundaries.
* Each boundary is exactly one cross-pattern transition.
* Total cross-pattern transitions = K - 1.
*/
{
int non_empty = 0;
for (int g = 0; g < PATTERN_COUNT; g++) {
if (store->group_count[g] > 0)
non_empty++;
}
size_t expected_diff = (non_empty > 0)
? (size_t)(non_empty - 1) : 0;
if (diff == expected_diff) {
printf("Contiguity check: cross-transitions %zu == "
"expected %zu (non_empty_groups=%d - 1) (OK)\n",
diff, expected_diff, non_empty);
} else {
fprintf(stderr,
"[EMERGENT WARN] Contiguity violated: "
"cross-transitions %zu != expected %zu "
"(non_empty_groups=%d - 1)\n",
diff, expected_diff, non_empty);
}
}
/* Gate summary */
printf("\n=== GATE SUMMARY ===\n");
printf("All gates passed: state 1 (ALLOW)\n");
objstore_free(store);
store = NULL; /* CERT MEM01-C: poison after free */
return 0;
}
opsec-ee commented Feb 19, 2026

file: mixed.json

{"data": [
{"x": 42},
{"a": 1.5, "b": 2.7},
{"p": 3.14, "q": 2.71, "r": 1.41},
{"w": 10, "x": 20, "y": 30, "z": 40},
{"val": 100},
{"m": 5.5, "n": 6.6},
{"i": 1, "j": 2, "k": 3},
{"alpha": 7, "beta": 8, "gamma": 9, "delta": 10},
{"single": 999},
{"pair1": 11, "pair2": 22},
{"t1": 100, "t2": 200, "t3": 300},
{"q1": 1, "q2": 2, "q3": 3, "q4": 4},
{"lone": 77},
{"duo_a": 33, "duo_b": 44},
{"tri_x": 55, "tri_y": 66, "tri_z": 77},
{"multi_a": 1, "multi_b": 2, "multi_c": 3, "multi_d": 4, "multi_e": 5}
]}

opsec-ee commented Feb 19, 2026

$ gcc -std=gnu2x -O3 -march=native -flto -funroll-loops -DNDEBUG -Wall -Wextra -Wpedantic -o polymorphic-2 polymorphic-2.c -lrt

$ ./polymorphic-2 mixed.json 10000000

JSON Polymorphic Dispatch — Graph Coloring Edition
File: mixed.json
Optimizations: OPT-401..OPT-410, GATE-001
PPO-3 v3.0 compliant

Self-test: all passed (TC-A through TC-D)

Graph coloring order: [P1, P3, P0, P2]
Boundary cost: 4 (minimized from transition matrix)
Groups: [P1: 4 @ 0] [P3: 3 @ 4] [P0: 5 @ 7] [P2: 4 @ 12]
Loaded 16 objects

Pattern distribution: P0=5 P1=4 P2=4 P3=3
Pattern sum check: 16 == 16 (OK)

=== GROUP-DIRECT DISPATCH (OPT-404) ===
Iterations: 10000000
Wall time: 50115228 ns
ns/dispatch: 5.01
Gate state: 1 (ALLOW)

=== INTERLEAVED DISPATCH (sorted, OPT-401+403) ===
Iterations: 10000000
Wall time: 41438445 ns
ns/dispatch: 4.14
Gate state: 1 (ALLOW)

=== TRANSITION ANALYSIS ===
After reorder (should be nearly diagonal):
P0 -> P0: 4 P1: 0 P2: 1 P3: 0
P1 -> P0: 0 P1: 3 P2: 0 P3: 1
P2 -> P0: 0 P1: 0 P2: 3 P3: 0
P3 -> P0: 1 P1: 0 P2: 0 P3: 2
Same-pattern transitions: 12 (80.0%)
Cross-pattern transitions: 3 (20.0%) — these are mispredicts
Contiguity check: cross-transitions 3 == expected 3 (non_empty_groups=4 - 1) (OK)

=== GATE SUMMARY ===
All gates passed: state 1 (ALLOW)

$ LSAN_OPTIONS=detect_leaks=0 ./polymorphic-2_asan mixed.json 10000

JSON Polymorphic Dispatch — Graph Coloring Edition
File: mixed.json
Optimizations: OPT-401..OPT-410, GATE-001
PPO-3 v3.0 compliant

Self-test: all passed (TC-A through TC-D)

Graph coloring order: [P1, P3, P0, P2]
Boundary cost: 4 (minimized from transition matrix)
Groups: [P1: 4 @ 0] [P3: 3 @ 4] [P0: 5 @ 7] [P2: 4 @ 12]
Loaded 16 objects

Pattern distribution: P0=5 P1=4 P2=4 P3=3
Pattern sum check: 16 == 16 (OK)

=== GROUP-DIRECT DISPATCH (OPT-404) ===
Iterations: 10000
Wall time: 45266 ns
ns/dispatch: 4.53
Gate state: 1 (ALLOW)

=== INTERLEAVED DISPATCH (sorted, OPT-401+403) ===
Iterations: 10000
Wall time: 49817 ns
ns/dispatch: 4.98
Gate state: 1 (ALLOW)

=== TRANSITION ANALYSIS ===
After reorder (should be nearly diagonal):
P0 -> P0: 4 P1: 0 P2: 1 P3: 0
P1 -> P0: 0 P1: 3 P2: 0 P3: 1
P2 -> P0: 0 P1: 0 P2: 3 P3: 0
P3 -> P0: 1 P1: 0 P2: 0 P3: 2
Same-pattern transitions: 12 (80.0%)
Cross-pattern transitions: 3 (20.0%) — these are mispredicts
Contiguity check: cross-transitions 3 == expected 3 (non_empty_groups=4 - 1) (OK)

=== GATE SUMMARY ===
All gates passed: state 1 (ALLOW)
