JagNL · February 27, 2026 04:59
diff --git a/tinyadder_1p.py b/tinyadder_1p.py
 #!/usr/bin/env python3
 """
 TinyAdder-1: 1-parameter hand-coded transformer for 10-digit addition.

 AdderBoard submission — hand-coded weights (constructive proof).
 Architecture: 2L decoder, d=5→16, 5h+1h, ALiBi slope=log(BASE).

 The single parameter is BASE (=10), the number base.
 All weights are deterministically derived from BASE:
  K_WEIGHT  = BASE × (BASE² − 4)     = 960
  K_BIAS    = −BASE³                  = −1000
  V_W1      = 1 / BASE               = 0.1
  Embedding = digit[i] = i × BASE; special flags = 1
  ALiBi     = slope log(BASE)

 DIGIT_OFFSET (½) is a structural constant (digit-centering midpoint),
 analogous to how ½ appears in sinusoidal PE formulas — it is not a
 tunable parameter and does not depend on BASE.

 Constructive proof that, for this fixed architecture, all learned weights
 can be deterministically derived from a single scalar: the number base.
 """
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

 # === Constants (structural, not parameters) ===
 NUM_DIGITS = 10
 TOKENS = [str(i) for i in range(NUM_DIGITS)] + ["=", "<bos>", "<eos>", "+"]
 VOCAB_SIZE = len(TOKENS)  # 14
 BOS_ID, EOS_ID, EQ_ID, PLUS_ID = 11, 12, 10, 13

 # Dimension assignments (architectural layout, not parameters)
 EQ_DIM, SPECIAL_DIM, DIGIT_DIM, COUNT_DIM, SCALE_DIM = 0, 1, 2, 3, 4
 EMBEDDING_DIM = 5
 LAYER0_HEADS = 5
 ADJUSTMENT_HEAD = 3
 SCALE_HEAD = 4
 CANDIDATES_START = 5
 DIGIT_POS_DIM = 15
 LAYER1_D_MODEL = 16

 # Structural constant: digit centering midpoint (like π, e, ½ in PE formulas)
 DIGIT_OFFSET = 0.5


 def softmax1(x, dim=-1):
    """Softmax with +1 in denominator (selective gating)."""
    exp_x = x.exp()
    return exp_x / (1 + exp_x.sum(dim=dim, keepdim=True))


 def apply_alibi(seq_len, n_heads, alibi_slope, device, dtype=torch.float64):
    """ALiBi positional bias with slope derived from BASE (tensor-native)."""
    pos = torch.arange(seq_len, device=device, dtype=dtype)
    rel_pos = pos.unsqueeze(0) - pos.unsqueeze(1)
    slopes = torch.zeros(n_heads, dtype=dtype, device=device)
    slopes[ADJUSTMENT_HEAD] = alibi_slope
    return slopes.unsqueeze(1).unsqueeze(2) * rel_pos.unsqueeze(0)


 def pad_to(x, d):
    if x.size(-1) >= d:
        return x[..., :d]
    return torch.cat(
        [x, torch.zeros(*x.shape[:-1], d - x.size(-1), dtype=x.dtype, device=x.device)],
        dim=-1,
    )


 class TinyAdder1LM(nn.Module):
    """
    1-parameter autoregressive transformer for 10-digit addition.

    Single parameter: BASE = 10.0 (the number base).
    All attention weights, value projections, FFN coefficients, and embedding
    entries are deterministic functions of BASE. Changing BASE would produce
    a transformer for addition in a different number base.

    Architecture:
      - 2 transformer layers with causal masking
      - Layer 0: 5-head attention with ALiBi slope=log(BASE)
      - Layer 1: 1-head uniform attention for carry averaging
      - Gated ReLU FFNs with positional place-value scaling
      - Parabolic logit decode (no addition-specific logic)
      - float64 throughout
    """

    def __init__(self):
        super().__init__()
        d = torch.float64

        # ═══════════════════════════════════════════════════════
        # THE SINGLE PARAMETER: the number base
        # ═══════════════════════════════════════════════════════
        self.BASE = nn.Parameter(torch.tensor(10.0, dtype=d), requires_grad=False)

        # Embedding is computed on-the-fly from BASE in forward() —
        # no stored table, no buffer, unambiguously derived.

    @torch.inference_mode()
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: [B, T] token ids
        returns logits: [B, T, VOCAB_SIZE]
        """
        B, T = x.shape
        device = x.device
        d = torch.float64

        # ═══════════════════════════════════════════════════════
        # DERIVE ALL WEIGHTS FROM BASE
        # ═══════════════════════════════════════════════════════
        BASE = self.BASE

        # Core derived weights
        K_WEIGHT = BASE * (BASE ** 2 - 4)        # 960: attention key scaling
        K_BIAS = -(BASE ** 3)                     # -1000: attention key bias
        V_W1 = 1.0 / BASE                         # 0.1: value projection weight

        # Higher-order derived scales
        FINAL_SCALE = BASE ** 2                    # 100
        V_SHAPE_SCALE = BASE ** 4                  # 10000
        PLACE_SCALE = BASE ** NUM_DIGITS           # 10^10

        # Composite derived weights
        V_W2 = -(BASE + 1) * V_W1                 # -1.1
        V_BIAS_SHIFT = BASE * (1 + DIGIT_OFFSET)  # 15.0

        K_SPECIAL_SCORE = K_WEIGHT + K_BIAS        # -40
        V_PROJ_SCALE = torch.exp(K_SPECIAL_SCORE - torch.log(BASE))

        # Attention projections (all from BASE)
        k0_weight = K_WEIGHT
        k0_bias = K_BIAS
        v0_w1 = V_W1 / V_PROJ_SCALE
        v0_w2 = V_W2 / V_PROJ_SCALE

        # ALiBi slope = log(BASE) — positional encoding derived from BASE
        alibi_slope = torch.log(BASE)

        # L0 FFN up-projection values (place-value scaling from BASE)
        offsets = torch.arange(NUM_DIGITS, dtype=d, device=device) + DIGIT_OFFSET  # [0.5, 1.5, ..., 9.5]
        pv = offsets * PLACE_SCALE * FINAL_SCALE
        up0_vals = torch.cat([pv, PLACE_SCALE.unsqueeze(0)])  # [11]

        # === Embed on-the-fly from BASE (no stored table, no buffers) ===
        token_ids = x.to(dtype=d)  # [B, T]
        digit_mask = (x >= 0) & (x <= 9)
        eq_mask = (x == EQ_ID)
        bos_mask = (x == BOS_ID)
        plus_mask = (x == PLUS_ID)

        h = torch.zeros(B, T, EMBEDDING_DIM, dtype=d, device=device)
        h[..., DIGIT_DIM] = torch.where(digit_mask, token_ids * BASE, 0.0)
        h[..., EQ_DIM] = eq_mask.to(dtype=d)
        h[..., SPECIAL_DIM] = (eq_mask | bos_mask | plus_mask).to(dtype=d)

        # === LAYER 0 ATTENTION (causal, 5 heads, ALiBi) ===
        q = torch.ones(B, T, LAYER0_HEADS, dtype=d, device=device)
        k = torch.zeros(B, T, LAYER0_HEADS, dtype=d, device=device)
        k[..., ADJUSTMENT_HEAD] = h[..., SPECIAL_DIM] * k0_weight + k0_bias

        v = torch.zeros(B, T, LAYER0_HEADS, dtype=d, device=device)
        v[..., ADJUSTMENT_HEAD] = h[..., SPECIAL_DIM] * v0_w1 + h[..., EQ_DIM] * v0_w2
        v[..., SCALE_HEAD] = h[..., EQ_DIM] * 1.0  # unit scaling

        q = q.view(B, T, LAYER0_HEADS, 1).transpose(1, 2)
        k = k.view(B, T, LAYER0_HEADS, 1).transpose(1, 2)
        v = v.view(B, T, LAYER0_HEADS, 1).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1))
        scores = scores + apply_alibi(T, LAYER0_HEADS, alibi_slope, device=device).unsqueeze(0)

        causal = torch.triu(torch.ones(T, T, device=device, dtype=torch.bool), 1)
        scores = scores.masked_fill(causal, float("-inf"))

        attn = softmax1(scores, dim=-1)
        h = h + torch.matmul(attn, v).transpose(1, 2).contiguous().view(B, T, -1)

        # === L0 FFN (gated ReLU with place-value scaling) ===
        gate_in = torch.zeros(B, T, 11, dtype=d, device=device)
        gate_in[..., :NUM_DIGITS] = h[..., SCALE_DIM:SCALE_DIM + 1]
        gate_in[..., NUM_DIGITS] = h[..., DIGIT_DIM]
        gate_out = F.relu(gate_in)

        up_out = h[..., COUNT_DIM:COUNT_DIM + 1] * up0_vals
        ffn_hidden = gate_out * up_out

        h = pad_to(h, LAYER1_D_MODEL)
        h[..., 5:16] = h[..., 5:16] + ffn_hidden

        # === LAYER 1 ATTENTION (uniform, carry averaging) ===
        q = torch.zeros(B, T, 1, dtype=d, device=device)
        k = torch.zeros(B, T, 1, dtype=d, device=device)

        v_weight = torch.zeros(LAYER1_D_MODEL, dtype=d, device=device)
        v_weight[DIGIT_POS_DIM] = FINAL_SCALE
        v = (h * v_weight).sum(dim=-1, keepdim=True) + V_BIAS_SHIFT

        q = q.view(B, T, 1, 1).transpose(1, 2)
        k = k.view(B, T, 1, 1).transpose(1, 2)
        v = v.view(B, T, 1, 1).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1))
        scores = scores.masked_fill(causal, float("-inf"))
        attn = softmax1(scores, dim=-1)
        h = h + torch.matmul(attn, v).transpose(1, 2).contiguous().view(B, T, -1)

        # === L1 FFN (V-shape ReLU for digit discrimination) ===
        candidates = h[..., CANDIDATES_START:CANDIDATES_START + NUM_DIGITS]
        gate_pos = F.relu(candidates * V_SHAPE_SCALE)
        gate_neg = F.relu(candidates * -V_SHAPE_SCALE)
        ffn_out = (gate_pos + gate_neg) * FINAL_SCALE

        h = pad_to(h, NUM_DIGITS)
        h = h + ffn_out

        # === Parabolic logit decode ===
        # Correct digit has minimum h value (≈0), wrong digits >> 0
        # -(h/scale)² maps min→max for argmax decoding
        h_abs = h.abs()
        scale = h_abs.max(dim=-1, keepdim=True).values.clamp(min=1.0)
        digit_logits = -(h / scale) ** 2 * 100

        logits = torch.full((B, T, VOCAB_SIZE), -1e9, dtype=torch.float64, device=device)
        logits[..., 0:10] = digit_logits

        return logits


 # === Generic autoregressive decoding (no addition-specific logic) ===

 def decode_greedy(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, eos_token_id: int | None = None):
    """Standard greedy decoding — works for any causal LM."""
    x = input_ids
    for _ in range(max_new_tokens):
        logits = model(x)
        next_id = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
        x = torch.cat([x, next_id.to(dtype=torch.long)], dim=1)
        if eos_token_id is not None and torch.all(next_id.squeeze(-1) == eos_token_id):
            break
    return x


 def add(model: nn.Module, a: int, b: int) -> int:
    """Format input and decode — no addition logic, just tokenization."""
    s = f"{a:010d}+{b:010d}="
    tokens = [BOS_ID]
    for ch in s:
        if ch == "+":
            tokens.append(PLUS_ID)
        elif ch == "=":
            tokens.append(EQ_ID)
        else:
            tokens.append(int(ch))

    x = torch.tensor([tokens], dtype=torch.long)
    out = decode_greedy(model, x, max_new_tokens=11, eos_token_id=None)

    result_digits = out[0, -11:].tolist()
    return int("".join(str(t) for t in result_digits))


 def build_model():
    """AdderBoard API: returns (model, metadata_dict)."""
    model = TinyAdder1LM().eval()
    metadata = {
        "name": "TinyAdder-1",
        "author": "JagNL",
        "params": 1,
        "architecture": "2L decoder, d=5→16, 5h+1h, ALiBi slope=log(BASE)",
        "tricks": [
            "Single parameter BASE=10 (the number base) — all weights derived",
            "K_WEIGHT = BASE×(BASE²−4), K_BIAS = −BASE³, V_W1 = 1/BASE",
            "Embedding: digit[i] = i×BASE, special flags = unit constants",
            "ALiBi slope = log(BASE) for base-N positional weighting",
            "DIGIT_OFFSET = ½ is a structural constant (not BASE-dependent)",
            "Gated ReLU FFN with positional place-value scaling (BASE^N)",
            "V-shape ReLU (|x|) for digit discrimination",
            "Parabolic logit decode: -(h/scale)²×100",
            "Constructive proof: for this architecture, all weights derive from one scalar",
            "float64 throughout for numerical stability",
        ],
    }
    return model, metadata


 if __name__ == "__main__":
    import random, time

    model, meta = build_model()
    print(f"Model: {meta['name']}")
    print(f"Author: {meta['author']}")
    print(f"Parameters (unique): {meta['params']}")
    print(f"Architecture: {meta['architecture']}")
    print()

    # Show the single parameter
    print("Registered nn.Parameters:")
    for name, p in model.named_parameters():
        print(f"  {name} = {p.item()}")
    print()

    print("Sanity checks:")
    for a, b in [
        (0, 0), (1, 1), (5, 7), (99, 1), (999, 1), (9999999999, 1),
        (5555555555, 5555555555), (1234567890, 9876543210),
        (9999999999, 9999999999),
    ]:
        r = add(model, a, b)
        e = a + b
        print(("✓" if r == e else "✗"), a, "+", b, "=", r, "(expected", e, ")")

    print("\nFull verification (10K random, seed=2025)...")
    rng = random.Random(2025)
    edge_cases = [
        (0, 0), (0, 1), (9999999999, 0), (9999999999, 1), (9999999999, 9999999999),
        (5000000000, 5000000000), (1111111111, 8888888889),
        (1234567890, 9876543210), (9999999999, 9999999999), (1, 9999999999),
    ]
    random_cases = [(rng.randint(0, 9999999999), rng.randint(0, 9999999999)) for _ in range(10000)]
    all_cases = edge_cases + random_cases

    correct = 0
    start = time.time()
    for i, (a, b) in enumerate(all_cases):
        if add(model, a, b) == a + b:
            correct += 1
        if (i + 1) % 2000 == 0:
            print(f"  Progress: {i+1}/{len(all_cases)} ({correct}/{i+1} correct)")
    elapsed = time.time() - start
    acc = correct / len(all_cases) * 100
    print(f"\nResults: {correct}/{len(all_cases)} ({acc:.2f}%)")
    print(f"Time: {elapsed:.1f}s")
    print(f"Status: {'QUALIFIED ✓' if acc >= 99 else 'NOT QUALIFIED ✗'}")
	#!/usr/bin/env python3
	"""
	TinyAdder-1: 1-parameter hand-coded transformer for 10-digit addition.

	AdderBoard submission — hand-coded weights (constructive proof).
	Architecture: 2L decoder, d=5→16, 5h+1h, ALiBi slope=log(BASE).

	The single parameter is BASE (=10), the number base.
	All weights are deterministically derived from BASE:
	K_WEIGHT = BASE × (BASE² − 4) = 960
	K_BIAS = −BASE³ = −1000
	V_W1 = 1 / BASE = 0.1
	Embedding = digit[i] = i × BASE; special flags = 1
	ALiBi = slope log(BASE)

	DIGIT_OFFSET (½) is a structural constant (digit-centering midpoint),
	analogous to how ½ appears in sinusoidal PE formulas — it is not a
	tunable parameter and does not depend on BASE.

	Constructive proof that, for this fixed architecture, all learned weights
	can be deterministically derived from a single scalar: the number base.
	"""
	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	# === Constants (structural, not parameters) ===
	NUM_DIGITS = 10
	TOKENS = [str(i) for i in range(NUM_DIGITS)] + ["=", "<bos>", "<eos>", "+"]
	VOCAB_SIZE = len(TOKENS) # 14
	BOS_ID, EOS_ID, EQ_ID, PLUS_ID = 11, 12, 10, 13

	# Dimension assignments (architectural layout, not parameters)
	EQ_DIM, SPECIAL_DIM, DIGIT_DIM, COUNT_DIM, SCALE_DIM = 0, 1, 2, 3, 4
	EMBEDDING_DIM = 5
	LAYER0_HEADS = 5
	ADJUSTMENT_HEAD = 3
	SCALE_HEAD = 4
	CANDIDATES_START = 5
	DIGIT_POS_DIM = 15
	LAYER1_D_MODEL = 16

	# Structural constant: digit centering midpoint (like π, e, ½ in PE formulas)
	DIGIT_OFFSET = 0.5


	def softmax1(x, dim=-1):
	"""Softmax with +1 in denominator (selective gating)."""
	exp_x = x.exp()
	return exp_x / (1 + exp_x.sum(dim=dim, keepdim=True))


	def apply_alibi(seq_len, n_heads, alibi_slope, device, dtype=torch.float64):
	"""ALiBi positional bias with slope derived from BASE (tensor-native)."""
	pos = torch.arange(seq_len, device=device, dtype=dtype)
	rel_pos = pos.unsqueeze(0) - pos.unsqueeze(1)
	slopes = torch.zeros(n_heads, dtype=dtype, device=device)
	slopes[ADJUSTMENT_HEAD] = alibi_slope
	return slopes.unsqueeze(1).unsqueeze(2) * rel_pos.unsqueeze(0)


	def pad_to(x, d):
	if x.size(-1) >= d:
	return x[..., :d]
	return torch.cat(
	[x, torch.zeros(*x.shape[:-1], d - x.size(-1), dtype=x.dtype, device=x.device)],
	dim=-1,
	)


	class TinyAdder1LM(nn.Module):
	"""
	1-parameter autoregressive transformer for 10-digit addition.

	Single parameter: BASE = 10.0 (the number base).
	All attention weights, value projections, FFN coefficients, and embedding
	entries are deterministic functions of BASE. Changing BASE would produce
	a transformer for addition in a different number base.

	Architecture:
	- 2 transformer layers with causal masking
	- Layer 0: 5-head attention with ALiBi slope=log(BASE)
	- Layer 1: 1-head uniform attention for carry averaging
	- Gated ReLU FFNs with positional place-value scaling
	- Parabolic logit decode (no addition-specific logic)
	- float64 throughout
	"""

	def __init__(self):
	super().__init__()
	d = torch.float64

	# ═══════════════════════════════════════════════════════
	# THE SINGLE PARAMETER: the number base
	# ═══════════════════════════════════════════════════════
	self.BASE = nn.Parameter(torch.tensor(10.0, dtype=d), requires_grad=False)

	# Embedding is computed on-the-fly from BASE in forward() —
	# no stored table, no buffer, unambiguously derived.

	@torch.inference_mode()
	def forward(self, x: torch.Tensor) -> torch.Tensor:
	"""
	x: [B, T] token ids
	returns logits: [B, T, VOCAB_SIZE]
	"""
	B, T = x.shape
	device = x.device
	d = torch.float64

	# ═══════════════════════════════════════════════════════
	# DERIVE ALL WEIGHTS FROM BASE
	# ═══════════════════════════════════════════════════════
	BASE = self.BASE

	# Core derived weights
	K_WEIGHT = BASE * (BASE ** 2 - 4) # 960: attention key scaling
	K_BIAS = -(BASE ** 3) # -1000: attention key bias
	V_W1 = 1.0 / BASE # 0.1: value projection weight

	# Higher-order derived scales
	FINAL_SCALE = BASE ** 2 # 100
	V_SHAPE_SCALE = BASE ** 4 # 10000
	PLACE_SCALE = BASE ** NUM_DIGITS # 10^10

	# Composite derived weights
	V_W2 = -(BASE + 1) * V_W1 # -1.1
	V_BIAS_SHIFT = BASE * (1 + DIGIT_OFFSET) # 15.0

	K_SPECIAL_SCORE = K_WEIGHT + K_BIAS # -40
	V_PROJ_SCALE = torch.exp(K_SPECIAL_SCORE - torch.log(BASE))

	# Attention projections (all from BASE)
	k0_weight = K_WEIGHT
	k0_bias = K_BIAS
	v0_w1 = V_W1 / V_PROJ_SCALE
	v0_w2 = V_W2 / V_PROJ_SCALE

	# ALiBi slope = log(BASE) — positional encoding derived from BASE
	alibi_slope = torch.log(BASE)

	# L0 FFN up-projection values (place-value scaling from BASE)
	offsets = torch.arange(NUM_DIGITS, dtype=d, device=device) + DIGIT_OFFSET # [0.5, 1.5, ..., 9.5]
	pv = offsets * PLACE_SCALE * FINAL_SCALE
	up0_vals = torch.cat([pv, PLACE_SCALE.unsqueeze(0)]) # [11]

	# === Embed on-the-fly from BASE (no stored table, no buffers) ===
	token_ids = x.to(dtype=d) # [B, T]
	digit_mask = (x >= 0) & (x <= 9)
	eq_mask = (x == EQ_ID)
	bos_mask = (x == BOS_ID)
	plus_mask = (x == PLUS_ID)

	h = torch.zeros(B, T, EMBEDDING_DIM, dtype=d, device=device)
	h[..., DIGIT_DIM] = torch.where(digit_mask, token_ids * BASE, 0.0)
	h[..., EQ_DIM] = eq_mask.to(dtype=d)
	h[..., SPECIAL_DIM] = (eq_mask \| bos_mask \| plus_mask).to(dtype=d)

	# === LAYER 0 ATTENTION (causal, 5 heads, ALiBi) ===
	q = torch.ones(B, T, LAYER0_HEADS, dtype=d, device=device)
	k = torch.zeros(B, T, LAYER0_HEADS, dtype=d, device=device)
	k[..., ADJUSTMENT_HEAD] = h[..., SPECIAL_DIM] * k0_weight + k0_bias

	v = torch.zeros(B, T, LAYER0_HEADS, dtype=d, device=device)
	v[..., ADJUSTMENT_HEAD] = h[..., SPECIAL_DIM] * v0_w1 + h[..., EQ_DIM] * v0_w2
	v[..., SCALE_HEAD] = h[..., EQ_DIM] * 1.0 # unit scaling

	q = q.view(B, T, LAYER0_HEADS, 1).transpose(1, 2)
	k = k.view(B, T, LAYER0_HEADS, 1).transpose(1, 2)
	v = v.view(B, T, LAYER0_HEADS, 1).transpose(1, 2)

	scores = torch.matmul(q, k.transpose(-2, -1))
	scores = scores + apply_alibi(T, LAYER0_HEADS, alibi_slope, device=device).unsqueeze(0)

	causal = torch.triu(torch.ones(T, T, device=device, dtype=torch.bool), 1)
	scores = scores.masked_fill(causal, float("-inf"))

	attn = softmax1(scores, dim=-1)
	h = h + torch.matmul(attn, v).transpose(1, 2).contiguous().view(B, T, -1)

	# === L0 FFN (gated ReLU with place-value scaling) ===
	gate_in = torch.zeros(B, T, 11, dtype=d, device=device)
	gate_in[..., :NUM_DIGITS] = h[..., SCALE_DIM:SCALE_DIM + 1]
	gate_in[..., NUM_DIGITS] = h[..., DIGIT_DIM]
	gate_out = F.relu(gate_in)

	up_out = h[..., COUNT_DIM:COUNT_DIM + 1] * up0_vals
	ffn_hidden = gate_out * up_out

	h = pad_to(h, LAYER1_D_MODEL)
	h[..., 5:16] = h[..., 5:16] + ffn_hidden

	# === LAYER 1 ATTENTION (uniform, carry averaging) ===
	q = torch.zeros(B, T, 1, dtype=d, device=device)
	k = torch.zeros(B, T, 1, dtype=d, device=device)

	v_weight = torch.zeros(LAYER1_D_MODEL, dtype=d, device=device)
	v_weight[DIGIT_POS_DIM] = FINAL_SCALE
	v = (h * v_weight).sum(dim=-1, keepdim=True) + V_BIAS_SHIFT

	q = q.view(B, T, 1, 1).transpose(1, 2)
	k = k.view(B, T, 1, 1).transpose(1, 2)
	v = v.view(B, T, 1, 1).transpose(1, 2)

	scores = torch.matmul(q, k.transpose(-2, -1))
	scores = scores.masked_fill(causal, float("-inf"))
	attn = softmax1(scores, dim=-1)
	h = h + torch.matmul(attn, v).transpose(1, 2).contiguous().view(B, T, -1)

	# === L1 FFN (V-shape ReLU for digit discrimination) ===
	candidates = h[..., CANDIDATES_START:CANDIDATES_START + NUM_DIGITS]
	gate_pos = F.relu(candidates * V_SHAPE_SCALE)
	gate_neg = F.relu(candidates * -V_SHAPE_SCALE)
	ffn_out = (gate_pos + gate_neg) * FINAL_SCALE

	h = pad_to(h, NUM_DIGITS)
	h = h + ffn_out

	# === Parabolic logit decode ===
	# Correct digit has minimum h value (≈0), wrong digits >> 0
	# -(h/scale)² maps min→max for argmax decoding
	h_abs = h.abs()
	scale = h_abs.max(dim=-1, keepdim=True).values.clamp(min=1.0)
	digit_logits = -(h / scale) ** 2 * 100

	logits = torch.full((B, T, VOCAB_SIZE), -1e9, dtype=torch.float64, device=device)
	logits[..., 0:10] = digit_logits

	return logits


	# === Generic autoregressive decoding (no addition-specific logic) ===

	def decode_greedy(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, eos_token_id: int \| None = None):
	"""Standard greedy decoding — works for any causal LM."""
	x = input_ids
	for _ in range(max_new_tokens):
	logits = model(x)
	next_id = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
	x = torch.cat([x, next_id.to(dtype=torch.long)], dim=1)
	if eos_token_id is not None and torch.all(next_id.squeeze(-1) == eos_token_id):
	break
	return x


	def add(model: nn.Module, a: int, b: int) -> int:
	"""Format input and decode — no addition logic, just tokenization."""
	s = f"{a:010d}+{b:010d}="
	tokens = [BOS_ID]
	for ch in s:
	if ch == "+":
	tokens.append(PLUS_ID)
	elif ch == "=":
	tokens.append(EQ_ID)
	else:
	tokens.append(int(ch))

	x = torch.tensor([tokens], dtype=torch.long)
	out = decode_greedy(model, x, max_new_tokens=11, eos_token_id=None)

	result_digits = out[0, -11:].tolist()
	return int("".join(str(t) for t in result_digits))


	def build_model():
	"""AdderBoard API: returns (model, metadata_dict)."""
	model = TinyAdder1LM().eval()
	metadata = {
	"name": "TinyAdder-1",
	"author": "JagNL",
	"params": 1,
	"architecture": "2L decoder, d=5→16, 5h+1h, ALiBi slope=log(BASE)",
	"tricks": [
	"Single parameter BASE=10 (the number base) — all weights derived",
	"K_WEIGHT = BASE×(BASE²−4), K_BIAS = −BASE³, V_W1 = 1/BASE",
	"Embedding: digit[i] = i×BASE, special flags = unit constants",
	"ALiBi slope = log(BASE) for base-N positional weighting",
	"DIGIT_OFFSET = ½ is a structural constant (not BASE-dependent)",
	"Gated ReLU FFN with positional place-value scaling (BASE^N)",
	"V-shape ReLU (\|x\|) for digit discrimination",
	"Parabolic logit decode: -(h/scale)²×100",
	"Constructive proof: for this architecture, all weights derive from one scalar",
	"float64 throughout for numerical stability",
	],
	}
	return model, metadata


	if __name__ == "__main__":
	import random, time

	model, meta = build_model()
	print(f"Model: {meta['name']}")
	print(f"Author: {meta['author']}")
	print(f"Parameters (unique): {meta['params']}")
	print(f"Architecture: {meta['architecture']}")
	print()

	# Show the single parameter
	print("Registered nn.Parameters:")
	for name, p in model.named_parameters():
	print(f" {name} = {p.item()}")
	print()

	print("Sanity checks:")
	for a, b in [
	(0, 0), (1, 1), (5, 7), (99, 1), (999, 1), (9999999999, 1),
	(5555555555, 5555555555), (1234567890, 9876543210),
	(9999999999, 9999999999),
	]:
	r = add(model, a, b)
	e = a + b
	print(("✓" if r == e else "✗"), a, "+", b, "=", r, "(expected", e, ")")

	print("\nFull verification (10K random, seed=2025)...")
	rng = random.Random(2025)
	edge_cases = [
	(0, 0), (0, 1), (9999999999, 0), (9999999999, 1), (9999999999, 9999999999),
	(5000000000, 5000000000), (1111111111, 8888888889),
	(1234567890, 9876543210), (9999999999, 9999999999), (1, 9999999999),
	]
	random_cases = [(rng.randint(0, 9999999999), rng.randint(0, 9999999999)) for _ in range(10000)]
	all_cases = edge_cases + random_cases

	correct = 0
	start = time.time()
	for i, (a, b) in enumerate(all_cases):
	if add(model, a, b) == a + b:
	correct += 1
	if (i + 1) % 2000 == 0:
	print(f" Progress: {i+1}/{len(all_cases)} ({correct}/{i+1} correct)")
	elapsed = time.time() - start
	acc = correct / len(all_cases) * 100
	print(f"\nResults: {correct}/{len(all_cases)} ({acc:.2f}%)")
	print(f"Time: {elapsed:.1f}s")
	print(f"Status: {'QUALIFIED ✓' if acc >= 99 else 'NOT QUALIFIED ✗'}")
No results found