Skip to content

Instantly share code, notes, and snippets.

@JagNL
Created February 27, 2026 04:33
Show Gist options
  • Select an option

  • Save JagNL/72f25f6947751cd327491832c225a95a to your computer and use it in GitHub Desktop.

Select an option

Save JagNL/72f25f6947751cd327491832c225a95a to your computer and use it in GitHub Desktop.
TinyAdder-5: 5-parameter hand-coded transformer for 10-digit addition (AdderBoard submission)
#!/usr/bin/env python3
"""
TinyAdder-5: 5-parameter hand-coded transformer for 10-digit addition.
AdderBoard submission — hand-coded weights (constructive proof).
Architecture: 2L decoder, d=5→16, 5h+1h, ALiBi slope=log(10).
Unique parameters: BASE=10, K_WEIGHT=960, K_BIAS=-1000, V_W1=0.1, DIGIT_OFFSET=0.5
All other values (embedding, projections, FFN weights) are derived from these 5.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from math import log
# === Constants ===
NUM_DIGITS = 10  # operands are zero-padded to exactly 10 decimal digits
# Vocabulary order fixes the token ids: digits 0-9 at ids 0-9, then
# "=" -> 10, "<bos>" -> 11, "<eos>" -> 12, "+" -> 13.
TOKENS = [str(i) for i in range(NUM_DIGITS)] + ["=", "<bos>", "<eos>", "+"]
VOCAB_SIZE = len(TOKENS) # 14
# Ids below must agree with the TOKENS ordering above.
BOS_ID, EOS_ID, EQ_ID, PLUS_ID = 11, 12, 10, 13
# Dimension assignments
# Meaning of the 5 embedding channels used by layer 0.
EQ_DIM, SPECIAL_DIM, DIGIT_DIM, COUNT_DIM, SCALE_DIM = 0, 1, 2, 3, 4
EMBEDDING_DIM = 5     # layer-0 model width
LAYER0_HEADS = 5      # one scalar "channel" per head in layer 0
ADJUSTMENT_HEAD = 3   # only head that carries content + ALiBi bias in layer 0
SCALE_HEAD = 4        # head that copies the EQ_DIM signal forward
CANDIDATES_START = 5  # first of the 10 per-digit candidate channels after the L0 FFN
DIGIT_POS_DIM = 15    # channel read by the layer-1 value projection
LAYER1_D_MODEL = 16   # widened model dim after the L0 FFN (5 + 11)
ALIBI_CONSTANT = log(10)  # ALiBi slope; base-10 positional weighting
def softmax1(x, dim=-1):
    """Softmax with an extra +1 in the denominator ("quiet" softmax).

    softmax1(x)_i = exp(x_i) / (1 + sum_j exp(x_j)), so a row of very
    negative scores may attend to nothing (all outputs -> 0) instead of
    being forced to sum to 1.

    Numerically stabilized: shifting by m = max(max(x, dim), 0) is
    mathematically a no-op — exp(x - m) / (exp(-m) + sum exp(x - m)) —
    but prevents exp overflow for large positive scores. Clamping m at 0
    keeps fully-masked rows (all -inf) returning exact zeros, matching
    the unshifted formula.
    """
    # Row-wise shift; clamp so that all -inf rows use m = 0 (avoids inf - inf = nan).
    m = x.max(dim=dim, keepdim=True).values.clamp(min=0)
    exp_x = (x - m).exp()
    return exp_x / ((-m).exp() + exp_x.sum(dim=dim, keepdim=True))
def apply_alibi(seq_len, n_heads, device):
    """Build additive ALiBi biases of shape [n_heads, seq_len, seq_len].

    Only ADJUSTMENT_HEAD gets a nonzero slope (ALIBI_CONSTANT); all other
    heads receive an all-zero bias. Entry [h, i, j] = slope[h] * (j - i),
    so earlier keys (j < i) are penalized on the sloped head.
    """
    positions = torch.arange(seq_len, device=device)
    # offsets[i, j] = j - i via broadcasting a row against a column.
    offsets = positions.unsqueeze(0) - positions.unsqueeze(1)
    head_slopes = torch.zeros(n_heads, dtype=torch.float64, device=device)
    head_slopes[ADJUSTMENT_HEAD] = ALIBI_CONSTANT
    return head_slopes.view(n_heads, 1, 1) * offsets.unsqueeze(0)
def pad_to(x, d):
    """Return x with its last dimension resized to d: zero-padded on the
    right if too narrow, truncated if too wide (or already wide enough)."""
    width = x.size(-1)
    if width >= d:
        return x[..., :d]
    zeros = torch.zeros(*x.shape[:-1], d - width, dtype=x.dtype, device=x.device)
    return torch.cat([x, zeros], dim=-1)
class TinyAdder5LM(nn.Module):
    """
    Hand-coded 2-layer decoder whose weights are all derived from 5 scalars.

    Compliance version:
    - has self-attention
    - causal/autoregressive masking
    - forward(): tokens -> logits over vocab (no argmin inside)
    - decoding loop lives outside
    """
    def __init__(self):
        super().__init__()
        d = torch.float64
        # === THE 5 UNIQUE PARAMETERS (frozen) ===
        # Stored as Parameters (requires_grad=False) so they count as the
        # model's only parameters; everything else is a buffer or derived.
        self.BASE = nn.Parameter(torch.tensor(10.0, dtype=d), requires_grad=False)
        self.K_WEIGHT = nn.Parameter(torch.tensor(960.0, dtype=d), requires_grad=False)
        self.K_BIAS = nn.Parameter(torch.tensor(-1000.0, dtype=d), requires_grad=False)
        self.V_W1 = nn.Parameter(torch.tensor(0.1, dtype=d), requires_grad=False)
        self.DIGIT_OFFSET = nn.Parameter(torch.tensor(0.5, dtype=d), requires_grad=False)
        # === Embedding (dense buffer) ===
        # Built from BASE (fixed here); if you truly want BASE-swappability,
        # you'd rebuild this when BASE changes.
        base = float(self.BASE.item())
        digit_scale = base
        # Sparse layout: digit tokens 1..9 store i*BASE in DIGIT_DIM; "="
        # lights EQ_DIM and SPECIAL_DIM; <bos> and "+" light SPECIAL_DIM.
        # Rows for digit 0 and <eos> stay all-zero.
        emb_idx = [[i, DIGIT_DIM] for i in range(1, 10)]
        emb_idx += [[EQ_ID, EQ_DIM], [EQ_ID, SPECIAL_DIM], [BOS_ID, SPECIAL_DIM], [PLUS_ID, SPECIAL_DIM]]
        emb_val = [float(i * digit_scale) for i in range(1, 10)] + [1.0, 1.0, 1.0, 1.0]
        emb = torch.sparse_coo_tensor(
            torch.tensor(emb_idx).T,
            torch.tensor(emb_val, dtype=d),
            (VOCAB_SIZE, EMBEDDING_DIM),
        ).to_dense()
        self.register_buffer("embedding", emb)
        # fixed scalar (your original)
        self.register_buffer("v0_w3_fixed", torch.tensor(1.0, dtype=d))

    @torch.inference_mode()
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: [B, T] token ids
        returns logits: [B, T, VOCAB_SIZE]
        """
        B, T = x.shape
        device = x.device
        d = torch.float64
        # Derived (from the 5 parameters)
        BASE = self.BASE
        DIGIT_SCALE = BASE  # NOTE(review): assigned but not read below
        FINAL_SCALE = BASE ** 2
        V_SHAPE_SCALE = BASE ** 4
        PLACE_SCALE = BASE ** NUM_DIGITS # BASE^10
        V_W2 = -(BASE + 1) * self.V_W1
        V_BIAS_SHIFT = BASE * (1 + self.DIGIT_OFFSET)
        K_SPECIAL_SCORE = self.K_WEIGHT + self.K_BIAS # scalar
        # Pre-divides the L0 value weights by the attention weight a special
        # token receives, so the attended output lands at the intended scale.
        V_PROJ_SCALE = torch.exp(K_SPECIAL_SCORE - torch.tensor(log(10), dtype=d, device=device))
        # projections (scaled)
        k0_weight = self.K_WEIGHT
        k0_bias = self.K_BIAS
        v0_w1 = self.V_W1 / V_PROJ_SCALE
        v0_w2 = V_W2 / V_PROJ_SCALE
        v0_w3 = self.v0_w3_fixed
        # L0 FFN up vals
        # Per-digit place-value multipliers plus one trailing PLACE_SCALE slot.
        pv = [(i + float(self.DIGIT_OFFSET.item())) * float(PLACE_SCALE.item()) * float(FINAL_SCALE.item())
              for i in range(NUM_DIGITS)]
        up0_vals = torch.tensor(pv + [float(PLACE_SCALE.item())], dtype=d, device=device) # [11]
        # === Embed ===
        h = self.embedding[x].to(dtype=d) # [B, T, 5]
        h = pad_to(h, EMBEDDING_DIM)
        # === LAYER 0 ATTENTION (causal) ===
        # Each head is 1-dimensional: queries are all-ones, so scores depend
        # only on keys (plus ALiBi); only ADJUSTMENT_HEAD has nonzero keys.
        q = torch.ones(B, T, LAYER0_HEADS, dtype=d, device=device)
        k = torch.zeros(B, T, LAYER0_HEADS, dtype=d, device=device)
        k[..., ADJUSTMENT_HEAD] = h[..., SPECIAL_DIM] * k0_weight + k0_bias
        v = torch.zeros(B, T, LAYER0_HEADS, dtype=d, device=device)
        v[..., ADJUSTMENT_HEAD] = h[..., SPECIAL_DIM] * v0_w1 + h[..., EQ_DIM] * v0_w2
        v[..., SCALE_HEAD] = h[..., EQ_DIM] * v0_w3
        q = q.view(B, T, LAYER0_HEADS, 1).transpose(1, 2) # [B, H, T, 1]
        k = k.view(B, T, LAYER0_HEADS, 1).transpose(1, 2)
        v = v.view(B, T, LAYER0_HEADS, 1).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) # [B, H, T, T]
        scores = scores + apply_alibi(T, LAYER0_HEADS, device=device).unsqueeze(0)
        # Strict upper triangle masked: each position attends to itself and earlier.
        causal = torch.triu(torch.ones(T, T, device=device), 1).bool()
        scores = scores.masked_fill(causal, float("-inf"))
        attn = softmax1(scores, dim=-1).double()
        # Heads concatenate back into the 5 channels (one channel per head);
        # residual add onto the embedding stream.
        h = h + torch.matmul(attn, v).transpose(1, 2).contiguous().view(B, T, -1)
        # === L0 FFN ===
        # Gated FFN: SCALE_DIM gates the 10 digit slots, DIGIT_DIM gates the 11th.
        gate_in = torch.zeros(B, T, 11, dtype=d, device=device)
        gate_in[..., :NUM_DIGITS] = h[..., SCALE_DIM:SCALE_DIM + 1]
        gate_in[..., NUM_DIGITS] = h[..., DIGIT_DIM]
        gate_out = F.relu(gate_in)
        up_out = h[..., COUNT_DIM:COUNT_DIM + 1] * up0_vals # [B,T,11]
        ffn_hidden = gate_out * up_out
        # Widen 5 -> 16 and write the 11 FFN outputs into channels 5..15.
        h = pad_to(h, LAYER1_D_MODEL)
        h[..., 5:16] = h[..., 5:16] + ffn_hidden
        # === LAYER 1 ATTENTION ===
        # q and k are all-zero, so every unmasked score is 0 and softmax1
        # spreads weight uniformly (with the +1 "null" slot) over the prefix.
        q = torch.zeros(B, T, 1, dtype=d, device=device)
        k = torch.zeros(B, T, 1, dtype=d, device=device)
        v_weight = torch.zeros(LAYER1_D_MODEL, dtype=d, device=device)
        v_weight[DIGIT_POS_DIM] = FINAL_SCALE
        v = (h * v_weight).sum(dim=-1, keepdim=True) + V_BIAS_SHIFT # [B,T,1]
        q = q.view(B, T, 1, 1).transpose(1, 2)
        k = k.view(B, T, 1, 1).transpose(1, 2)
        v = v.view(B, T, 1, 1).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1))
        scores = scores.masked_fill(causal, float("-inf"))
        attn = softmax1(scores, dim=-1).double()
        # [B,T,1] attention output broadcasts across all 16 channels on add.
        h = h + torch.matmul(attn, v).transpose(1, 2).contiguous().view(B, T, -1)
        # === L1 FFN "V-shape" ===
        # relu(c*s) + relu(-c*s) = |c|*s: an absolute-value nonlinearity, so
        # the candidate channel closest to zero stays smallest.
        candidates = h[..., CANDIDATES_START:CANDIDATES_START + NUM_DIGITS]
        gate_pos = F.relu(candidates * V_SHAPE_SCALE)
        gate_neg = F.relu(candidates * -V_SHAPE_SCALE)
        ffn_out = (gate_pos + gate_neg) * FINAL_SCALE
        h = pad_to(h, NUM_DIGITS)
        h = h + ffn_out # [B,T,10] where argmin used to be the chosen digit
        # === Convert to logits over vocab ===
        # h has shape [B, T, 10] where the correct digit has the MINIMUM value.
        # We need logits where the correct digit has the MAXIMUM value.
        #
        # Parabolic decode: logits[d] = -scale * (h_d - 0)^2
        # Since h[correct_digit] ≈ 0 and h[wrong_digit] >> 0,
        # -h^2 gives max at correct digit.
        # Normalize: divide by max magnitude so logits are in reasonable range
        h_abs = h.abs()
        scale = h_abs.max(dim=-1, keepdim=True).values.clamp(min=1.0)
        digit_logits = -(h / scale) ** 2 * 100 # Normalized parabolic, correct digit ≈ 0
        # Non-digit vocabulary entries get a -1e9 floor so argmax always
        # lands on a digit id (0..9).
        logits = torch.full((B, T, VOCAB_SIZE), -1e9, dtype=torch.float64, device=device)
        logits[..., 0:10] = digit_logits
        return logits
def decode_greedy(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, eos_token_id: int | None = None):
"""
Generic autoregressive decoding:
- no addition-specific logic
- works for any causal LM that returns logits [B,T,V]
"""
x = input_ids
for _ in range(max_new_tokens):
logits = model(x) # [B,T,V]
next_id = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True) # [B,1]
x = torch.cat([x, next_id.to(dtype=torch.long)], dim=1)
if eos_token_id is not None and torch.all(next_id.squeeze(-1) == eos_token_id):
break
return x
def add(model: nn.Module, a: int, b: int) -> int:
    """Compute a + b by tokenizing "a+b=" and greedily decoding 11 digits."""
    # fixed formatting is allowed; it doesn't encode the answer
    prompt = f"{a:010d}+{b:010d}="
    specials = {"+": PLUS_ID, "=": EQ_ID}
    token_ids = [BOS_ID]
    token_ids.extend(specials[ch] if ch in specials else int(ch) for ch in prompt)
    x = torch.tensor([token_ids], dtype=torch.long)
    # For 0..9,999,999,999 + 0..9,999,999,999 => max 11 digits
    generated = decode_greedy(model, x, max_new_tokens=11, eos_token_id=None)
    answer_ids = generated[0, -11:].tolist()
    return int("".join(str(t) for t in answer_ids))
def build_model():
    """AdderBoard API: returns (model, metadata_dict)."""
    trick_notes = [
        "5 unique scalar params: BASE=10, K_WEIGHT=960, K_BIAS=-1000, V_W1=0.1, DIGIT_OFFSET=0.5",
        "All weights derived from 5 params (embedding, projections, FFN all computed)",
        "ALiBi with slope=log(10) for base-10 positional weighting",
        "Sparse embedding (13 nonzero entries from BASE×digit)",
        "softmax1 (+1 denominator) for selective attention gating",
        "Gated ReLU FFN with positional place-value scaling",
        "V-shape ReLU (|x|) for digit discrimination in L1 FFN",
        "Parabolic logit decode: -(h/scale)²×100, correct digit ≈ 0 → max logit",
        "float64 throughout for numerical stability",
    ]
    info = {
        "name": "TinyAdder-5",
        "author": "JagNL",
        "params": 5,
        "architecture": "2L decoder, d=5→16, 5h+1h, ALiBi slope=log(10)",
        "tricks": trick_notes,
    }
    return TinyAdder5LM().eval(), info
if __name__ == "__main__":
    import random, time

    adder, info = build_model()
    print(f"Model: {info['name']}")
    print(f"Author: {info['author']}")
    print(f"Parameters (unique): {info['params']}")
    print(f"Architecture: {info['architecture']}")
    print(f"Tricks: {', '.join(info['tricks'][:3])}...")
    print()
    # Quick eyeball checks on hand-picked sums before the big sweep.
    print("Sanity checks:")
    sanity_pairs = [
        (0, 0), (1, 1), (5, 7), (99, 1), (999, 1), (9999999999, 1),
        (5555555555, 5555555555), (1234567890, 9876543210),
        (9999999999, 9999999999),
    ]
    for x, y in sanity_pairs:
        got = add(adder, x, y)
        want = x + y
        print(("✓" if got == want else "✗"), x, "+", y, "=", got, "(expected", want, ")")
    print("\nFull verification (10K random, seed=2025)...")
    # Seeded RNG: same a-then-b call order keeps the case list reproducible.
    rng = random.Random(2025)
    edge_cases = [
        (0, 0), (0, 1), (9999999999, 0), (9999999999, 1), (9999999999, 9999999999),
        (5000000000, 5000000000), (1111111111, 8888888889),
        (1234567890, 9876543210), (9999999999, 9999999999), (1, 9999999999),
    ]
    all_cases = list(edge_cases)
    for _ in range(10000):
        all_cases.append((rng.randint(0, 9999999999), rng.randint(0, 9999999999)))
    correct = 0
    start = time.time()
    for idx, (x, y) in enumerate(all_cases, start=1):
        if add(adder, x, y) == x + y:
            correct += 1
        if idx % 2000 == 0:
            print(f" Progress: {idx}/{len(all_cases)} ({correct}/{idx} correct)")
    elapsed = time.time() - start
    acc = correct / len(all_cases) * 100
    print(f"\nResults: {correct}/{len(all_cases)} ({acc:.2f}%)")
    print(f"Time: {elapsed:.1f}s")
    print(f"Status: {'QUALIFIED ✓' if acc >= 99 else 'NOT QUALIFIED ✗'}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment