@mystix
Forked from karpathy/microgpt.py
Created February 12, 2026 15:39

Revisions

  1. @karpathy revised this gist Feb 12, 2026. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion microgpt.py
    @@ -152,7 +152,7 @@ def gpt(token_id, pos_id, keys, values):
    num_steps = 500 # number of training steps
    for step in range(num_steps):

    # Take single document, tokenize it, surround it with BOS special token (token id 0) on both sides
    # Take single document, tokenize it, surround it with BOS special token on both sides
    doc = docs[step % len(docs)]
    tokens = [BOS] + [uchars.index(ch) for ch in doc] + [BOS]
    n = min(block_size, len(tokens) - 1)
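For reference, a toy sketch of what that tokenization line produces, assuming the uchars/BOS scheme shown in the next revision's diff (a made-up two-name dataset, not part of the gist):

    docs = ["emma", "olivia"]                              # toy stand-in for the names dataset
    uchars = sorted(set(''.join(docs)))                    # ['a', 'e', 'i', 'l', 'm', 'o', 'v'] -> ids 0..6
    BOS = len(uchars)                                      # the BOS special token takes the last id, 7
    tokens = [BOS] + [uchars.index(ch) for ch in docs[0]] + [BOS]
    print(tokens)                                          # [7, 1, 4, 4, 0, 7] for "emma"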
  2. @karpathy revised this gist Feb 12, 2026. 1 changed file with 35 additions and 78 deletions.
    113 changes: 35 additions & 78 deletions microgpt.py
    @@ -9,9 +9,7 @@
    import os # os.path.exists
    import math # math.log, math.exp
    import random # random.seed, random.choices, random.gauss, random.shuffle

    # Let there be order among chaos
    random.seed(42)
    random.seed(42) # Let there be order among chaos

    # Let there be an input dataset `docs`: list[str] of documents (e.g. a dataset of names)
    if not os.path.exists('input.txt'):
    @@ -23,96 +21,55 @@
    print(f"num docs: {len(docs)}")

    # Let there be a Tokenizer to translate strings to discrete symbols and back
    chars = ['<BOS>'] + sorted(set(''.join(docs))) # character-level tokenizer with a BOS delimiter
    vocab_size = len(chars)
    stoi = { ch:i for i, ch in enumerate(chars) } # encoding: map string to integer
    itos = { i:ch for i, ch in enumerate(chars) } # decoding: map integer to string
    BOS = stoi['<BOS>']
    uchars = sorted(set(''.join(docs))) # unique characters in the dataset become token ids 0..n-1
    BOS = len(uchars) # token id for the special Beginning of Sequence (BOS) token
    vocab_size = len(uchars) + 1 # total number of unique tokens, +1 is for BOS
    print(f"vocab size: {vocab_size}")

    # Let there be an Autograd to apply the chain rule recursively across a computation graph and so
    # calculate the gradients of the loss with respect to model parameters.
    # Let there be an Autograd to apply the chain rule recursively across a computation graph
    class Value:
    """Stores a single scalar value and its gradient."""
    """Stores a single scalar value and its gradient, as a node in a computation graph."""

    def __init__(self, data, _children=(), _op=''):
    self.data = data
    self.grad = 0
    self._backward = lambda: None
    self._prev = set(_children)
    self._op = _op # the op that produced this node, for graphviz / debugging / etc
    def __init__(self, data, children=(), local_grads=()):
    self.data = data # scalar value of this node calculated during forward pass
    self.grad = 0 # derivative of the loss w.r.t. this node, calculated in backward pass
    self._children = children # children of this node in the computation graph
    self._local_grads = local_grads # local derivative of this node w.r.t. its children

    def __add__(self, other):
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data + other.data, (self, other), '+')
    def _backward():
    self.grad += out.grad
    other.grad += out.grad
    out._backward = _backward
    return out
    return Value(self.data + other.data, (self, other), (1, 1))

    def __mul__(self, other):
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data * other.data, (self, other), '*')
    def _backward():
    self.grad += other.data * out.grad
    other.grad += self.data * out.grad
    out._backward = _backward
    return out

    def __pow__(self, other):
    assert isinstance(other, (int, float)), "only supporting int/float powers for now"
    out = Value(self.data**other, (self,), f'**{other}')
    def _backward():
    self.grad += (other * self.data**(other-1)) * out.grad
    out._backward = _backward
    return out

    def log(self):
    out = Value(math.log(self.data), (self,), 'log')
    def _backward():
    self.grad += (1 / self.data) * out.grad
    out._backward = _backward
    return out

    def exp(self):
    out = Value(math.exp(self.data), (self,), 'exp')
    def _backward():
    self.grad += out.data * out.grad
    out._backward = _backward
    return out

    def relu(self):
    out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')
    def _backward():
    self.grad += (out.data > 0) * out.grad
    out._backward = _backward
    return out
    return Value(self.data * other.data, (self, other), (other.data, self.data))

    def __pow__(self, other): return Value(self.data**other, (self,), (other * self.data**(other-1),))
    def log(self): return Value(math.log(self.data), (self,), (1/self.data,))
    def exp(self): return Value(math.exp(self.data), (self,), (math.exp(self.data),))
    def relu(self): return Value(max(0, self.data), (self,), (float(self.data > 0),))
    def __neg__(self): return self * -1
    def __radd__(self, other): return self + other
    def __sub__(self, other): return self + (-other)
    def __rsub__(self, other): return other + (-self)
    def __rmul__(self, other): return self * other
    def __truediv__(self, other): return self * other**-1
    def __rtruediv__(self, other): return other * self**-1

    def backward(self):
    # topological order all of the children in the graph
    topo = []
    visited = set()
    def build_topo(v):
    if v not in visited:
    visited.add(v)
    for child in v._prev:
    for child in v._children:
    build_topo(child)
    topo.append(v)
    build_topo(self)
    # go one variable at a time and apply the chain rule to get its gradient
    self.grad = 1
    for v in reversed(topo):
    v._backward()

    def __neg__(self): return self * -1
    def __radd__(self, other): return self + other
    def __sub__(self, other): return self + (-other)
    def __rsub__(self, other): return other + (-self)
    def __rmul__(self, other): return self * other
    def __truediv__(self, other): return self * other**-1
    def __rtruediv__(self, other): return other * self**-1
    def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"
    for child, local_grad in zip(v._children, v._local_grads):
    child.grad += local_grad * v.grad

    # Initialize the parameters, to store the knowledge of the model.
    n_embd = 16 # embedding dimension
    @@ -195,9 +152,9 @@ def gpt(token_id, pos_id, keys, values):
    num_steps = 500 # number of training steps
    for step in range(num_steps):

    # Take single document, tokenize it, surround it with BOS special token on both sides
    # Take single document, tokenize it, surround it with BOS special token (token id 0) on both sides
    doc = docs[step % len(docs)]
    tokens = [BOS] + [stoi[ch] for ch in doc] + [BOS]
    tokens = [BOS] + [uchars.index(ch) for ch in doc] + [BOS]
    n = min(block_size, len(tokens) - 1)

    # Forward the token sequence through the model, building up the computation graph all the way to the loss.
    @@ -215,7 +172,7 @@ def gpt(token_id, pos_id, keys, values):
    loss.backward()

    # Adam optimizer update: update the model parameters based on the corresponding gradients.
    lr_t = learning_rate * (1 - step / num_steps)
    lr_t = learning_rate * 0.5 * (1 + math.cos(math.pi * step / num_steps)) # cosine learning rate decay
    for i, p in enumerate(params):
    m[i] = beta1 * m[i] + (1 - beta1) * p.grad
    v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2
    @@ -227,17 +184,17 @@ def gpt(token_id, pos_id, keys, values):
    print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}")

    # Inference: may the model babble back to us
    temperature = 0.6 # in (0, 1], control the "creativity" of generated text, low to high
    temperature = 0.5 # in (0, 1], control the "creativity" of generated text, low to high
    print("\n--- inference ---")
    for sample_idx in range(20):
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    token_id = BOS
    print(f"sample {sample_idx+1}: ", end="")
    sample = []
    for pos_id in range(block_size):
    logits = gpt(token_id, pos_id, keys, values)
    probs = softmax([l / temperature for l in logits])
    token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0]
    if token_id == BOS:
    break
    print(itos[token_id], end="")
    print()
    sample.append(uchars[token_id])
    print(f"sample {sample_idx+1:2d}: {''.join(sample)}")
  3. @karpathy revised this gist Feb 12, 2026. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion microgpt.py
    @@ -132,7 +132,7 @@ def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"
    params = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value]
    print(f"num params: {len(params)}")

    # Define the model architecture, a stateless function token streams and model parameters to logits over what comes next.
    # Define the model architecture: a stateless function mapping token sequence and parameters to logits over what comes next.
    # Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU^2
    def linear(x, w):
    return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]
  4. @karpathy revised this gist Feb 12, 2026. 1 changed file with 3 additions and 2 deletions.
    5 changes: 3 additions & 2 deletions microgpt.py
    @@ -16,7 +16,8 @@
    # Let there be an input dataset `docs`: list[str] of documents (e.g. a dataset of names)
    if not os.path.exists('input.txt'):
    import urllib.request
    urllib.request.urlretrieve('https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt', 'input.txt')
    names_url = 'https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt'
    urllib.request.urlretrieve(names_url, 'input.txt')
    docs = [l.strip() for l in open('input.txt').read().strip().split('\n') if l.strip()] # list[str] of documents
    random.shuffle(docs)
    print(f"num docs: {len(docs)}")
    @@ -132,7 +133,7 @@ def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"
    print(f"num params: {len(params)}")

    # Define the model architecture, a stateless function token streams and model parameters to logits over what comes next.
    # Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU^2, no weight tying
    # Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU^2
    def linear(x, w):
    return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]

  5. @karpathy revised this gist Feb 12, 2026. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion microgpt.py
    @@ -1,6 +1,7 @@
    """
    The most atomic way to train and inference a GPT in pure, dependency-free Python.
    This file is the complete algorithm. Everything else is just efficiency.
    This file is the complete algorithm.
    Everything else is just efficiency.
    @karpathy
    """
  6. @karpathy revised this gist Feb 12, 2026. 1 changed file with 44 additions and 51 deletions.
    95 changes: 44 additions & 51 deletions microgpt.py
    @@ -1,47 +1,37 @@
    """
    The most atomic way to train and inference a GPT LLM in pure, dependency-free Python.
    Differences from GPT-2 are minor: layer norm -> rmsnorm, no biases, GeLU -> square ReLU, no weight tying.
    The contents of this file is everything algorithmically needed to train a GPT. Everything else is just efficiency.
    Art project by @karpathy.
    The most atomic way to train and inference a GPT in pure, dependency-free Python.
    This file is the complete algorithm. Everything else is just efficiency.
    @karpathy
    """

    import os # for os.path.exists
    import time # for time.perf_counter
    import math # for math.log, math.exp
    import random # for random.seed, random.choices
    import argparse # for argparse.ArgumentParser

    # CLI arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--n-embd', type=int, default=16, help='Number of channels in the Transformer')
    parser.add_argument('--n-layer', type=int, default=1, help='Number of layers in the Transformer')
    parser.add_argument('--block-size', type=int, default=8, help='Maximum sequence length')
    parser.add_argument('--num-steps', type=int, default=500, help='Number of training steps')
    parser.add_argument('--n-head', type=int, default=4, help='Number of attention heads in the Transformer')
    parser.add_argument('--learning-rate', type=float, default=1e-2, help='Learning rate')
    args = parser.parse_args()
    n_embd, block_size, n_layer, n_head = args.n_embd, args.block_size, args.n_layer, args.n_head
    head_dim = n_embd // n_head
    import os # os.path.exists
    import math # math.log, math.exp
    import random # random.seed, random.choices, random.gauss, random.shuffle

    # Let there be order among chaos
    random.seed(42)

    # Dataset example: the names dataset (one name per line). rest of the code just assumes docs: list[str]
    # Let there be an input dataset `docs`: list[str] of documents (e.g. a dataset of names)
    if not os.path.exists('input.txt'):
    import urllib.request
    urllib.request.urlretrieve('https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt', 'input.txt')
    docs = [l.strip() for l in open('input.txt').read().strip().split('\n') if l.strip()] # list[str] of documents
    random.shuffle(docs)
    print(f"num docs: {len(docs)}")

    # Tokenizer: simple character-level tokenization with a BOS token delimiter
    chars = ['<BOS>'] + sorted(set(''.join(docs)))
    # Let there be a Tokenizer to translate strings to discrete symbols and back
    chars = ['<BOS>'] + sorted(set(''.join(docs))) # character-level tokenizer with a BOS delimiter
    vocab_size = len(chars)
    stoi = { ch:i for i, ch in enumerate(chars) } # string to integer
    itos = { i:ch for i, ch in enumerate(chars) } # integer to string
    stoi = { ch:i for i, ch in enumerate(chars) } # encoding: map string to integer
    itos = { i:ch for i, ch in enumerate(chars) } # decoding: map integer to string
    BOS = stoi['<BOS>']
    print(f"vocab size: {vocab_size}, num docs: {len(docs)}")
    print(f"vocab size: {vocab_size}")

    # Autograd engine
    # Let there be an Autograd to apply the chain rule recursively across a computation graph and so
    # calculate the gradients of the loss with respect to model parameters.
    class Value:
    """ stores a single scalar value and its gradient """
    """Stores a single scalar value and its gradient."""

    def __init__(self, data, _children=(), _op=''):
    self.data = data
    @@ -122,7 +112,12 @@ def __truediv__(self, other): return self * other**-1
    def __rtruediv__(self, other): return other * self**-1
    def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"

    # Model parameter initialization
    # Initialize the parameters, to store the knowledge of the model.
    n_embd = 16 # embedding dimension
    n_head = 4 # number of attention heads
    n_layer = 1 # number of layers
    block_size = 8 # maximum sequence length
    head_dim = n_embd // n_head # dimension of each head
    matrix = lambda nout, nin, std=0.02: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)]
    state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd), 'lm_head': matrix(vocab_size, n_embd)}
    for i in range(n_layer):
    @@ -135,7 +130,8 @@ def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"
    params = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value]
    print(f"num params: {len(params)}")

    # Model architecture
    # Define the model architecture, a stateless function token streams and model parameters to logits over what comes next.
    # Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU^2, no weight tying
    def linear(x, w):
    return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]

    @@ -188,23 +184,21 @@ def gpt(token_id, pos_id, keys, values):
    logits = linear(x, state_dict['lm_head'])
    return logits

    # Adam optimizer
    learning_rate = args.learning_rate
    beta1, beta2, eps_adam = 0.9, 0.95, 1e-8
    m = [0.0] * len(params) # first moment
    v = [0.0] * len(params) # second moment
    # Let there be Adam, the blessed optimizer and its buffers
    learning_rate, beta1, beta2, eps_adam = 1e-2, 0.9, 0.95, 1e-8
    m = [0.0] * len(params) # first moment buffer
    v = [0.0] * len(params) # second moment buffer

    # Training loop
    lossf_history = []
    t_start = time.perf_counter()
    for step in range(args.num_steps):
    # Repeat in sequence
    num_steps = 500 # number of training steps
    for step in range(num_steps):

    # Take a single training document, tokenize it, surround it with BOS special token on both sides
    # Take single document, tokenize it, surround it with BOS special token on both sides
    doc = docs[step % len(docs)]
    tokens = [BOS] + [stoi[ch] for ch in doc] + [BOS]
    n = min(block_size, len(tokens) - 1)

    # Forward/backward through the document over time dimension
    # Forward the token sequence through the model, building up the computation graph all the way to the loss.
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    losses = []
    for pos_id in range(n):
    @@ -213,11 +207,13 @@ def gpt(token_id, pos_id, keys, values):
    probs = softmax(logits)
    loss_t = -probs[target_id].log()
    losses.append(loss_t)
    loss = (1 / n) * sum(losses) # average loss over the sequence
    loss = (1 / n) * sum(losses) # final average loss over the document sequence. May yours be low.

    # Backward the loss, calculating the gradients with respect to all model parameters.
    loss.backward()

    # Adam update (optimizer)
    lr_t = learning_rate * (1 - step / args.num_steps)
    # Adam optimizer update: update the model parameters based on the corresponding gradients.
    lr_t = learning_rate * (1 - step / num_steps)
    for i, p in enumerate(params):
    m[i] = beta1 * m[i] + (1 - beta1) * p.grad
    v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2
    @@ -226,13 +222,10 @@ def gpt(token_id, pos_id, keys, values):
    p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
    p.grad = 0

    lossf_history.append(loss.data)
    print(f"step {step+1:4d} / {args.num_steps:4d} | loss {loss.data:.4f}")
    print(f"mean loss last 50 steps: {sum(lossf_history[-50:]) / len(lossf_history[-50:]):.4f}") # ~usable for basic kwarg tuning
    print(f"training time: {time.perf_counter() - t_start:.2f}s") # ~usable for basic performance benchmarking
    print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}")

    # Inference: generate 5 samples
    temperature = 0.5 # number in (0, 1] that controls the "creativity" of generated text, low to high
    # Inference: may the model babble back to us
    temperature = 0.6 # in (0, 1], control the "creativity" of generated text, low to high
    print("\n--- inference ---")
    for sample_idx in range(20):
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
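For intuition about the optimizer block above, a standalone toy run of the same Adam update with the linear learning-rate decay this revision uses (one made-up scalar parameter with loss p**2, purely illustrative and not part of the gist):

    learning_rate, beta1, beta2, eps_adam = 1e-2, 0.9, 0.95, 1e-8
    num_steps = 500
    p, m, v = 5.0, 0.0, 0.0                                # one parameter and its two moment buffers
    for step in range(num_steps):
        grad = 2 * p                                       # gradient of the toy loss p**2
        lr_t = learning_rate * (1 - step / num_steps)      # linear learning rate decay
        m = beta1 * m + (1 - beta1) * grad                 # first moment: running mean of gradients
        v = beta2 * v + (1 - beta2) * grad ** 2            # second moment: running mean of squared gradients
        m_hat = m / (1 - beta1 ** (step + 1))              # bias correction for the zero-initialized buffers
        v_hat = v / (1 - beta2 ** (step + 1))
        p -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
    print(p)                                               # p has moved from 5.0 toward the minimum at 0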
  7. @karpathy revised this gist Feb 11, 2026. 1 changed file with 15 additions and 13 deletions.
    28 changes: 15 additions & 13 deletions microgpt.py
    @@ -1,23 +1,24 @@
    """
    The most atomic way to train and inference a GPT LLM in pure, dependency-free Python.
    Differences from GPT-2 are minor: rmsnorm instead of layer norm, no biases, square ReLU instead of GeLU nonlinearity, no weight tying.
    Differences from GPT-2 are minor: layer norm -> rmsnorm, no biases, GeLU -> square ReLU, no weight tying.
    The contents of this file is everything algorithmically needed to train a GPT. Everything else is just efficiency.
    Art project by @karpathy.
    """

    import os # for os.path.exists
    import time # for time.perf_counter
    import math # for math.log, math.exp
    import random # for random.seed, random.choices
    import argparse # for argparse.ArgumentParser

    # CLI arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_embd', type=int, default=16, help='Number of channels in the Transformer')
    parser.add_argument('--n_layer', type=int, default=1, help='Number of layers in the Transformer')
    parser.add_argument('--block_size', type=int, default=8, help='Maximum sequence length')
    parser.add_argument('--num_steps', type=int, default=1000, help='Number of training steps')
    parser.add_argument('--n_head', type=int, default=4, help='Number of attention heads in the Transformer')
    parser.add_argument('--learning_rate', type=float, default=1e-2, help='Learning rate')
    parser.add_argument('--n-embd', type=int, default=16, help='Number of channels in the Transformer')
    parser.add_argument('--n-layer', type=int, default=1, help='Number of layers in the Transformer')
    parser.add_argument('--block-size', type=int, default=8, help='Maximum sequence length')
    parser.add_argument('--num-steps', type=int, default=500, help='Number of training steps')
    parser.add_argument('--n-head', type=int, default=4, help='Number of attention heads in the Transformer')
    parser.add_argument('--learning-rate', type=float, default=1e-2, help='Learning rate')
    args = parser.parse_args()
    n_embd, block_size, n_layer, n_head = args.n_embd, args.block_size, args.n_layer, args.n_head
    head_dim = n_embd // n_head
    @@ -195,6 +196,7 @@ def gpt(token_id, pos_id, keys, values):

    # Training loop
    lossf_history = []
    t_start = time.perf_counter()
    for step in range(args.num_steps):

    # Take a single training document, tokenize it, surround it with BOS special token on both sides
    @@ -224,16 +226,18 @@ def gpt(token_id, pos_id, keys, values):
    p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
    p.grad = 0

    print(f"step {step+1} / {args.num_steps} | loss {loss.data:.4f}")
    lossf_history.append(loss.data)
    print(f"step {step+1:4d} / {args.num_steps:4d} | loss {loss.data:.4f}")
    print(f"mean loss last 50 steps: {sum(lossf_history[-50:]) / len(lossf_history[-50:]):.4f}") # ~usable for basic kwarg tuning
    print(f"training time: {time.perf_counter() - t_start:.2f}s") # ~usable for basic performance benchmarking

    # Inference: generate 5 samples
    temperature = 0.5 # number in (0, 1] that controls the "creativity" of generated text, low to high
    print("\n--- generation ---")
    for sample_idx in range(5):
    print("\n--- inference ---")
    for sample_idx in range(20):
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    token_id = BOS
    print(f"sample {sample_idx}: ", end="")
    print(f"sample {sample_idx+1}: ", end="")
    for pos_id in range(block_size):
    logits = gpt(token_id, pos_id, keys, values)
    probs = softmax([l / temperature for l in logits])
    @@ -242,5 +246,3 @@ def gpt(token_id, pos_id, keys, values):
    break
    print(itos[token_id], end="")
    print()

    print(f"mean loss last 50 steps: {sum(lossf_history[-50:]) / len(lossf_history[-50:]):.4f}")
  8. @karpathy revised this gist Feb 11, 2026. No changes.
  9. @karpathy revised this gist Feb 11, 2026. 1 changed file with 39 additions and 36 deletions.
    75 changes: 39 additions & 36 deletions microgpt.py
    @@ -1,6 +1,6 @@
    """
    The most atomic way to train and inference a GPT LLM in pure, dependency-free Python.
    Differences from GPT-2 are minor: rmsnorm instead of layer norm, no biases, square ReLU instead of GeLU nonlinearity.
    Differences from GPT-2 are minor: rmsnorm instead of layer norm, no biases, square ReLU instead of GeLU nonlinearity, no weight tying.
    The contents of this file is everything algorithmically needed to train a GPT. Everything else is just efficiency.
    Art project by @karpathy.
    """
    @@ -18,27 +18,24 @@
    parser.add_argument('--num_steps', type=int, default=1000, help='Number of training steps')
    parser.add_argument('--n_head', type=int, default=4, help='Number of attention heads in the Transformer')
    parser.add_argument('--learning_rate', type=float, default=1e-2, help='Learning rate')
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    args = parser.parse_args()
    random.seed(args.seed)
    n_embd, block_size, n_layer, n_head = args.n_embd, args.block_size, args.n_layer, args.n_head
    head_dim = n_embd // n_head
    random.seed(42)

    # Dataset example: the names dataset (one name per line). rest of the code just assumes docs: list[str]
    if not os.path.exists('input.txt'):
    import urllib.request
    urllib.request.urlretrieve('https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt', 'input.txt')
    with open('input.txt', 'r') as file:
    text = file.read()
    docs = [line.strip() for line in text.strip().split('\n') if line.strip()]
    docs = [l.strip() for l in open('input.txt').read().strip().split('\n') if l.strip()] # list[str] of documents
    random.shuffle(docs)

    # Tokenizer: simple character-level tokenization with BOS/EOS tokens
    chars = ['<BOS>', '<EOS>'] + sorted(list(set(''.join(docs))))
    # Tokenizer: simple character-level tokenization with a BOS token delimiter
    chars = ['<BOS>'] + sorted(set(''.join(docs)))
    vocab_size = len(chars)
    stoi = { ch:i for i, ch in enumerate(chars) } # string to integer
    itos = { i:ch for i, ch in enumerate(chars) } # integer to string
    BOS, EOS = stoi['<BOS>'], stoi['<EOS>']
    BOS = stoi['<BOS>']
    print(f"vocab size: {vocab_size}, num docs: {len(docs)}")

    # Autograd engine
    @@ -126,24 +123,24 @@ def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"

    # Model parameter initialization
    matrix = lambda nout, nin, std=0.02: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)]
    state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd)}
    state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd), 'lm_head': matrix(vocab_size, n_embd)}
    for i in range(n_layer):
    state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd, std=0)
    state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd)
    state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd, std=0)
    params = [p for mat in state_dict.values() for row in mat for p in row]
    params = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value]
    print(f"num params: {len(params)}")

    # Model architecture
    def linear(x, w):
    return [sum(w[o][i] * x[i] for i in range(len(x))) for o in range(len(w))]
    return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]

    def softmax(logits):
    max_val = max(v.data for v in logits)
    exps = [(v - max_val).exp() for v in logits]
    max_val = max(val.data for val in logits)
    exps = [(val - max_val).exp() for val in logits]
    total = sum(exps)
    return [e / total for e in exps]

    @@ -154,18 +151,19 @@ def rmsnorm(x):

    def gpt(token_id, pos_id, keys, values):
    tok_emb = state_dict['wte'][token_id] # token embedding
    pos_emb = state_dict['wpe'][pos_id % block_size] # position embedding
    pos_emb = state_dict['wpe'][pos_id] # position embedding
    x = [t + p for t, p in zip(tok_emb, pos_emb)] # joint token and position embedding
    x = rmsnorm(x)

    for li in range(n_layer):
    # 1) Multi-head attention block
    x_residual = x
    x = rmsnorm(x)
    q = linear(x, state_dict[f'layer{li}.attn_wq'])
    k = linear(x, state_dict[f'layer{li}.attn_wk'])
    val = linear(x, state_dict[f'layer{li}.attn_wv'])
    v = linear(x, state_dict[f'layer{li}.attn_wv'])
    keys[li].append(k)
    values[li].append(val)
    values[li].append(v)
    x_attn = []
    for h in range(n_head):
    hs = h * head_dim
    @@ -186,8 +184,7 @@ def gpt(token_id, pos_id, keys, values):
    x = linear(x, state_dict[f'layer{li}.mlp_fc2'])
    x = [a + b for a, b in zip(x, x_residual)]

    # project to vocab (weight tying with wte)
    logits = linear(x, state_dict['wte'])
    logits = linear(x, state_dict['lm_head'])
    return logits

    # Adam optimizer
    @@ -197,23 +194,25 @@ def gpt(token_id, pos_id, keys, values):
    v = [0.0] * len(params) # second moment

    # Training loop
    lossf_history = []
    for step in range(args.num_steps):

    # Take a single training document, tokenize it, and crop to block_size
    # Take a single training document, tokenize it, surround it with BOS special token on both sides
    doc = docs[step % len(docs)]
    tokens = [BOS] + [stoi[ch] for ch in doc] + [EOS]
    tokens = tokens[:block_size]
    tokens = [BOS] + [stoi[ch] for ch in doc] + [BOS]
    n = min(block_size, len(tokens) - 1)

    # Forward pass through the document over time dimension
    # Forward/backward through the document over time dimension
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    lossf = 0.0
    for pos_id in range(len(tokens) - 1):
    logits = gpt(tokens[pos_id], pos_id, keys, values)
    losses = []
    for pos_id in range(n):
    token_id, target_id = tokens[pos_id], tokens[pos_id + 1]
    logits = gpt(token_id, pos_id, keys, values)
    probs = softmax(logits)
    loss = -probs[tokens[pos_id + 1]].log()
    loss = (1 / (len(tokens) - 1)) * loss # average over sequence length
    loss.backward()
    lossf += loss.data
    loss_t = -probs[target_id].log()
    losses.append(loss_t)
    loss = (1 / n) * sum(losses) # average loss over the sequence
    loss.backward()

    # Adam update (optimizer)
    lr_t = learning_rate * (1 - step / args.num_steps)
    @@ -225,19 +224,23 @@ def gpt(token_id, pos_id, keys, values):
    p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
    p.grad = 0

    print(f"step {step+1} / {args.num_steps} | loss {lossf:.4f}")
    print(f"step {step+1} / {args.num_steps} | loss {loss.data:.4f}")
    lossf_history.append(loss.data)

    # Inference: generate 5 samples
    temperature = 0.5 # number in (0, 1] that controls the "creativity" of generated text, low to high
    print("\n--- generation ---")
    for sample_idx in range(5):
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    token_id = BOS
    generated = []
    print(f"sample {sample_idx}: ", end="")
    for pos_id in range(block_size):
    logits = gpt(token_id, pos_id, keys, values)
    probs = softmax(logits)
    probs = softmax([l / temperature for l in logits])
    token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0]
    if token_id == EOS:
    if token_id == BOS:
    break
    generated.append(itos[token_id])
    print(f"sample {sample_idx}: {''.join(generated)}")
    print(itos[token_id], end="")
    print()

    print(f"mean loss last 50 steps: {sum(lossf_history[-50:]) / len(lossf_history[-50:]):.4f}")
  10. @karpathy created this gist Feb 11, 2026.
    243 changes: 243 additions & 0 deletions microgpt.py
    @@ -0,0 +1,243 @@
    """
    The most atomic way to train and inference a GPT LLM in pure, dependency-free Python.
    Differences from GPT-2 are minor: rmsnorm instead of layer norm, no biases, square ReLU instead of GeLU nonlinearity.
    The contents of this file is everything algorithmically needed to train a GPT. Everything else is just efficiency.
    Art project by @karpathy.
    """

    import os # for os.path.exists
    import math # for math.log, math.exp
    import random # for random.seed, random.choices
    import argparse # for argparse.ArgumentParser

    # CLI arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_embd', type=int, default=16, help='Number of channels in the Transformer')
    parser.add_argument('--n_layer', type=int, default=1, help='Number of layers in the Transformer')
    parser.add_argument('--block_size', type=int, default=8, help='Maximum sequence length')
    parser.add_argument('--num_steps', type=int, default=1000, help='Number of training steps')
    parser.add_argument('--n_head', type=int, default=4, help='Number of attention heads in the Transformer')
    parser.add_argument('--learning_rate', type=float, default=1e-2, help='Learning rate')
    parser.add_argument('--seed', type=int, default=42, help='Random seed')
    args = parser.parse_args()
    random.seed(args.seed)
    n_embd, block_size, n_layer, n_head = args.n_embd, args.block_size, args.n_layer, args.n_head
    head_dim = n_embd // n_head

    # Dataset example: the names dataset (one name per line). rest of the code just assumes docs: list[str]
    if not os.path.exists('input.txt'):
    import urllib.request
    urllib.request.urlretrieve('https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt', 'input.txt')
    with open('input.txt', 'r') as file:
    text = file.read()
    docs = [line.strip() for line in text.strip().split('\n') if line.strip()]
    random.shuffle(docs)

    # Tokenizer: simple character-level tokenization with BOS/EOS tokens
    chars = ['<BOS>', '<EOS>'] + sorted(list(set(''.join(docs))))
    vocab_size = len(chars)
    stoi = { ch:i for i, ch in enumerate(chars) } # string to integer
    itos = { i:ch for i, ch in enumerate(chars) } # integer to string
    BOS, EOS = stoi['<BOS>'], stoi['<EOS>']
    print(f"vocab size: {vocab_size}, num docs: {len(docs)}")

    # Autograd engine
    class Value:
    """ stores a single scalar value and its gradient """

    def __init__(self, data, _children=(), _op=''):
    self.data = data
    self.grad = 0
    self._backward = lambda: None
    self._prev = set(_children)
    self._op = _op # the op that produced this node, for graphviz / debugging / etc

    def __add__(self, other):
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data + other.data, (self, other), '+')
    def _backward():
    self.grad += out.grad
    other.grad += out.grad
    out._backward = _backward
    return out

    def __mul__(self, other):
    other = other if isinstance(other, Value) else Value(other)
    out = Value(self.data * other.data, (self, other), '*')
    def _backward():
    self.grad += other.data * out.grad
    other.grad += self.data * out.grad
    out._backward = _backward
    return out

    def __pow__(self, other):
    assert isinstance(other, (int, float)), "only supporting int/float powers for now"
    out = Value(self.data**other, (self,), f'**{other}')
    def _backward():
    self.grad += (other * self.data**(other-1)) * out.grad
    out._backward = _backward
    return out

    def log(self):
    out = Value(math.log(self.data), (self,), 'log')
    def _backward():
    self.grad += (1 / self.data) * out.grad
    out._backward = _backward
    return out

    def exp(self):
    out = Value(math.exp(self.data), (self,), 'exp')
    def _backward():
    self.grad += out.data * out.grad
    out._backward = _backward
    return out

    def relu(self):
    out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')
    def _backward():
    self.grad += (out.data > 0) * out.grad
    out._backward = _backward
    return out

    def backward(self):
    # topological order all of the children in the graph
    topo = []
    visited = set()
    def build_topo(v):
    if v not in visited:
    visited.add(v)
    for child in v._prev:
    build_topo(child)
    topo.append(v)
    build_topo(self)
    # go one variable at a time and apply the chain rule to get its gradient
    self.grad = 1
    for v in reversed(topo):
    v._backward()

    def __neg__(self): return self * -1
    def __radd__(self, other): return self + other
    def __sub__(self, other): return self + (-other)
    def __rsub__(self, other): return other + (-self)
    def __rmul__(self, other): return self * other
    def __truediv__(self, other): return self * other**-1
    def __rtruediv__(self, other): return other * self**-1
    def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"

    # Model parameter initialization
    matrix = lambda nout, nin, std=0.02: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)]
    state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd)}
    for i in range(n_layer):
    state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd, std=0)
    state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd)
    state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd, std=0)
    params = [p for mat in state_dict.values() for row in mat for p in row]
    print(f"num params: {len(params)}")

    # Model architecture
    def linear(x, w):
    return [sum(w[o][i] * x[i] for i in range(len(x))) for o in range(len(w))]

    def softmax(logits):
    max_val = max(v.data for v in logits)
    exps = [(v - max_val).exp() for v in logits]
    total = sum(exps)
    return [e / total for e in exps]

    def rmsnorm(x):
    ms = sum(xi * xi for xi in x) / len(x)
    scale = (ms + 1e-5) ** -0.5
    return [xi * scale for xi in x]

    def gpt(token_id, pos_id, keys, values):
    tok_emb = state_dict['wte'][token_id] # token embedding
    pos_emb = state_dict['wpe'][pos_id % block_size] # position embedding
    x = [t + p for t, p in zip(tok_emb, pos_emb)] # joint token and position embedding

    for li in range(n_layer):
    # 1) Multi-head attention block
    x_residual = x
    x = rmsnorm(x)
    q = linear(x, state_dict[f'layer{li}.attn_wq'])
    k = linear(x, state_dict[f'layer{li}.attn_wk'])
    val = linear(x, state_dict[f'layer{li}.attn_wv'])
    keys[li].append(k)
    values[li].append(val)
    x_attn = []
    for h in range(n_head):
    hs = h * head_dim
    q_h = q[hs:hs+head_dim]
    k_h = [ki[hs:hs+head_dim] for ki in keys[li]]
    v_h = [vi[hs:hs+head_dim] for vi in values[li]]
    attn_logits = [sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim**0.5 for t in range(len(k_h))]
    attn_weights = softmax(attn_logits)
    head_out = [sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) for j in range(head_dim)]
    x_attn.extend(head_out)
    x = linear(x_attn, state_dict[f'layer{li}.attn_wo'])
    x = [a + b for a, b in zip(x, x_residual)]
    # 2) MLP block
    x_residual = x
    x = rmsnorm(x)
    x = linear(x, state_dict[f'layer{li}.mlp_fc1'])
    x = [xi.relu() ** 2 for xi in x]
    x = linear(x, state_dict[f'layer{li}.mlp_fc2'])
    x = [a + b for a, b in zip(x, x_residual)]

    # project to vocab (weight tying with wte)
    logits = linear(x, state_dict['wte'])
    return logits

    # Adam optimizer
    learning_rate = args.learning_rate
    beta1, beta2, eps_adam = 0.9, 0.95, 1e-8
    m = [0.0] * len(params) # first moment
    v = [0.0] * len(params) # second moment

    # Training loop
    for step in range(args.num_steps):

    # Take a single training document, tokenize it, and crop to block_size
    doc = docs[step % len(docs)]
    tokens = [BOS] + [stoi[ch] for ch in doc] + [EOS]
    tokens = tokens[:block_size]

    # Forward pass through the document over time dimension
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    lossf = 0.0
    for pos_id in range(len(tokens) - 1):
    logits = gpt(tokens[pos_id], pos_id, keys, values)
    probs = softmax(logits)
    loss = -probs[tokens[pos_id + 1]].log()
    loss = (1 / (len(tokens) - 1)) * loss # average over sequence length
    loss.backward()
    lossf += loss.data

    # Adam update (optimizer)
    lr_t = learning_rate * (1 - step / args.num_steps)
    for i, p in enumerate(params):
    m[i] = beta1 * m[i] + (1 - beta1) * p.grad
    v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2
    m_hat = m[i] / (1 - beta1 ** (step + 1))
    v_hat = v[i] / (1 - beta2 ** (step + 1))
    p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
    p.grad = 0

    print(f"step {step+1} / {args.num_steps} | loss {lossf:.4f}")

    # Inference: generate 5 samples
    print("\n--- generation ---")
    for sample_idx in range(5):
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    token_id = BOS
    generated = []
    for pos_id in range(block_size):
    logits = gpt(token_id, pos_id, keys, values)
    probs = softmax(logits)
    token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0]
    if token_id == EOS:
    break
    generated.append(itos[token_id])
    print(f"sample {sample_idx}: {''.join(generated)}")