Revisions
karpathy revised this gist
Feb 12, 2026. 1 changed file with 1 addition and 1 deletion.
@@ -152,7 +152,7 @@ def gpt(token_id, pos_id, keys, values):
num_steps = 500 # number of training steps
for step in range(num_steps):

    # Take single document, tokenize it, surround it with BOS special token on both sides
    doc = docs[step % len(docs)]
    tokens = [BOS] + [uchars.index(ch) for ch in doc] + [BOS]
    n = min(block_size, len(tokens) - 1)
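For readers following the diffs: the tokenizer this hunk depends on is character-level, with one extra BOS id appended after the character vocabulary. A minimal round-trip sketch, assuming the `uchars`/`BOS` scheme from the later revisions of this file (the two example names are made up, not from the dataset):

# Sketch of the character-level tokenizer used above; names and data are illustrative assumptions.
docs = ["emma", "olivia"]
uchars = sorted(set(''.join(docs)))   # unique characters become token ids 0..n-1
BOS = len(uchars)                     # one extra id for the BOS delimiter

def encode(doc):
    return [BOS] + [uchars.index(ch) for ch in doc] + [BOS]

def decode(tokens):
    return ''.join(uchars[t] for t in tokens if t != BOS)

assert decode(encode("emma")) == "emma"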
karpathy revised this gist
Feb 12, 2026. 1 changed file with 35 additions and 78 deletions.
@@ -9,9 +9,7 @@
import os # os.path.exists
import math # math.log, math.exp
import random # random.seed, random.choices, random.gauss, random.shuffle
random.seed(42) # Let there be order among chaos

# Let there be an input dataset `docs`: list[str] of documents (e.g. a dataset of names)
if not os.path.exists('input.txt'):
@@ -23,96 +21,55 @@
print(f"num docs: {len(docs)}")

# Let there be a Tokenizer to translate strings to discrete symbols and back
uchars = sorted(set(''.join(docs))) # unique characters in the dataset become token ids 0..n-1
BOS = len(uchars) # token id for the special Beginning of Sequence (BOS) token
vocab_size = len(uchars) + 1 # total number of unique tokens, +1 is for BOS
print(f"vocab size: {vocab_size}")

# Let there be an Autograd to apply the chain rule recursively across a computation graph
class Value:
    """Stores a single scalar value and its gradient, as a node in a computation graph."""

    def __init__(self, data, children=(), local_grads=()):
        self.data = data # scalar value of this node calculated during forward pass
        self.grad = 0 # derivative of the loss w.r.t. this node, calculated in backward pass
        self._children = children # children of this node in the computation graph
        self._local_grads = local_grads # local derivative of this node w.r.t. its children

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        return Value(self.data + other.data, (self, other), (1, 1))

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        return Value(self.data * other.data, (self, other), (other.data, self.data))

    def __pow__(self, other):
        return Value(self.data**other, (self,), (other * self.data**(other-1),))

    def log(self):
        return Value(math.log(self.data), (self,), (1/self.data,))

    def exp(self):
        return Value(math.exp(self.data), (self,), (math.exp(self.data),))

    def relu(self):
        return Value(max(0, self.data), (self,), (float(self.data > 0),))

    def __neg__(self): return self * -1
    def __radd__(self, other): return self + other
    def __sub__(self, other): return self + (-other)
    def __rsub__(self, other): return other + (-self)
    def __rmul__(self, other): return self * other
    def __truediv__(self, other): return self * other**-1
    def __rtruediv__(self, other): return other * self**-1

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._children:
                    build_topo(child)
                topo.append(v)
        build_topo(self)
        self.grad = 1
        for v in reversed(topo):
            for child, local_grad in zip(v._children, v._local_grads):
                child.grad += local_grad * v.grad

# Initialize the parameters, to store the knowledge of the model.
n_embd = 16 # embedding dimension
@@ -195,9 +152,9 @@ def gpt(token_id, pos_id, keys, values):
num_steps = 500 # number of training steps
for step in range(num_steps):

    # Take single document, tokenize it, surround it with BOS special token (token id 0) on both sides
    doc = docs[step % len(docs)]
    tokens = [BOS] + [uchars.index(ch) for ch in doc] + [BOS]
    n = min(block_size, len(tokens) - 1)

    # Forward the token sequence through the model, building up the computation graph all the way to the loss.
@@ -215,7 +172,7 @@ def gpt(token_id, pos_id, keys, values):
    loss.backward()

    # Adam optimizer update: update the model parameters based on the corresponding gradients.
    lr_t = learning_rate * 0.5 * (1 + math.cos(math.pi * step / num_steps)) # cosine learning rate decay
    for i, p in enumerate(params):
        m[i] = beta1 * m[i] + (1 - beta1) * p.grad
        v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2
@@ -227,17 +184,17 @@ def gpt(token_id, pos_id, keys, values):
        p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
        p.grad = 0

    print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}")

# Inference: may the model babble back to us
temperature = 0.5 # in (0, 1], control the "creativity" of generated text, low to high
print("\n--- inference ---")
for sample_idx in range(20):
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    token_id = BOS
    sample = []
    for pos_id in range(block_size):
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax([l / temperature for l in logits])
        token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0]
        if token_id == BOS:
            break
        sample.append(uchars[token_id])
    print(f"sample {sample_idx+1:2d}: {''.join(sample)}")
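This revision also replaces the closure-based backward passes with per-node local gradients (`_local_grads`) that `backward()` multiplies and accumulates via the chain rule. A tiny usage sketch of that `Value` class, assuming it exactly as defined in the hunk above (run after the class definition); the numbers are illustrative:

# Usage sketch of the refactored Value autograd: d(loss)/dx and d(loss)/dy for loss = (x*y + 2)**2.
x, y = Value(3.0), Value(-1.0)
loss = (x * y + 2) ** 2
loss.backward()
print(loss.data)  # (3*-1 + 2)^2 = 1.0
print(x.grad)     # 2*(xy+2)*y = 2*(-1)*(-1) = 2.0
print(y.grad)     # 2*(xy+2)*x = 2*(-1)*3 = -6.0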
karpathy revised this gist
Feb 12, 2026. 1 changed file with 1 addition and 1 deletion.
@@ -132,7 +132,7 @@ def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"
params = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value]
print(f"num params: {len(params)}")

# Define the model architecture: a stateless function mapping token sequence and parameters to logits over what comes next.
# Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU^2
def linear(x, w):
    return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]
karpathy revised this gist
Feb 12, 2026. 1 changed file with 3 additions and 2 deletions.
@@ -16,7 +16,8 @@
# Let there be an input dataset `docs`: list[str] of documents (e.g. a dataset of names)
if not os.path.exists('input.txt'):
    import urllib.request
    names_url = 'https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt'
    urllib.request.urlretrieve(names_url, 'input.txt')
docs = [l.strip() for l in open('input.txt').read().strip().split('\n') if l.strip()] # list[str] of documents
random.shuffle(docs)
print(f"num docs: {len(docs)}")
@@ -132,7 +133,7 @@ def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"
print(f"num params: {len(params)}")

# Define the model architecture, a stateless function token streams and model parameters to logits over what comes next.
# Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU^2
def linear(x, w):
    return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]
karpathy revised this gist
Feb 12, 2026. 1 changed file with 2 additions and 1 deletion.
@@ -1,6 +1,7 @@
"""
The most atomic way to train and inference a GPT in pure, dependency-free Python.
This file is the complete algorithm. Everything else is just efficiency.
@karpathy
"""
karpathy revised this gist
Feb 12, 2026. 1 changed file with 44 additions and 51 deletions.
@@ -1,47 +1,37 @@
"""
The most atomic way to train and inference a GPT in pure, dependency-free Python.
This file is the complete algorithm. Everything else is just efficiency.
@karpathy
"""
import os # os.path.exists
import math # math.log, math.exp
import random # random.seed, random.choices, random.gauss, random.shuffle

# Let there be order among chaos
random.seed(42)

# Let there be an input dataset `docs`: list[str] of documents (e.g. a dataset of names)
if not os.path.exists('input.txt'):
    import urllib.request
    urllib.request.urlretrieve('https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt', 'input.txt')
docs = [l.strip() for l in open('input.txt').read().strip().split('\n') if l.strip()] # list[str] of documents
random.shuffle(docs)
print(f"num docs: {len(docs)}")

# Let there be a Tokenizer to translate strings to discrete symbols and back
chars = ['<BOS>'] + sorted(set(''.join(docs))) # character-level tokenizer with a BOS delimiter
vocab_size = len(chars)
stoi = { ch:i for i, ch in enumerate(chars) } # encoding: map string to integer
itos = { i:ch for i, ch in enumerate(chars) } # decoding: map integer to string
BOS = stoi['<BOS>']
print(f"vocab size: {vocab_size}")

# Let there be an Autograd to apply the chain rule recursively across a computation graph and so
# calculate the gradients of the loss with respect to model parameters.
class Value:
    """Stores a single scalar value and its gradient."""

    def __init__(self, data, _children=(), _op=''):
        self.data = data
@@ -122,7 +112,12 @@ def __truediv__(self, other): return self * other**-1
    def __rtruediv__(self, other): return other * self**-1
    def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"

# Initialize the parameters, to store the knowledge of the model.
n_embd = 16 # embedding dimension
n_head = 4 # number of attention heads
n_layer = 1 # number of layers
block_size = 8 # maximum sequence length
head_dim = n_embd // n_head # dimension of each head
matrix = lambda nout, nin, std=0.02: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)]
state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd), 'lm_head': matrix(vocab_size, n_embd)}
for i in range(n_layer):
@@ -135,7 +130,8 @@ def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"
params = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value]
print(f"num params: {len(params)}")

# Define the model architecture, a stateless function token streams and model parameters to logits over what comes next.
# Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU^2, no weight tying
def linear(x, w):
    return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]
@@ -188,23 +184,21 @@ def gpt(token_id, pos_id, keys, values):
    logits = linear(x, state_dict['lm_head'])
    return logits

# Let there be Adam, the blessed optimizer and its buffers
learning_rate, beta1, beta2, eps_adam = 1e-2, 0.9, 0.95, 1e-8
m = [0.0] * len(params) # first moment buffer
v = [0.0] * len(params) # second moment buffer

# Repeat in sequence
num_steps = 500 # number of training steps
for step in range(num_steps):

    # Take single document, tokenize it, surround it with BOS special token on both sides
    doc = docs[step % len(docs)]
    tokens = [BOS] + [stoi[ch] for ch in doc] + [BOS]
    n = min(block_size, len(tokens) - 1)

    # Forward the token sequence through the model, building up the computation graph all the way to the loss.
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    losses = []
    for pos_id in range(n):
@@ -213,11 +207,13 @@ def gpt(token_id, pos_id, keys, values):
        probs = softmax(logits)
        loss_t = -probs[target_id].log()
        losses.append(loss_t)
    loss = (1 / n) * sum(losses) # final average loss over the document sequence. May yours be low.

    # Backward the loss, calculating the gradients with respect to all model parameters.
    loss.backward()

    # Adam optimizer update: update the model parameters based on the corresponding gradients.
    lr_t = learning_rate * (1 - step / num_steps)
    for i, p in enumerate(params):
        m[i] = beta1 * m[i] + (1 - beta1) * p.grad
        v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2
@@ -226,13 +222,10 @@ def gpt(token_id, pos_id, keys, values):
        p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
        p.grad = 0

    print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}")

# Inference: may the model babble back to us
temperature = 0.6 # in (0, 1], control the "creativity" of generated text, low to high
print("\n--- inference ---")
for sample_idx in range(20):
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
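One detail worth noting in this revision is the learning-rate schedule: lr_t = learning_rate * (1 - step / num_steps) decays linearly to zero, while a later revision switches to cosine decay. A small float-only sketch comparing the two schedules at a few steps (values are illustrative, not from a real run):

import math

# Compare the linear decay used in this revision with the cosine decay introduced later.
learning_rate, num_steps = 1e-2, 500
for step in (0, 250, 499):
    lr_linear = learning_rate * (1 - step / num_steps)
    lr_cosine = learning_rate * 0.5 * (1 + math.cos(math.pi * step / num_steps))
    print(f"step {step:3d} | linear {lr_linear:.5f} | cosine {lr_cosine:.5f}")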
karpathy revised this gist
Feb 11, 2026. 1 changed file with 15 additions and 13 deletions.
@@ -1,23 +1,24 @@
"""
The most atomic way to train and inference a GPT LLM in pure, dependency-free Python.
Differences from GPT-2 are minor: layer norm -> rmsnorm, no biases, GeLU -> square ReLU, no weight tying.
The contents of this file is everything algorithmically needed to train a GPT. Everything else is just efficiency.
Art project by @karpathy.
"""
import os # for os.path.exists
import time # for time.perf_counter
import math # for math.log, math.exp
import random # for random.seed, random.choices
import argparse # for argparse.ArgumentParser

# CLI arguments
parser = argparse.ArgumentParser()
parser.add_argument('--n-embd', type=int, default=16, help='Number of channels in the Transformer')
parser.add_argument('--n-layer', type=int, default=1, help='Number of layers in the Transformer')
parser.add_argument('--block-size', type=int, default=8, help='Maximum sequence length')
parser.add_argument('--num-steps', type=int, default=500, help='Number of training steps')
parser.add_argument('--n-head', type=int, default=4, help='Number of attention heads in the Transformer')
parser.add_argument('--learning-rate', type=float, default=1e-2, help='Learning rate')
args = parser.parse_args()
n_embd, block_size, n_layer, n_head = args.n_embd, args.block_size, args.n_layer, args.n_head
head_dim = n_embd // n_head
@@ -195,6 +196,7 @@ def gpt(token_id, pos_id, keys, values):
# Training loop
lossf_history = []
t_start = time.perf_counter()
for step in range(args.num_steps):

    # Take a single training document, tokenize it, surround it with BOS special token on both sides
@@ -224,16 +226,18 @@ def gpt(token_id, pos_id, keys, values):
        p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
        p.grad = 0

    lossf_history.append(loss.data)
    print(f"step {step+1:4d} / {args.num_steps:4d} | loss {loss.data:.4f}")

print(f"mean loss last 50 steps: {sum(lossf_history[-50:]) / len(lossf_history[-50:]):.4f}") # ~usable for basic kwarg tuning
print(f"training time: {time.perf_counter() - t_start:.2f}s") # ~usable for basic performance benchmarking

# Inference: generate 5 samples
temperature = 0.5 # number in (0, 1] that controls the "creativity" of generated text, low to high
print("\n--- inference ---")
for sample_idx in range(20):
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    token_id = BOS
    print(f"sample {sample_idx+1}: ", end="")
    for pos_id in range(block_size):
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax([l / temperature for l in logits])
@@ -242,5 +246,3 @@ def gpt(token_id, pos_id, keys, values):
            break
        print(itos[token_id], end="")
    print()
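The inference loop in this revision divides the logits by a temperature before softmax, which sharpens the distribution as the temperature drops below 1. A plain-float sketch of that effect, using a standalone softmax rather than the Value-based one in the gist; the logits here are made up:

import math, random

# Temperature-scaled sampling on plain floats, mirroring the gist's inference loop.
def softmax(logits):
    m = max(logits)
    exps = [math.exp(l - m) for l in logits]
    total = sum(exps)
    return [e / total for e in exps]

logits = [2.0, 1.0, 0.1]
for temperature in (1.0, 0.5):
    probs = softmax([l / temperature for l in logits])
    print(temperature, [round(p, 3) for p in probs])  # lower temperature -> sharper distribution

token_id = random.choices(range(len(logits)), weights=softmax([l / 0.5 for l in logits]))[0]
print("sampled token id:", token_id)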
karpathy revised this gist
Feb 11, 2026. No changes.
karpathy revised this gist
Feb 11, 2026. 1 changed file with 39 additions and 36 deletions.
@@ -1,6 +1,6 @@
"""
The most atomic way to train and inference a GPT LLM in pure, dependency-free Python.
Differences from GPT-2 are minor: rmsnorm instead of layer norm, no biases, square ReLU instead of GeLU nonlinearity, no weight tying.
The contents of this file is everything algorithmically needed to train a GPT. Everything else is just efficiency.
Art project by @karpathy.
"""
@@ -18,27 +18,24 @@
parser.add_argument('--num_steps', type=int, default=1000, help='Number of training steps')
parser.add_argument('--n_head', type=int, default=4, help='Number of attention heads in the Transformer')
parser.add_argument('--learning_rate', type=float, default=1e-2, help='Learning rate')
args = parser.parse_args()
n_embd, block_size, n_layer, n_head = args.n_embd, args.block_size, args.n_layer, args.n_head
head_dim = n_embd // n_head
random.seed(42)

# Dataset example: the names dataset (one name per line). rest of the code just assumes docs: list[str]
if not os.path.exists('input.txt'):
    import urllib.request
    urllib.request.urlretrieve('https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt', 'input.txt')
docs = [l.strip() for l in open('input.txt').read().strip().split('\n') if l.strip()] # list[str] of documents
random.shuffle(docs)

# Tokenizer: simple character-level tokenization with a BOS token delimiter
chars = ['<BOS>'] + sorted(set(''.join(docs)))
vocab_size = len(chars)
stoi = { ch:i for i, ch in enumerate(chars) } # string to integer
itos = { i:ch for i, ch in enumerate(chars) } # integer to string
BOS = stoi['<BOS>']
print(f"vocab size: {vocab_size}, num docs: {len(docs)}")

# Autograd engine
@@ -126,24 +123,24 @@ def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"

# Model parameter initialization
matrix = lambda nout, nin, std=0.02: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)]
state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd), 'lm_head': matrix(vocab_size, n_embd)}
for i in range(n_layer):
    state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd, std=0)
    state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd)
    state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd, std=0)
params = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value]
print(f"num params: {len(params)}")

# Model architecture
def linear(x, w):
    return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]

def softmax(logits):
    max_val = max(val.data for val in logits)
    exps = [(val - max_val).exp() for val in logits]
    total = sum(exps)
    return [e / total for e in exps]
@@ -154,18 +151,19 @@ def rmsnorm(x):

def gpt(token_id, pos_id, keys, values):
    tok_emb = state_dict['wte'][token_id] # token embedding
    pos_emb = state_dict['wpe'][pos_id] # position embedding
    x = [t + p for t, p in zip(tok_emb, pos_emb)] # joint token and position embedding
    x = rmsnorm(x)
    for li in range(n_layer):
        # 1) Multi-head attention block
        x_residual = x
        x = rmsnorm(x)
        q = linear(x, state_dict[f'layer{li}.attn_wq'])
        k = linear(x, state_dict[f'layer{li}.attn_wk'])
        v = linear(x, state_dict[f'layer{li}.attn_wv'])
        keys[li].append(k)
        values[li].append(v)
        x_attn = []
        for h in range(n_head):
            hs = h * head_dim
@@ -186,8 +184,7 @@ def gpt(token_id, pos_id, keys, values):
        x = linear(x, state_dict[f'layer{li}.mlp_fc2'])
        x = [a + b for a, b in zip(x, x_residual)]

    logits = linear(x, state_dict['lm_head'])
    return logits

# Adam optimizer
@@ -197,23 +194,25 @@ def gpt(token_id, pos_id, keys, values):
v = [0.0] * len(params) # second moment

# Training loop
lossf_history = []
for step in range(args.num_steps):

    # Take a single training document, tokenize it, surround it with BOS special token on both sides
    doc = docs[step % len(docs)]
    tokens = [BOS] + [stoi[ch] for ch in doc] + [BOS]
    n = min(block_size, len(tokens) - 1)

    # Forward/backward through the document over time dimension
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    losses = []
    for pos_id in range(n):
        token_id, target_id = tokens[pos_id], tokens[pos_id + 1]
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax(logits)
        loss_t = -probs[target_id].log()
        losses.append(loss_t)
    loss = (1 / n) * sum(losses) # average loss over the sequence
    loss.backward()

    # Adam update (optimizer)
    lr_t = learning_rate * (1 - step / args.num_steps)
@@ -225,19 +224,23 @@ def gpt(token_id, pos_id, keys, values):
        p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
        p.grad = 0

    print(f"step {step+1} / {args.num_steps} | loss {loss.data:.4f}")
    lossf_history.append(loss.data)

# Inference: generate 5 samples
temperature = 0.5 # number in (0, 1] that controls the "creativity" of generated text, low to high
print("\n--- generation ---")
for sample_idx in range(5):
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    token_id = BOS
    print(f"sample {sample_idx}: ", end="")
    for pos_id in range(block_size):
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax([l / temperature for l in logits])
        token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0]
        if token_id == BOS:
            break
        print(itos[token_id], end="")
    print()

print(f"mean loss last 50 steps: {sum(lossf_history[-50:]) / len(lossf_history[-50:]):.4f}")
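Inside gpt(), each head scores the current query against every cached key, softmaxes the scores, and mixes the cached values. A plain-float sketch of that single-head step with the same shapes; the head_dim and all vectors below are invented for illustration:

import math

# One query attending over a growing per-head key/value cache, as in the attention block above.
head_dim = 4
q_h = [0.1, -0.2, 0.3, 0.05]                               # query for the current position
k_cache = [[0.2, 0.1, -0.1, 0.0], [0.0, 0.3, 0.2, -0.1]]   # cached keys for positions seen so far
v_cache = [[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]]     # cached values for positions seen so far

attn_logits = [sum(q_h[j] * k[j] for j in range(head_dim)) / head_dim**0.5 for k in k_cache]
m = max(attn_logits)
exps = [math.exp(a - m) for a in attn_logits]
attn_weights = [e / sum(exps) for e in exps]
head_out = [sum(attn_weights[t] * v_cache[t][j] for t in range(len(v_cache))) for j in range(head_dim)]
print(head_out)  # a convex combination of the cached value vectors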
karpathy created this gist
Feb 11, 2026.
@@ -0,0 +1,243 @@
"""
The most atomic way to train and inference a GPT LLM in pure, dependency-free Python.
Differences from GPT-2 are minor: rmsnorm instead of layer norm, no biases, square ReLU instead of GeLU nonlinearity.
The contents of this file is everything algorithmically needed to train a GPT. Everything else is just efficiency.
Art project by @karpathy.
"""
import os # for os.path.exists
import math # for math.log, math.exp
import random # for random.seed, random.choices
import argparse # for argparse.ArgumentParser

# CLI arguments
parser = argparse.ArgumentParser()
parser.add_argument('--n_embd', type=int, default=16, help='Number of channels in the Transformer')
parser.add_argument('--n_layer', type=int, default=1, help='Number of layers in the Transformer')
parser.add_argument('--block_size', type=int, default=8, help='Maximum sequence length')
parser.add_argument('--num_steps', type=int, default=1000, help='Number of training steps')
parser.add_argument('--n_head', type=int, default=4, help='Number of attention heads in the Transformer')
parser.add_argument('--learning_rate', type=float, default=1e-2, help='Learning rate')
parser.add_argument('--seed', type=int, default=42, help='Random seed')
args = parser.parse_args()
random.seed(args.seed)
n_embd, block_size, n_layer, n_head = args.n_embd, args.block_size, args.n_layer, args.n_head
head_dim = n_embd // n_head

# Dataset example: the names dataset (one name per line). rest of the code just assumes docs: list[str]
if not os.path.exists('input.txt'):
    import urllib.request
    urllib.request.urlretrieve('https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt', 'input.txt')
with open('input.txt', 'r') as file:
    text = file.read()
docs = [line.strip() for line in text.strip().split('\n') if line.strip()]
random.shuffle(docs)

# Tokenizer: simple character-level tokenization with BOS/EOS tokens
chars = ['<BOS>', '<EOS>'] + sorted(list(set(''.join(docs))))
vocab_size = len(chars)
stoi = { ch:i for i, ch in enumerate(chars) } # string to integer
itos = { i:ch for i, ch in enumerate(chars) } # integer to string
BOS, EOS = stoi['<BOS>'], stoi['<EOS>']
print(f"vocab size: {vocab_size}, num docs: {len(docs)}")

# Autograd engine
class Value:
    """ stores a single scalar value and its gradient """

    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op # the op that produced this node, for graphviz / debugging / etc

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')
        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward
        return out

    def log(self):
        out = Value(math.log(self.data), (self,), 'log')
        def _backward():
            self.grad += (1 / self.data) * out.grad
        out._backward = _backward
        return out

    def exp(self):
        out = Value(math.exp(self.data), (self,), 'exp')
        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward
        return out

    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')
        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward
        return out

    def backward(self):
        # topological order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)
        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backward()

    def __neg__(self): return self * -1
    def __radd__(self, other): return self + other
    def __sub__(self, other): return self + (-other)
    def __rsub__(self, other): return other + (-self)
    def __rmul__(self, other): return self * other
    def __truediv__(self, other): return self * other**-1
    def __rtruediv__(self, other): return other * self**-1
    def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"

# Model parameter initialization
matrix = lambda nout, nin, std=0.02: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)]
state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd)}
for i in range(n_layer):
    state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd, std=0)
    state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd)
    state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd, std=0)
params = [p for mat in state_dict.values() for row in mat for p in row]
print(f"num params: {len(params)}")

# Model architecture
def linear(x, w):
    return [sum(w[o][i] * x[i] for i in range(len(x))) for o in range(len(w))]

def softmax(logits):
    max_val = max(v.data for v in logits)
    exps = [(v - max_val).exp() for v in logits]
    total = sum(exps)
    return [e / total for e in exps]

def rmsnorm(x):
    ms = sum(xi * xi for xi in x) / len(x)
    scale = (ms + 1e-5) ** -0.5
    return [xi * scale for xi in x]

def gpt(token_id, pos_id, keys, values):
    tok_emb = state_dict['wte'][token_id] # token embedding
    pos_emb = state_dict['wpe'][pos_id % block_size] # position embedding
    x = [t + p for t, p in zip(tok_emb, pos_emb)] # joint token and position embedding
    for li in range(n_layer):
        # 1) Multi-head attention block
        x_residual = x
        x = rmsnorm(x)
        q = linear(x, state_dict[f'layer{li}.attn_wq'])
        k = linear(x, state_dict[f'layer{li}.attn_wk'])
        val = linear(x, state_dict[f'layer{li}.attn_wv'])
        keys[li].append(k)
        values[li].append(val)
        x_attn = []
        for h in range(n_head):
            hs = h * head_dim
            q_h = q[hs:hs+head_dim]
            k_h = [ki[hs:hs+head_dim] for ki in keys[li]]
            v_h = [vi[hs:hs+head_dim] for vi in values[li]]
            attn_logits = [sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim**0.5 for t in range(len(k_h))]
            attn_weights = softmax(attn_logits)
            head_out = [sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) for j in range(head_dim)]
            x_attn.extend(head_out)
        x = linear(x_attn, state_dict[f'layer{li}.attn_wo'])
        x = [a + b for a, b in zip(x, x_residual)]
        # 2) MLP block
        x_residual = x
        x = rmsnorm(x)
        x = linear(x, state_dict[f'layer{li}.mlp_fc1'])
        x = [xi.relu() ** 2 for xi in x]
        x = linear(x, state_dict[f'layer{li}.mlp_fc2'])
        x = [a + b for a, b in zip(x, x_residual)]
    # project to vocab (weight tying with wte)
    logits = linear(x, state_dict['wte'])
    return logits

# Adam optimizer
learning_rate = args.learning_rate
beta1, beta2, eps_adam = 0.9, 0.95, 1e-8
m = [0.0] * len(params) # first moment
v = [0.0] * len(params) # second moment

# Training loop
for step in range(args.num_steps):

    # Take a single training document, tokenize it, and crop to block_size
    doc = docs[step % len(docs)]
    tokens = [BOS] + [stoi[ch] for ch in doc] + [EOS]
    tokens = tokens[:block_size]

    # Forward pass through the document over time dimension
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    lossf = 0.0
    for pos_id in range(len(tokens) - 1):
        logits = gpt(tokens[pos_id], pos_id, keys, values)
        probs = softmax(logits)
        loss = -probs[tokens[pos_id + 1]].log()
        loss = (1 / (len(tokens) - 1)) * loss # average over sequence length
        loss.backward()
        lossf += loss.data

    # Adam update (optimizer)
    lr_t = learning_rate * (1 - step / args.num_steps)
    for i, p in enumerate(params):
        m[i] = beta1 * m[i] + (1 - beta1) * p.grad
        v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2
        m_hat = m[i] / (1 - beta1 ** (step + 1))
        v_hat = v[i] / (1 - beta2 ** (step + 1))
        p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
        p.grad = 0

    print(f"step {step+1} / {args.num_steps} | loss {lossf:.4f}")

# Inference: generate 5 samples
print("\n--- generation ---")
for sample_idx in range(5):
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    token_id = BOS
    generated = []
    for pos_id in range(block_size):
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax(logits)
        token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0]
        if token_id == EOS:
            break
        generated.append(itos[token_id])
    print(f"sample {sample_idx}: {''.join(generated)}")
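The Adam update in the training loop keeps running first and second moment estimates per parameter and bias-corrects them before each step. A single-parameter, plain-float sketch with the same hyperparameters (beta1=0.9, beta2=0.95, eps=1e-8); the gradient values are made up for illustration:

# Adam update for one scalar parameter, mirroring the loop above on plain floats.
beta1, beta2, eps_adam, lr = 0.9, 0.95, 1e-8, 1e-2
p, m, v = 0.0, 0.0, 0.0
grads = [0.4, -0.1, 0.3]  # pretend per-step gradients
for step, g in enumerate(grads):
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g ** 2
    m_hat = m / (1 - beta1 ** (step + 1))  # bias correction: early steps would otherwise be shrunk toward 0
    v_hat = v / (1 - beta2 ** (step + 1))
    p -= lr * m_hat / (v_hat ** 0.5 + eps_adam)
    print(f"step {step}: p = {p:.5f}")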