Vision Transformer All in One [Numpy]
import numpy as np


def get_len_mask(batch_size, seq_len):
    """Create attention mask (simplified for ViT where all patches are attended to)"""
    # In ViT, typically all patches are attended to (no padding)
    return np.zeros((batch_size, seq_len, seq_len), dtype=bool)
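
# Note: get_len_mask is provided for completeness; VisionTransformer.forward
# below never passes a mask, since every patch attends to every other patch.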


def pos_sinusoid_embedding(seq_len, d_model):
    """Generate positional encoding"""
    embeddings = np.zeros((seq_len, d_model))
    for i in range(d_model):
        if i % 2 == 0:
            embeddings[:, i] = np.sin(np.arange(0, seq_len) / np.power(10000, 2 * (i // 2) / d_model))
        else:
            embeddings[:, i] = np.cos(np.arange(0, seq_len) / np.power(10000, 2 * (i // 2) / d_model))
    return embeddings.astype(np.float32)
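
# pos_sinusoid_embedding implements the classic Transformer encoding:
#   PE[pos, 2k]   = sin(pos / 10000^(2k / d_model))
#   PE[pos, 2k+1] = cos(pos / 10000^(2k / d_model))
# It is unused below: following the ViT paper, VisionTransformer learns its
# position embeddings (self.pos_embedding) rather than fixing them.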


class LayerNorm:
    def __init__(self, dim, eps=1e-6):
        self.gamma = np.ones(dim)
        self.beta = np.zeros(dim)
        self.eps = eps

    def forward(self, x):
        mean = np.mean(x, axis=-1, keepdims=True)
        var = np.var(x, axis=-1, keepdims=True)
        x_norm = (x - mean) / np.sqrt(var + self.eps)
        return self.gamma * x_norm + self.beta


class MultiHeadAttention:
    def __init__(self, d_model, num_heads, dropout_rate=0.0):
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.dropout_rate = dropout_rate
        # Linear projections with Xavier/Glorot initialization
        # (fan_in = fan_out = d_model for each d_model x d_model matrix)
        scale = np.sqrt(2.0 / (d_model + d_model))
        self.W_Q = np.random.normal(0, scale, (d_model, d_model))
        self.W_K = np.random.normal(0, scale, (d_model, d_model))
        self.W_V = np.random.normal(0, scale, (d_model, d_model))
        self.W_out = np.random.normal(0, scale, (d_model, d_model))
        # Biases
        self.b_Q = np.zeros(d_model)
        self.b_K = np.zeros(d_model)
        self.b_V = np.zeros(d_model)
        self.b_out = np.zeros(d_model)

    def split_heads(self, x):
        """Split the last dimension into (num_heads, head_dim)"""
        batch_size, seq_len = x.shape[0], x.shape[1]
        x = x.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
        return x

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.shape[0]
        q_len, k_len, v_len = Q.shape[1], K.shape[1], V.shape[1]
        # Linear projections
        Q_proj = np.dot(Q, self.W_Q) + self.b_Q  # (batch_size, q_len, d_model)
        K_proj = np.dot(K, self.W_K) + self.b_K  # (batch_size, k_len, d_model)
        V_proj = np.dot(V, self.W_V) + self.b_V  # (batch_size, v_len, d_model)
        # Split heads
        Q_proj = self.split_heads(Q_proj)  # (batch_size, q_len, num_heads, head_dim)
        K_proj = self.split_heads(K_proj)  # (batch_size, k_len, num_heads, head_dim)
        V_proj = self.split_heads(V_proj)  # (batch_size, v_len, num_heads, head_dim)
        # Transpose for batch matrix multiplication
        Q_proj = np.transpose(Q_proj, (0, 2, 1, 3))  # (batch_size, num_heads, q_len, head_dim)
        K_proj = np.transpose(K_proj, (0, 2, 1, 3))  # (batch_size, num_heads, k_len, head_dim)
        V_proj = np.transpose(V_proj, (0, 2, 1, 3))  # (batch_size, num_heads, v_len, head_dim)
        # Calculate attention scores
        scores = np.matmul(Q_proj, np.transpose(K_proj, (0, 1, 3, 2))) / np.sqrt(self.head_dim)
        # (batch_size, num_heads, q_len, k_len)
        # Apply mask if provided (broadcast over the heads axis)
        if mask is not None:
            scores = np.where(mask[:, np.newaxis], -1e9, scores)
        # Apply softmax
        attention_weights = self._softmax(scores)
        # Apply dropout (simplified for NumPy)
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, attention_weights.shape)
            attention_weights = attention_weights * dropout_mask / (1 - self.dropout_rate)
        # Calculate context vectors
        context = np.matmul(attention_weights, V_proj)  # (batch_size, num_heads, q_len, head_dim)
        # Reshape back
        context = np.transpose(context, (0, 2, 1, 3))  # (batch_size, q_len, num_heads, head_dim)
        context = context.reshape(batch_size, q_len, self.d_model)
        # Final linear layer
        output = np.dot(context, self.W_out) + self.b_out
        return output

    def _softmax(self, x):
        """Compute softmax along the last dimension"""
        # Shift for numerical stability
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e_x / np.sum(e_x, axis=-1, keepdims=True)
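
# Shape walk-through for one attention call, using the test configuration at
# the bottom of this file (embed_dim=192, num_heads=3, so head_dim=64):
#   Q, K, V:                 (2, 197, 192)   # 196 patches + 1 class token
#   after split + transpose: (2, 3, 197, 64)
#   scores / weights:        (2, 3, 197, 197)
#   context -> output:       (2, 197, 192)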


class PositionwiseFeedForward:
    def __init__(self, d_model, d_ff, dropout_rate=0.0):
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate
        # Weights for the two layers
        scale1 = np.sqrt(2.0 / (d_model + d_ff))
        scale2 = np.sqrt(2.0 / (d_ff + d_model))
        self.W1 = np.random.normal(0, scale1, (d_model, d_ff))
        self.b1 = np.zeros(d_ff)
        self.W2 = np.random.normal(0, scale2, (d_ff, d_model))
        self.b2 = np.zeros(d_model)

    def forward(self, x):
        # First dense layer with GELU activation
        hidden = np.dot(x, self.W1) + self.b1
        hidden = self.gelu(hidden)
        # Second dense layer
        output = np.dot(hidden, self.W2) + self.b2
        # Dropout
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, output.shape)
            output = output * dropout_mask / (1 - self.dropout_rate)
        return output

    def gelu(self, x):
        """Gaussian Error Linear Unit activation"""
        return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))
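
# The gelu above is the tanh approximation introduced by Hendrycks & Gimpel
# (2016) and used by BERT and many Transformer implementations; the exact
# definition, 0.5 * x * (1 + erf(x / sqrt(2))), agrees with it closely for
# typical inputs.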


class EncoderLayer:
    def __init__(self, d_model, num_heads, d_ff, dropout_rate=0.1):
        # Multi-head attention
        self.mha = MultiHeadAttention(d_model, num_heads, dropout_rate)
        # Position-wise feed-forward network
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout_rate)
        # Layer normalization
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        # Dropout
        self.dropout_rate = dropout_rate

    def forward(self, x, mask=None):
        # Pre-LayerNorm architecture
        # Multi-head attention block
        norm_x = self.norm1.forward(x)
        attention_output = self.mha.forward(norm_x, norm_x, norm_x, mask)
        # Apply dropout and residual connection
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, attention_output.shape)
            attention_output = attention_output * dropout_mask / (1 - self.dropout_rate)
        x = x + attention_output
        # Feed-forward block
        norm_x = self.norm2.forward(x)
        ffn_output = self.ffn.forward(norm_x)
        # Apply dropout and residual connection
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, ffn_output.shape)
            ffn_output = ffn_output * dropout_mask / (1 - self.dropout_rate)
        x = x + ffn_output
        return x
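
# Each encoder layer computes the pre-LN residual update
#   x = x + Dropout(MHA(LN(x)))
#   x = x + Dropout(FFN(LN(x)))
# i.e. LayerNorm is applied before each sublayer and the residual added after.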


class PatchEmbedding:
    """
    Convert images into patches and project to embedding dimension.
    This is the first layer of Vision Transformer.
    """

    def __init__(self, img_size, patch_size, in_channels, embed_dim):
        """
        Args:
            img_size: Size of the input image (H, W)
            patch_size: Size of each patch (P, P)
            in_channels: Number of input channels (C)
            embed_dim: Dimension of the token embeddings (D)
        """
        self.img_size = img_size
        self.patch_size = patch_size
        self.in_channels = in_channels
        self.embed_dim = embed_dim
        # Number of patches
        self.num_patches = (img_size[0] // patch_size) * (img_size[1] // patch_size)
        # Projection layer: transforms patches into embedding dimension
        # Equivalent to a Conv2D with kernel size = patch size and stride = patch size
        scale = np.sqrt(2.0 / (patch_size * patch_size * in_channels + embed_dim))
        self.proj_weight = np.random.normal(0, scale, (embed_dim, in_channels, patch_size, patch_size))
        self.proj_bias = np.zeros(embed_dim)

    def forward(self, x):
        """
        Args:
            x: Input images [batch_size, in_channels, height, width]
        Returns:
            Patch embeddings [batch_size, num_patches, embed_dim]
        """
        batch_size, _, height, width = x.shape
        # Check image size
        assert height == self.img_size[0] and width == self.img_size[1], \
            f"Input image size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})"
        # Extract patches from image
        patches = self.extract_patches(x)  # [batch_size, num_patches, in_channels*patch_size*patch_size]
        # Project patches to embedding dimension
        patch_embeddings = self.project_patches(patches)  # [batch_size, num_patches, embed_dim]
        return patch_embeddings

    def extract_patches(self, x):
        """Extract patches from the input images"""
        batch_size, channels, height, width = x.shape
        patch_size = self.patch_size
        # Calculate the number of patches in each dimension
        num_patches_h = height // patch_size
        num_patches_w = width // patch_size
        # Extract patches
        patches = []
        for i in range(num_patches_h):
            for j in range(num_patches_w):
                h_start, h_end = i * patch_size, (i + 1) * patch_size
                w_start, w_end = j * patch_size, (j + 1) * patch_size
                # Extract patch [batch_size, channels, patch_size, patch_size]
                patch = x[:, :, h_start:h_end, w_start:w_end]
                # Flatten the patch [batch_size, channels*patch_size*patch_size]
                patch_flat = patch.reshape(batch_size, -1)
                patches.append(patch_flat)
        # Stack patches [batch_size, num_patches, channels*patch_size*patch_size]
        patches = np.stack(patches, axis=1)
        return patches
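
    # A loop-free alternative (an illustrative sketch, not part of the original
    # gist; the method name is ours): one reshape/transpose pair reproduces the
    # same row-major patch order and channel-major flattening, assuming height
    # and width are exact multiples of patch_size.
    def extract_patches_vectorized(self, x):
        """Vectorized equivalent of extract_patches."""
        b, c, h, w = x.shape
        p = self.patch_size
        # (b, c, h//p, p, w//p, p) -> (b, h//p, w//p, c, p, p)
        x = x.reshape(b, c, h // p, p, w // p, p).transpose(0, 2, 4, 1, 3, 5)
        # Flatten each (c, p, p) patch, matching patch.reshape(batch_size, -1) above
        return x.reshape(b, (h // p) * (w // p), c * p * p)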

    def project_patches(self, patches):
        """Project patches to embedding dimension"""
        batch_size, num_patches, patch_dim = patches.shape
        # Reshape projection weight for matrix multiplication
        proj_weight_reshaped = self.proj_weight.reshape(self.embed_dim, -1)
        # Project patches [batch_size, num_patches, embed_dim]
        patch_embeddings = np.zeros((batch_size, num_patches, self.embed_dim))
        for b in range(batch_size):
            # [num_patches, patch_dim] @ [patch_dim, embed_dim] = [num_patches, embed_dim]
            patch_embeddings[b] = np.dot(patches[b], proj_weight_reshaped.T) + self.proj_bias
        return patch_embeddings
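
    # Note: the per-sample loop above could be replaced by the single call
    #   np.dot(patches, proj_weight_reshaped.T) + self.proj_bias
    # because np.dot of a 3-D array with a 2-D array contracts the last axis
    # of the first with the first axis of the second, giving
    # [batch_size, num_patches, embed_dim] directly.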


class VisionTransformer:
    """
    Vision Transformer (ViT) model.
    """

    def __init__(
        self,
        img_size=(224, 224),
        patch_size=16,
        in_channels=3,
        num_classes=1000,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        dropout_rate=0.1,
        attn_dropout_rate=0.0
    ):
        """
        Args:
            img_size: Size of the input image (height, width)
            patch_size: Size of each patch
            in_channels: Number of input channels
            num_classes: Number of classes for classification
            embed_dim: Dimension of the token embeddings
            depth: Number of transformer blocks
            num_heads: Number of attention heads
            mlp_ratio: Ratio of mlp hidden dim to embedding dim
            dropout_rate: Dropout rate
            attn_dropout_rate: Attention dropout rate
        """
        self.img_size = img_size
        self.patch_size = patch_size
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.mlp_dim = int(embed_dim * mlp_ratio)
        self.dropout_rate = dropout_rate
        self.attn_dropout_rate = attn_dropout_rate
        # Patch embedding layer
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        num_patches = self.patch_embed.num_patches
        # Class token (learnable)
        self.cls_token = np.random.normal(0, 0.02, (1, 1, embed_dim))
        # Position embeddings (learnable); +1 for the class token
        self.pos_embedding = np.random.normal(0, 0.02, (1, num_patches + 1, embed_dim))
        # Transformer encoder layers
        self.encoder_layers = []
        for _ in range(depth):
            layer = EncoderLayer(
                d_model=embed_dim,
                num_heads=num_heads,
                d_ff=self.mlp_dim,
                dropout_rate=dropout_rate
            )
            self.encoder_layers.append(layer)
        # Layer norm
        self.norm = LayerNorm(embed_dim)
        # Classification head
        scale = np.sqrt(2.0 / (embed_dim + num_classes))
        self.head_weight = np.random.normal(0, scale, (embed_dim, num_classes))
        self.head_bias = np.zeros(num_classes)

    def forward(self, x):
        """
        Args:
            x: Input images [batch_size, in_channels, height, width]
        Returns:
            Classification logits [batch_size, num_classes]
        """
        batch_size = x.shape[0]
        # Create patch embeddings
        x = self.patch_embed.forward(x)  # [batch_size, num_patches, embed_dim]
        # Add class token to beginning of sequence
        cls_tokens = np.repeat(self.cls_token, batch_size, axis=0)  # [batch_size, 1, embed_dim]
        x = np.concatenate([cls_tokens, x], axis=1)  # [batch_size, 1 + num_patches, embed_dim]
        # Add position embeddings
        x = x + self.pos_embedding  # [batch_size, 1 + num_patches, embed_dim]
        # Apply dropout
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, x.shape)
            x = x * dropout_mask / (1 - self.dropout_rate)
        # Apply transformer encoder
        for layer in self.encoder_layers:
            x = layer.forward(x)
        # Layer normalization
        x = self.norm.forward(x)
        # Take the output of the class token for classification
        x = x[:, 0]  # [batch_size, embed_dim]
        # Classification head
        logits = np.dot(x, self.head_weight) + self.head_bias  # [batch_size, num_classes]
        return logits
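
# End-to-end flow: image -> patch embeddings -> prepend class token -> add
# learned position embeddings -> `depth` pre-LN encoder layers -> final
# LayerNorm -> class-token slice -> linear head -> logits. Dropout is always
# active here; a full implementation would disable it at inference time
# (this file has no train/eval switch).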


def create_sample_image_batch(batch_size=1, img_size=(224, 224), channels=3):
    """Create a sample batch of images for testing"""
    return np.random.random((batch_size, channels, img_size[0], img_size[1]))


def test_vision_transformer():
    """Test the Vision Transformer implementation with a sample input"""
    # Create a small ViT model for testing
    model = VisionTransformer(
        img_size=(224, 224),
        patch_size=16,
        in_channels=3,
        num_classes=10,
        embed_dim=192,
        depth=4,
        num_heads=3,
        mlp_ratio=4.0,
        dropout_rate=0.1
    )
    # Create a sample batch of images
    batch_size = 2
    images = create_sample_image_batch(batch_size, (224, 224), 3)
    # Forward pass
    logits = model.forward(images)
    print(f"Input shape: {images.shape}")
    print(f"Output shape: {logits.shape}")
    print(f"Expected output shape: (batch_size, num_classes) = ({batch_size}, {model.num_classes})")
    return logits


if __name__ == "__main__":
    test_vision_transformer()
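
# Running this file should print (shapes only; logit values vary with the RNG):
#   Input shape: (2, 3, 224, 224)
#   Output shape: (2, 10)
#   Expected output shape: (batch_size, num_classes) = (2, 10)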