Vision Transformer All in One [Numpy]
import numpy as np


def get_len_mask(batch_size, seq_len):
    """Create attention mask (simplified for ViT where all patches are attended to)."""
    # In ViT, typically all patches are attended to (no padding)
    return np.zeros((batch_size, seq_len, seq_len), dtype=bool)

def pos_sinusoid_embedding(seq_len, d_model):
    """Generate positional encoding"""
    embeddings = np.zeros((seq_len, d_model))
    for i in range(d_model):
        if i % 2 == 0:
            embeddings[:, i] = np.sin(np.arange(0, seq_len) / np.power(10000, 2 * (i // 2) / d_model))
        else:
            embeddings[:, i] = np.cos(np.arange(0, seq_len) / np.power(10000, 2 * (i // 2) / d_model))
    return embeddings.astype(np.float32)
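
# Example (illustrative): pos_sinusoid_embedding(4, 8) returns a (4, 8) float32 table where
# even-indexed columns hold sine terms and odd-indexed columns the matching cosine terms.
# Note: the ViT below uses *learnable* position embeddings instead; this helper is an
# optional fixed alternative.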

class LayerNorm:
    def __init__(self, dim, eps=1e-6):
        self.gamma = np.ones(dim)
        self.beta = np.zeros(dim)
        self.eps = eps

    def forward(self, x):
        mean = np.mean(x, axis=-1, keepdims=True)
        var = np.var(x, axis=-1, keepdims=True)
        x_norm = (x - mean) / np.sqrt(var + self.eps)
        return self.gamma * x_norm + self.beta
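
# Example (illustrative): LayerNorm normalizes over the last axis only, e.g.
#   ln = LayerNorm(dim=192)
#   y = ln.forward(np.random.randn(2, 197, 192))  # same shape; each 192-dim vector is
#                                                 # normalized, then scaled by gamma and
#                                                 # shifted by beta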

class MultiHeadAttention:
    def __init__(self, d_model, num_heads, dropout_rate=0.0):
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.dropout_rate = dropout_rate

        # Linear projections with Xavier/Glorot initialization
        scale = np.sqrt(2.0 / (d_model + self.head_dim))
        self.W_Q = np.random.normal(0, scale, (d_model, d_model))
        self.W_K = np.random.normal(0, scale, (d_model, d_model))
        self.W_V = np.random.normal(0, scale, (d_model, d_model))
        self.W_out = np.random.normal(0, scale, (d_model, d_model))

        # Biases
        self.b_Q = np.zeros(d_model)
        self.b_K = np.zeros(d_model)
        self.b_V = np.zeros(d_model)
        self.b_out = np.zeros(d_model)

    def split_heads(self, x):
        """Split the last dimension into (num_heads, head_dim)"""
        batch_size, seq_len = x.shape[0], x.shape[1]
        x = x.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
        return x

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.shape[0]
        q_len, k_len, v_len = Q.shape[1], K.shape[1], V.shape[1]

        # Linear projections
        Q_proj = np.dot(Q, self.W_Q) + self.b_Q  # (batch_size, q_len, d_model)
        K_proj = np.dot(K, self.W_K) + self.b_K  # (batch_size, k_len, d_model)
        V_proj = np.dot(V, self.W_V) + self.b_V  # (batch_size, v_len, d_model)

        # Split heads
        Q_proj = self.split_heads(Q_proj)  # (batch_size, q_len, num_heads, head_dim)
        K_proj = self.split_heads(K_proj)  # (batch_size, k_len, num_heads, head_dim)
        V_proj = self.split_heads(V_proj)  # (batch_size, v_len, num_heads, head_dim)

        # Transpose for batch matrix multiplication
        Q_proj = np.transpose(Q_proj, (0, 2, 1, 3))  # (batch_size, num_heads, q_len, head_dim)
        K_proj = np.transpose(K_proj, (0, 2, 1, 3))  # (batch_size, num_heads, k_len, head_dim)
        V_proj = np.transpose(V_proj, (0, 2, 1, 3))  # (batch_size, num_heads, v_len, head_dim)

        # Calculate attention scores: (batch_size, num_heads, q_len, k_len)
        scores = np.matmul(Q_proj, np.transpose(K_proj, (0, 1, 3, 2))) / np.sqrt(self.head_dim)

        # Apply mask if provided
        if mask is not None:
            scores = np.where(mask[:, np.newaxis], -1e9, scores)

        # Apply softmax
        attention_weights = self._softmax(scores)

        # Apply dropout (simplified for NumPy)
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, attention_weights.shape)
            attention_weights = attention_weights * dropout_mask / (1 - self.dropout_rate)

        # Calculate context vectors
        context = np.matmul(attention_weights, V_proj)  # (batch_size, num_heads, q_len, head_dim)

        # Reshape back
        context = np.transpose(context, (0, 2, 1, 3))  # (batch_size, q_len, num_heads, head_dim)
        context = context.reshape(batch_size, q_len, self.d_model)

        # Final linear layer
        output = np.dot(context, self.W_out) + self.b_out
        return output

    def _softmax(self, x):
        """Compute softmax along the last dimension"""
        # Shift for numerical stability
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e_x / np.sum(e_x, axis=-1, keepdims=True)
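
# Example (illustrative): self-attention over a ViT-style token sequence
#   mha = MultiHeadAttention(d_model=192, num_heads=3)   # head_dim = 64
#   tokens = np.random.randn(2, 197, 192)                # 196 patch tokens + 1 class token
#   out = mha.forward(tokens, tokens, tokens)            # -> (2, 197, 192)
# Scores are scaled by 1/sqrt(head_dim); a boolean mask, if given, marks positions to ignore.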

class PositionwiseFeedForward:
    def __init__(self, d_model, d_ff, dropout_rate=0.0):
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate

        # Weights for the two layers
        scale1 = np.sqrt(2.0 / (d_model + d_ff))
        scale2 = np.sqrt(2.0 / (d_ff + d_model))
        self.W1 = np.random.normal(0, scale1, (d_model, d_ff))
        self.b1 = np.zeros(d_ff)
        self.W2 = np.random.normal(0, scale2, (d_ff, d_model))
        self.b2 = np.zeros(d_model)

    def forward(self, x):
        # First dense layer with GELU activation
        hidden = np.dot(x, self.W1) + self.b1
        hidden = self.gelu(hidden)

        # Second dense layer
        output = np.dot(hidden, self.W2) + self.b2

        # Dropout
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, output.shape)
            output = output * dropout_mask / (1 - self.dropout_rate)
        return output

    def gelu(self, x):
        """Gaussian Error Linear Unit activation (tanh approximation)"""
        return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))
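
# Example (illustrative): the feed-forward block expands then contracts the embedding dim
#   ffn = PositionwiseFeedForward(d_model=192, d_ff=768)   # d_ff = 4 * d_model (mlp_ratio=4)
#   out = ffn.forward(np.random.randn(2, 197, 192))        # -> (2, 197, 192), GELU in between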

class EncoderLayer:
    def __init__(self, d_model, num_heads, d_ff, dropout_rate=0.1):
        # Multi-head attention
        self.mha = MultiHeadAttention(d_model, num_heads, dropout_rate)
        # Position-wise feed-forward network
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout_rate)
        # Layer normalization
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        # Dropout
        self.dropout_rate = dropout_rate

    def forward(self, x, mask=None):
        # Pre-LayerNorm architecture
        # Multi-head attention block
        norm_x = self.norm1.forward(x)
        attention_output = self.mha.forward(norm_x, norm_x, norm_x, mask)

        # Apply dropout and residual connection
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, attention_output.shape)
            attention_output = attention_output * dropout_mask / (1 - self.dropout_rate)
        x = x + attention_output

        # Feed-forward block
        norm_x = self.norm2.forward(x)
        ffn_output = self.ffn.forward(norm_x)

        # Apply dropout and residual connection
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, ffn_output.shape)
            ffn_output = ffn_output * dropout_mask / (1 - self.dropout_rate)
        x = x + ffn_output

        return x
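
# Example (illustrative): one pre-LayerNorm encoder block keeps the token shape unchanged
#   block = EncoderLayer(d_model=192, num_heads=3, d_ff=768, dropout_rate=0.0)
#   out = block.forward(np.random.randn(2, 197, 192))   # -> (2, 197, 192)
# Each sub-layer is applied as x + Dropout(SubLayer(LayerNorm(x))).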

class PatchEmbedding:
    """
    Convert images into patches and project to embedding dimension.
    This is the first layer of Vision Transformer.
    """
    def __init__(self, img_size, patch_size, in_channels, embed_dim):
        """
        Args:
            img_size: Size of the input image (H, W)
            patch_size: Size of each patch (P, P)
            in_channels: Number of input channels (C)
            embed_dim: Dimension of the token embeddings (D)
        """
        self.img_size = img_size
        self.patch_size = patch_size
        self.in_channels = in_channels
        self.embed_dim = embed_dim

        # Number of patches
        self.num_patches = (img_size[0] // patch_size) * (img_size[1] // patch_size)

        # Projection layer: transforms patches into embedding dimension
        # Equivalent to a Conv2D with kernel size = patch size and stride = patch size
        scale = np.sqrt(2.0 / (patch_size * patch_size * in_channels + embed_dim))
        self.proj_weight = np.random.normal(0, scale, (embed_dim, in_channels, patch_size, patch_size))
        self.proj_bias = np.zeros(embed_dim)

    def forward(self, x):
        """
        Args:
            x: Input images [batch_size, in_channels, height, width]
        Returns:
            Patch embeddings [batch_size, num_patches, embed_dim]
        """
        batch_size, _, height, width = x.shape

        # Check image size
        assert height == self.img_size[0] and width == self.img_size[1], \
            f"Input image size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})"

        # Extract patches from image
        patches = self.extract_patches(x)  # [batch_size, num_patches, in_channels*patch_size*patch_size]

        # Project patches to embedding dimension
        patch_embeddings = self.project_patches(patches)  # [batch_size, num_patches, embed_dim]
        return patch_embeddings

    def extract_patches(self, x):
        """Extract patches from the input images"""
        batch_size, channels, height, width = x.shape
        patch_size = self.patch_size

        # Calculate the number of patches in each dimension
        num_patches_h = height // patch_size
        num_patches_w = width // patch_size

        # Extract patches
        patches = []
        for i in range(num_patches_h):
            for j in range(num_patches_w):
                h_start, h_end = i * patch_size, (i + 1) * patch_size
                w_start, w_end = j * patch_size, (j + 1) * patch_size
                # Extract patch [batch_size, channels, patch_size, patch_size]
                patch = x[:, :, h_start:h_end, w_start:w_end]
                # Flatten the patch [batch_size, channels*patch_size*patch_size]
                patch_flat = patch.reshape(batch_size, -1)
                patches.append(patch_flat)

        # Stack patches [batch_size, num_patches, channels*patch_size*patch_size]
        patches = np.stack(patches, axis=1)
        return patches

    def project_patches(self, patches):
        """Project patches to embedding dimension"""
        batch_size, num_patches, patch_dim = patches.shape

        # Reshape projection weight for matrix multiplication
        proj_weight_reshaped = self.proj_weight.reshape(self.embed_dim, -1)

        # Project patches [batch_size, num_patches, embed_dim]
        patch_embeddings = np.zeros((batch_size, num_patches, self.embed_dim))
        for b in range(batch_size):
            # [num_patches, patch_dim] @ [patch_dim, embed_dim] = [num_patches, embed_dim]
            patch_embeddings[b] = np.dot(patches[b], proj_weight_reshaped.T) + self.proj_bias
        return patch_embeddings
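
# Example (illustrative): a 224x224 RGB image with 16x16 patches yields (224/16)^2 = 196 patches
#   patcher = PatchEmbedding(img_size=(224, 224), patch_size=16, in_channels=3, embed_dim=192)
#   tokens = patcher.forward(np.random.randn(2, 3, 224, 224))   # -> (2, 196, 192)
# Each flattened 3*16*16 = 768-dim patch is linearly projected to the 192-dim embedding.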

class VisionTransformer:
    """
    Vision Transformer (ViT) model.
    """
    def __init__(
        self,
        img_size=(224, 224),
        patch_size=16,
        in_channels=3,
        num_classes=1000,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        dropout_rate=0.1,
        attn_dropout_rate=0.0
    ):
        """
        Args:
            img_size: Size of the input image (height, width)
            patch_size: Size of each patch
            in_channels: Number of input channels
            num_classes: Number of classes for classification
            embed_dim: Dimension of the token embeddings
            depth: Number of transformer blocks
            num_heads: Number of attention heads
            mlp_ratio: Ratio of MLP hidden dim to embedding dim
            dropout_rate: Dropout rate
            attn_dropout_rate: Attention dropout rate
        """
        self.img_size = img_size
        self.patch_size = patch_size
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.mlp_dim = int(embed_dim * mlp_ratio)
        self.dropout_rate = dropout_rate
        self.attn_dropout_rate = attn_dropout_rate

        # Patch embedding layer
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        num_patches = self.patch_embed.num_patches

        # Class token (learnable)
        self.cls_token = np.random.normal(0, 0.02, (1, 1, embed_dim))

        # Position embeddings (learnable); +1 for the class token
        self.pos_embedding = np.random.normal(0, 0.02, (1, num_patches + 1, embed_dim))

        # Transformer encoder layers
        self.encoder_layers = []
        for _ in range(depth):
            layer = EncoderLayer(
                d_model=embed_dim,
                num_heads=num_heads,
                d_ff=self.mlp_dim,
                dropout_rate=dropout_rate
            )
            self.encoder_layers.append(layer)

        # Layer norm
        self.norm = LayerNorm(embed_dim)

        # Classification head
        scale = np.sqrt(2.0 / (embed_dim + num_classes))
        self.head_weight = np.random.normal(0, scale, (embed_dim, num_classes))
        self.head_bias = np.zeros(num_classes)

    def forward(self, x):
        """
        Args:
            x: Input images [batch_size, in_channels, height, width]
        Returns:
            Classification logits [batch_size, num_classes]
        """
        batch_size = x.shape[0]

        # Create patch embeddings
        x = self.patch_embed.forward(x)  # [batch_size, num_patches, embed_dim]

        # Add class token to the beginning of the sequence
        cls_tokens = np.repeat(self.cls_token, batch_size, axis=0)  # [batch_size, 1, embed_dim]
        x = np.concatenate([cls_tokens, x], axis=1)  # [batch_size, 1 + num_patches, embed_dim]

        # Add position embeddings
        x = x + self.pos_embedding  # [batch_size, 1 + num_patches, embed_dim]

        # Apply dropout
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, x.shape)
            x = x * dropout_mask / (1 - self.dropout_rate)

        # Apply transformer encoder
        for layer in self.encoder_layers:
            x = layer.forward(x)

        # Layer normalization
        x = self.norm.forward(x)

        # Take the output of the class token for classification
        x = x[:, 0]  # [batch_size, embed_dim]

        # Classification head
        logits = np.dot(x, self.head_weight) + self.head_bias  # [batch_size, num_classes]
        return logits
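
# Note (illustrative): with the defaults above, a (B, 3, 224, 224) batch becomes 196 patch
# tokens plus one class token (197 tokens of size embed_dim); only the class token's final
# state is fed to the classification head, giving (B, num_classes) logits.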

def create_sample_image_batch(batch_size=1, img_size=(224, 224), channels=3):
    """Create a sample batch of images for testing"""
    return np.random.random((batch_size, channels, img_size[0], img_size[1]))


def test_vision_transformer():
    """Test the Vision Transformer implementation with a sample input"""
    # Create a small ViT model for testing
    model = VisionTransformer(
        img_size=(224, 224),
        patch_size=16,
        in_channels=3,
        num_classes=10,
        embed_dim=192,
        depth=4,
        num_heads=3,
        mlp_ratio=4.0,
        dropout_rate=0.1
    )

    # Create a sample batch of images
    batch_size = 2
    images = create_sample_image_batch(batch_size, (224, 224), 3)

    # Forward pass
    logits = model.forward(images)

    print(f"Input shape: {images.shape}")
    print(f"Output shape: {logits.shape}")
    print(f"Expected output shape: (batch_size, num_classes) = ({batch_size}, {model.num_classes})")

    return logits


if __name__ == "__main__":
    test_vision_transformer()
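
Running the file executes test_vision_transformer(), which builds a small 4-layer ViT (embed_dim=192, 3 heads, 16x16 patches) and pushes a random batch of two 224x224 RGB images through it; the printed shapes should be (2, 3, 224, 224) for the input and (2, 10) for the logits. Note that this is a forward-pass-only reference implementation: weights are randomly initialized, dropout is always applied, and no backward pass or training loop is included.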