Vision Transformer All in One [Numpy]
import numpy as np


def get_len_mask(batch_size, seq_len):
    """Create attention mask (simplified for ViT where all patches are attended to)"""
    # In ViT, typically all patches are attended to (no padding)
    return np.zeros((batch_size, seq_len, seq_len), dtype=bool)
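
# Note: get_len_mask is provided for completeness; VisionTransformer.forward
# below never passes a mask, since every patch attends to every other patch.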


def pos_sinusoid_embedding(seq_len, d_model):
    """Generate positional encoding"""
    embeddings = np.zeros((seq_len, d_model))
    for i in range(d_model):
        if i % 2 == 0:
            embeddings[:, i] = np.sin(np.arange(0, seq_len) / np.power(10000, 2 * (i // 2) / d_model))
        else:
            embeddings[:, i] = np.cos(np.arange(0, seq_len) / np.power(10000, 2 * (i // 2) / d_model))
    return embeddings.astype(np.float32)
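
# pos_sinusoid_embedding implements the classic Transformer encoding:
#   PE[pos, 2k]   = sin(pos / 10000^(2k / d_model))
#   PE[pos, 2k+1] = cos(pos / 10000^(2k / d_model))
# It is unused below: following the ViT paper, VisionTransformer learns its
# position embeddings (self.pos_embedding) rather than fixing them.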


class LayerNorm:
    def __init__(self, dim, eps=1e-6):
        self.gamma = np.ones(dim)
        self.beta = np.zeros(dim)
        self.eps = eps

    def forward(self, x):
        mean = np.mean(x, axis=-1, keepdims=True)
        var = np.var(x, axis=-1, keepdims=True)
        x_norm = (x - mean) / np.sqrt(var + self.eps)
        return self.gamma * x_norm + self.beta


class MultiHeadAttention:
    def __init__(self, d_model, num_heads, dropout_rate=0.0):
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.dropout_rate = dropout_rate
        # Linear projections with Xavier/Glorot initialization
        # (fan_in = fan_out = d_model for each d_model x d_model matrix)
        scale = np.sqrt(2.0 / (d_model + d_model))
        self.W_Q = np.random.normal(0, scale, (d_model, d_model))
        self.W_K = np.random.normal(0, scale, (d_model, d_model))
        self.W_V = np.random.normal(0, scale, (d_model, d_model))
        self.W_out = np.random.normal(0, scale, (d_model, d_model))
        # Biases
        self.b_Q = np.zeros(d_model)
        self.b_K = np.zeros(d_model)
        self.b_V = np.zeros(d_model)
        self.b_out = np.zeros(d_model)

    def split_heads(self, x):
        """Split the last dimension into (num_heads, head_dim)"""
        batch_size, seq_len = x.shape[0], x.shape[1]
        x = x.reshape(batch_size, seq_len, self.num_heads, self.head_dim)
        return x

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.shape[0]
        q_len, k_len, v_len = Q.shape[1], K.shape[1], V.shape[1]
        # Linear projections
        Q_proj = np.dot(Q, self.W_Q) + self.b_Q  # (batch_size, q_len, d_model)
        K_proj = np.dot(K, self.W_K) + self.b_K  # (batch_size, k_len, d_model)
        V_proj = np.dot(V, self.W_V) + self.b_V  # (batch_size, v_len, d_model)
        # Split heads
        Q_proj = self.split_heads(Q_proj)  # (batch_size, q_len, num_heads, head_dim)
        K_proj = self.split_heads(K_proj)  # (batch_size, k_len, num_heads, head_dim)
        V_proj = self.split_heads(V_proj)  # (batch_size, v_len, num_heads, head_dim)
        # Transpose for batch matrix multiplication
        Q_proj = np.transpose(Q_proj, (0, 2, 1, 3))  # (batch_size, num_heads, q_len, head_dim)
        K_proj = np.transpose(K_proj, (0, 2, 1, 3))  # (batch_size, num_heads, k_len, head_dim)
        V_proj = np.transpose(V_proj, (0, 2, 1, 3))  # (batch_size, num_heads, v_len, head_dim)
        # Calculate attention scores
        scores = np.matmul(Q_proj, np.transpose(K_proj, (0, 1, 3, 2))) / np.sqrt(self.head_dim)
        # (batch_size, num_heads, q_len, k_len)
        # Apply mask if provided (broadcast over the heads axis)
        if mask is not None:
            scores = np.where(mask[:, np.newaxis], -1e9, scores)
        # Apply softmax
        attention_weights = self._softmax(scores)
        # Apply dropout (simplified for NumPy)
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, attention_weights.shape)
            attention_weights = attention_weights * dropout_mask / (1 - self.dropout_rate)
        # Calculate context vectors
        context = np.matmul(attention_weights, V_proj)  # (batch_size, num_heads, q_len, head_dim)
        # Reshape back
        context = np.transpose(context, (0, 2, 1, 3))  # (batch_size, q_len, num_heads, head_dim)
        context = context.reshape(batch_size, q_len, self.d_model)
        # Final linear layer
        output = np.dot(context, self.W_out) + self.b_out
        return output

    def _softmax(self, x):
        """Compute softmax along the last dimension"""
        # Shift for numerical stability
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e_x / np.sum(e_x, axis=-1, keepdims=True)
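
# Shape walk-through for one attention call, using the test configuration at
# the bottom of this file (embed_dim=192, num_heads=3, so head_dim=64):
#   Q, K, V:                 (2, 197, 192)   # 196 patches + 1 class token
#   after split + transpose: (2, 3, 197, 64)
#   scores / weights:        (2, 3, 197, 197)
#   context -> output:       (2, 197, 192)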


class PositionwiseFeedForward:
    def __init__(self, d_model, d_ff, dropout_rate=0.0):
        self.d_model = d_model
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate
        # Weights for the two layers
        scale1 = np.sqrt(2.0 / (d_model + d_ff))
        scale2 = np.sqrt(2.0 / (d_ff + d_model))
        self.W1 = np.random.normal(0, scale1, (d_model, d_ff))
        self.b1 = np.zeros(d_ff)
        self.W2 = np.random.normal(0, scale2, (d_ff, d_model))
        self.b2 = np.zeros(d_model)

    def forward(self, x):
        # First dense layer with GELU activation
        hidden = np.dot(x, self.W1) + self.b1
        hidden = self.gelu(hidden)
        # Second dense layer
        output = np.dot(hidden, self.W2) + self.b2
        # Dropout
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, output.shape)
            output = output * dropout_mask / (1 - self.dropout_rate)
        return output

    def gelu(self, x):
        """Gaussian Error Linear Unit activation"""
        return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))
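
# The gelu above is the tanh approximation introduced by Hendrycks & Gimpel
# (2016) and used by BERT and many Transformer implementations; the exact
# definition, 0.5 * x * (1 + erf(x / sqrt(2))), agrees with it closely for
# typical inputs.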


class EncoderLayer:
    def __init__(self, d_model, num_heads, d_ff, dropout_rate=0.1):
        # Multi-head attention
        self.mha = MultiHeadAttention(d_model, num_heads, dropout_rate)
        # Position-wise feed-forward network
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout_rate)
        # Layer normalization
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        # Dropout
        self.dropout_rate = dropout_rate

    def forward(self, x, mask=None):
        # Pre-LayerNorm architecture
        # Multi-head attention block
        norm_x = self.norm1.forward(x)
        attention_output = self.mha.forward(norm_x, norm_x, norm_x, mask)
        # Apply dropout and residual connection
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, attention_output.shape)
            attention_output = attention_output * dropout_mask / (1 - self.dropout_rate)
        x = x + attention_output
        # Feed-forward block
        norm_x = self.norm2.forward(x)
        ffn_output = self.ffn.forward(norm_x)
        # Apply dropout and residual connection
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, ffn_output.shape)
            ffn_output = ffn_output * dropout_mask / (1 - self.dropout_rate)
        x = x + ffn_output
        return x
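
# Each encoder layer computes the pre-LN residual update
#   x = x + Dropout(MHA(LN(x)))
#   x = x + Dropout(FFN(LN(x)))
# i.e. LayerNorm is applied before each sublayer and the residual added after.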


class PatchEmbedding:
    """
    Convert images into patches and project to embedding dimension.
    This is the first layer of Vision Transformer.
    """

    def __init__(self, img_size, patch_size, in_channels, embed_dim):
        """
        Args:
            img_size: Size of the input image (H, W)
            patch_size: Size of each patch (P, P)
            in_channels: Number of input channels (C)
            embed_dim: Dimension of the token embeddings (D)
        """
        self.img_size = img_size
        self.patch_size = patch_size
        self.in_channels = in_channels
        self.embed_dim = embed_dim
        # Number of patches
        self.num_patches = (img_size[0] // patch_size) * (img_size[1] // patch_size)
        # Projection layer: transforms patches into embedding dimension
        # Equivalent to a Conv2D with kernel size = patch size and stride = patch size
        scale = np.sqrt(2.0 / (patch_size * patch_size * in_channels + embed_dim))
        self.proj_weight = np.random.normal(0, scale, (embed_dim, in_channels, patch_size, patch_size))
        self.proj_bias = np.zeros(embed_dim)

    def forward(self, x):
        """
        Args:
            x: Input images [batch_size, in_channels, height, width]
        Returns:
            Patch embeddings [batch_size, num_patches, embed_dim]
        """
        batch_size, _, height, width = x.shape
        # Check image size
        assert height == self.img_size[0] and width == self.img_size[1], \
            f"Input image size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})"
        # Extract patches from image
        patches = self.extract_patches(x)  # [batch_size, num_patches, in_channels*patch_size*patch_size]
        # Project patches to embedding dimension
        patch_embeddings = self.project_patches(patches)  # [batch_size, num_patches, embed_dim]
        return patch_embeddings

    def extract_patches(self, x):
        """Extract patches from the input images"""
        batch_size, channels, height, width = x.shape
        patch_size = self.patch_size
        # Calculate the number of patches in each dimension
        num_patches_h = height // patch_size
        num_patches_w = width // patch_size
        # Extract patches
        patches = []
        for i in range(num_patches_h):
            for j in range(num_patches_w):
                h_start, h_end = i * patch_size, (i + 1) * patch_size
                w_start, w_end = j * patch_size, (j + 1) * patch_size
                # Extract patch [batch_size, channels, patch_size, patch_size]
                patch = x[:, :, h_start:h_end, w_start:w_end]
                # Flatten the patch [batch_size, channels*patch_size*patch_size]
                patch_flat = patch.reshape(batch_size, -1)
                patches.append(patch_flat)
        # Stack patches [batch_size, num_patches, channels*patch_size*patch_size]
        patches = np.stack(patches, axis=1)
        return patches
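
    # A loop-free alternative (an illustrative sketch, not part of the original
    # gist; the method name is ours): one reshape/transpose pair reproduces the
    # same row-major patch order and channel-major flattening, assuming height
    # and width are exact multiples of patch_size.
    def extract_patches_vectorized(self, x):
        """Vectorized equivalent of extract_patches."""
        b, c, h, w = x.shape
        p = self.patch_size
        # (b, c, h//p, p, w//p, p) -> (b, h//p, w//p, c, p, p)
        x = x.reshape(b, c, h // p, p, w // p, p).transpose(0, 2, 4, 1, 3, 5)
        # Flatten each (c, p, p) patch, matching patch.reshape(batch_size, -1) above
        return x.reshape(b, (h // p) * (w // p), c * p * p)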

    def project_patches(self, patches):
        """Project patches to embedding dimension"""
        batch_size, num_patches, patch_dim = patches.shape
        # Reshape projection weight for matrix multiplication
        proj_weight_reshaped = self.proj_weight.reshape(self.embed_dim, -1)
        # Project patches [batch_size, num_patches, embed_dim]
        patch_embeddings = np.zeros((batch_size, num_patches, self.embed_dim))
        for b in range(batch_size):
            # [num_patches, patch_dim] @ [patch_dim, embed_dim] = [num_patches, embed_dim]
            patch_embeddings[b] = np.dot(patches[b], proj_weight_reshaped.T) + self.proj_bias
        return patch_embeddings
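
    # Note: the per-sample loop above could be replaced by the single call
    #   np.dot(patches, proj_weight_reshaped.T) + self.proj_bias
    # because np.dot of a 3-D array with a 2-D array contracts the last axis
    # of the first with the first axis of the second, giving
    # [batch_size, num_patches, embed_dim] directly.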


class VisionTransformer:
    """
    Vision Transformer (ViT) model.
    """

    def __init__(
        self,
        img_size=(224, 224),
        patch_size=16,
        in_channels=3,
        num_classes=1000,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        dropout_rate=0.1,
        attn_dropout_rate=0.0
    ):
        """
        Args:
            img_size: Size of the input image (height, width)
            patch_size: Size of each patch
            in_channels: Number of input channels
            num_classes: Number of classes for classification
            embed_dim: Dimension of the token embeddings
            depth: Number of transformer blocks
            num_heads: Number of attention heads
            mlp_ratio: Ratio of mlp hidden dim to embedding dim
            dropout_rate: Dropout rate
            attn_dropout_rate: Attention dropout rate
        """
        self.img_size = img_size
        self.patch_size = patch_size
        self.in_channels = in_channels
        self.num_classes = num_classes
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.mlp_dim = int(embed_dim * mlp_ratio)
        self.dropout_rate = dropout_rate
        self.attn_dropout_rate = attn_dropout_rate
        # Patch embedding layer
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        num_patches = self.patch_embed.num_patches
        # Class token (learnable)
        self.cls_token = np.random.normal(0, 0.02, (1, 1, embed_dim))
        # Position embeddings (learnable); +1 for the class token
        self.pos_embedding = np.random.normal(0, 0.02, (1, num_patches + 1, embed_dim))
        # Transformer encoder layers
        self.encoder_layers = []
        for _ in range(depth):
            layer = EncoderLayer(
                d_model=embed_dim,
                num_heads=num_heads,
                d_ff=self.mlp_dim,
                dropout_rate=dropout_rate
            )
            self.encoder_layers.append(layer)
        # Layer norm
        self.norm = LayerNorm(embed_dim)
        # Classification head
        scale = np.sqrt(2.0 / (embed_dim + num_classes))
        self.head_weight = np.random.normal(0, scale, (embed_dim, num_classes))
        self.head_bias = np.zeros(num_classes)

    def forward(self, x):
        """
        Args:
            x: Input images [batch_size, in_channels, height, width]
        Returns:
            Classification logits [batch_size, num_classes]
        """
        batch_size = x.shape[0]
        # Create patch embeddings
        x = self.patch_embed.forward(x)  # [batch_size, num_patches, embed_dim]
        # Add class token to beginning of sequence
        cls_tokens = np.repeat(self.cls_token, batch_size, axis=0)  # [batch_size, 1, embed_dim]
        x = np.concatenate([cls_tokens, x], axis=1)  # [batch_size, 1 + num_patches, embed_dim]
        # Add position embeddings
        x = x + self.pos_embedding  # [batch_size, 1 + num_patches, embed_dim]
        # Apply dropout
        if self.dropout_rate > 0:
            dropout_mask = np.random.binomial(1, 1 - self.dropout_rate, x.shape)
            x = x * dropout_mask / (1 - self.dropout_rate)
        # Apply transformer encoder
        for layer in self.encoder_layers:
            x = layer.forward(x)
        # Layer normalization
        x = self.norm.forward(x)
        # Take the output of the class token for classification
        x = x[:, 0]  # [batch_size, embed_dim]
        # Classification head
        logits = np.dot(x, self.head_weight) + self.head_bias  # [batch_size, num_classes]
        return logits
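
# End-to-end flow: image -> patch embeddings -> prepend class token -> add
# learned position embeddings -> `depth` pre-LN encoder layers -> final
# LayerNorm -> class-token slice -> linear head -> logits. Dropout is always
# active here; a full implementation would disable it at inference time
# (this file has no train/eval switch).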


def create_sample_image_batch(batch_size=1, img_size=(224, 224), channels=3):
    """Create a sample batch of images for testing"""
    return np.random.random((batch_size, channels, img_size[0], img_size[1]))


def test_vision_transformer():
    """Test the Vision Transformer implementation with a sample input"""
    # Create a small ViT model for testing
    model = VisionTransformer(
        img_size=(224, 224),
        patch_size=16,
        in_channels=3,
        num_classes=10,
        embed_dim=192,
        depth=4,
        num_heads=3,
        mlp_ratio=4.0,
        dropout_rate=0.1
    )
    # Create a sample batch of images
    batch_size = 2
    images = create_sample_image_batch(batch_size, (224, 224), 3)
    # Forward pass
    logits = model.forward(images)
    print(f"Input shape: {images.shape}")
    print(f"Output shape: {logits.shape}")
    print(f"Expected output shape: (batch_size, num_classes) = ({batch_size}, {model.num_classes})")
    return logits


if __name__ == "__main__":
    test_vision_transformer()
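
# Running this file should print (shapes only; logit values vary with the RNG):
#   Input shape: (2, 3, 224, 224)
#   Output shape: (2, 10)
#   Expected output shape: (batch_size, num_classes) = (2, 10)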