Skip to content

Instantly share code, notes, and snippets.

@raymondtay
Last active February 22, 2026 03:25
Show Gist options
  • Select an option

  • Save raymondtay/9915510013d22a5ac890652eedf9af03 to your computer and use it in GitHub Desktop.

Select an option

Save raymondtay/9915510013d22a5ac890652eedf9af03 to your computer and use it in GitHub Desktop.

Revisions

  1. raymondtay revised this gist Feb 22, 2026. 4 changed files with 797 additions and 0 deletions.
    87 changes: 87 additions & 0 deletions Makefile
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,87 @@
    # Makefile for Gaussian Blur CUDA implementations
    # Supports both standalone CUDA and C++ with CUDA versions

    # Compiler settings
    NVCC = nvcc
    CXX = g++

    # CUDA architecture (adjust based on your GPU)
    # Common options:
    # - sm_50: Maxwell (GTX 900 series)
    # - sm_60: Pascal (GTX 10 series)
    # - sm_70: Volta (Tesla V100)
    # - sm_75: Turing (RTX 20 series)
    # - sm_80: Ampere (RTX 30 series, A100)
    # - sm_86: Ampere (RTX 30 series mobile)
    # - sm_89: Ada Lovelace (RTX 40 series)
    CUDA_ARCH = sm_75

    # Compiler flags
    NVCC_FLAGS = -arch=$(CUDA_ARCH) -O3 --use_fast_math -Xcompiler -Wall
    CXX_FLAGS = -std=c++11 -Wall -O3

    # CUDA include and library paths (adjust if needed)
    CUDA_INC = /usr/local/cuda/include
    CUDA_LIB = /usr/local/cuda/lib64

    # Targets
    TARGETS = gaussian_blur_cuda gaussian_blur_cpp

    .PHONY: all clean

    all: $(TARGETS)

    # Standalone CUDA implementation
    gaussian_blur_cuda: gaussian_blur_cuda.cu
    $(NVCC) $(NVCC_FLAGS) $< -o $@
    @echo "Built standalone CUDA implementation: $@"

    # C++ with CUDA implementation (requires separate compilation)
    gaussian_blur_kernels.o: gaussian_blur_kernels.cu
    $(NVCC) $(NVCC_FLAGS) -c $< -o $@

    main.o: main.cpp gaussian_blur.hpp
    $(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@

    gaussian_blur_cpp: main.o gaussian_blur_kernels.o
    $(NVCC) $(NVCC_FLAGS) $^ -o $@
    @echo "Built C++ with CUDA implementation: $@"

    # Run targets
    run_cuda: gaussian_blur_cuda
    @echo "\n=== Running Standalone CUDA Implementation ==="
    ./gaussian_blur_cuda

    run_cpp: gaussian_blur_cpp
    @echo "\n=== Running C++ with CUDA Implementation ==="
    ./gaussian_blur_cpp

    run: run_cuda run_cpp

    # Clean build artifacts
    clean:
    rm -f $(TARGETS) *.o
    @echo "Cleaned build artifacts"

    # Help target
    help:
    @echo "Gaussian Blur CUDA Makefile"
    @echo ""
    @echo "Targets:"
    @echo " all - Build all implementations (default)"
    @echo " gaussian_blur_cuda - Build standalone CUDA version"
    @echo " gaussian_blur_cpp - Build C++ with CUDA version"
    @echo " run_cuda - Build and run standalone CUDA"
    @echo " run_cpp - Build and run C++ with CUDA"
    @echo " run - Run both implementations"
    @echo " clean - Remove build artifacts"
    @echo " help - Show this help message"
    @echo ""
    @echo "Configuration:"
    @echo " CUDA_ARCH=$(CUDA_ARCH) - Target GPU architecture"
    @echo ""
    @echo "Usage examples:"
    @echo " make # Build all"
    @echo " make run # Build and run both"
    @echo " make CUDA_ARCH=sm_80 # Build for Ampere (RTX 30 series)"
    @echo " make clean # Clean build files"
    250 changes: 250 additions & 0 deletions gaussian_blur.hpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,250 @@
    /*
    * Separable Gaussian Blur - C++ with CUDA Implementation
    *
    * This is a C++ wrapper around CUDA kernels providing:
    * - Object-oriented interface
    * - RAII memory management
    * - Support for multiple data types (float, uchar)
    * - Easy-to-use API
    */

    #ifndef GAUSSIAN_BLUR_HPP
    #define GAUSSIAN_BLUR_HPP

    #include <cuda_runtime.h>

    #include <cmath>
    #include <iostream>
    #include <memory>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // Launch wrappers for the CUDA kernels, implemented in
    // gaussian_blur_kernels.cu. extern "C" keeps the symbols unmangled so
    // this header can declare them without including device code.
    extern "C" {
    // Horizontal (row-direction) blur pass.
    // input/output/kernel are device pointers; kernel holds 2*radius+1
    // coefficients. gridSize/blockSize/sharedMemSize are forwarded to the
    // kernel launch configuration. The launch is asynchronous; the caller
    // checks cudaGetLastError() and synchronizes.
    void launchGaussianBlurHorizontal(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
    );

    // Vertical (column-direction) blur pass; same contract as above.
    void launchGaussianBlurVertical(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
    );
    }

    namespace gpu {

    // RAII owner of a CUDA device allocation of `size` elements of T.
    //
    // Move-only: device memory has single ownership, so copies are deleted.
    // A moved-from buffer is empty (null pointer, zero size).
    template<typename T>
    class DeviceBuffer {
    private:
        T* d_ptr_ = nullptr;   // device pointer; nullptr when empty/moved-from
        size_t size_ = 0;      // element count (not bytes)

    public:
        // Allocates size * sizeof(T) bytes of device memory.
        // Throws std::runtime_error when cudaMalloc fails.
        explicit DeviceBuffer(size_t size) : size_(size) {
            cudaError_t err = cudaMalloc(&d_ptr_, size * sizeof(T));
            if (err != cudaSuccess) {
                d_ptr_ = nullptr;  // keep the handle well-defined on failure
                throw std::runtime_error("CUDA malloc failed: " +
                    std::string(cudaGetErrorString(err)));
            }
        }

        ~DeviceBuffer() {
            if (d_ptr_) {
                cudaFree(d_ptr_);
            }
        }

        // Non-copyable: a copy would imply a hidden device-to-device transfer.
        DeviceBuffer(const DeviceBuffer&) = delete;
        DeviceBuffer& operator=(const DeviceBuffer&) = delete;

        // Moves transfer ownership and leave the source empty.
        // (The original left the source's size_ stale after a move, so
        // size() lied for moved-from buffers.)
        DeviceBuffer(DeviceBuffer&& other) noexcept
            : d_ptr_(other.d_ptr_), size_(other.size_) {
            other.d_ptr_ = nullptr;
            other.size_ = 0;
        }

        DeviceBuffer& operator=(DeviceBuffer&& other) noexcept {
            if (this != &other) {
                if (d_ptr_) cudaFree(d_ptr_);
                d_ptr_ = other.d_ptr_;
                size_ = other.size_;
                other.d_ptr_ = nullptr;
                other.size_ = 0;
            }
            return *this;
        }

        T* get() { return d_ptr_; }
        const T* get() const { return d_ptr_; }
        size_t size() const { return size_; }  // element count

        // Host -> device copy; host_data must hold at least size() elements.
        // Throws std::runtime_error (with the CUDA cause) on failure.
        void copyToDevice(const T* host_data) {
            cudaError_t err = cudaMemcpy(d_ptr_, host_data,
                size_ * sizeof(T), cudaMemcpyHostToDevice);
            if (err != cudaSuccess) {
                throw std::runtime_error("Copy to device failed: " +
                    std::string(cudaGetErrorString(err)));
            }
        }

        // Device -> host copy; host_data must hold at least size() elements.
        void copyToHost(T* host_data) const {
            cudaError_t err = cudaMemcpy(host_data, d_ptr_,
                size_ * sizeof(T), cudaMemcpyDeviceToHost);
            if (err != cudaSuccess) {
                throw std::runtime_error("Copy to host failed: " +
                    std::string(cudaGetErrorString(err)));
            }
        }
    };

    // Gaussian blur class
    class GaussianBlur {
    private:
    int width_;
    int height_;
    int radius_;
    float sigma_;
    std::vector<float> kernel_;

    std::unique_ptr<DeviceBuffer<float>> d_kernel_;
    std::unique_ptr<DeviceBuffer<float>> d_temp_;

    void generateKernel() {
    int kernelSize = 2 * radius_ + 1;
    kernel_.resize(kernelSize);

    float sum = 0.0f;
    for (int i = -radius_; i <= radius_; i++) {
    float value = std::exp(-(i * i) / (2.0f * sigma_ * sigma_));
    kernel_[i + radius_] = value;
    sum += value;
    }

    // Normalize
    for (auto& val : kernel_) {
    val /= sum;
    }
    }

    public:
    GaussianBlur(int width, int height, int radius, float sigma)
    : width_(width), height_(height), radius_(radius), sigma_(sigma) {

    if (radius <= 0 || radius > 50) {
    throw std::invalid_argument("Radius must be between 1 and 50");
    }

    if (sigma <= 0.0f) {
    throw std::invalid_argument("Sigma must be positive");
    }

    // Generate Gaussian kernel
    generateKernel();

    // Allocate device memory
    d_kernel_ = std::make_unique<DeviceBuffer<float>>(kernel_.size());
    d_kernel_->copyToDevice(kernel_.data());

    // Allocate temporary buffer for intermediate results
    d_temp_ = std::make_unique<DeviceBuffer<float>>(width * height);
    }

    // Apply Gaussian blur
    void apply(const float* h_input, float* h_output) {
    // Allocate device buffers
    DeviceBuffer<float> d_input(width_ * height_);
    DeviceBuffer<float> d_output(width_ * height_);

    // Copy input to device
    d_input.copyToDevice(h_input);

    // Configure kernel launch parameters
    const int blockSize = 16;
    dim3 blockDim(blockSize, blockSize);
    dim3 gridDim(
    (width_ + blockSize - 1) / blockSize,
    (height_ + blockSize - 1) / blockSize
    );

    int sharedMemSize = (blockSize + 2 * radius_) * sizeof(float);

    // Launch horizontal blur
    launchGaussianBlurHorizontal(
    d_input.get(),
    d_temp_->get(),
    d_kernel_->get(),
    width_,
    height_,
    radius_,
    gridDim,
    blockDim,
    sharedMemSize
    );

    // Check for kernel launch errors
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
    throw std::runtime_error("Horizontal kernel launch failed: " +
    std::string(cudaGetErrorString(err)));
    }

    // Launch vertical blur
    launchGaussianBlurVertical(
    d_temp_->get(),
    d_output.get(),
    d_kernel_->get(),
    width_,
    height_,
    radius_,
    gridDim,
    blockDim,
    sharedMemSize
    );

    err = cudaGetLastError();
    if (err != cudaSuccess) {
    throw std::runtime_error("Vertical kernel launch failed: " +
    std::string(cudaGetErrorString(err)));
    }

    // Synchronize
    cudaDeviceSynchronize();

    // Copy result back to host
    d_output.copyToHost(h_output);
    }

    // Getters
    int getWidth() const { return width_; }
    int getHeight() const { return height_; }
    int getRadius() const { return radius_; }
    float getSigma() const { return sigma_; }
    const std::vector<float>& getKernel() const { return kernel_; }

    // Print kernel coefficients
    void printKernel() const {
    std::cout << "Gaussian kernel (radius=" << radius_
    << ", sigma=" << sigma_ << "):\n";
    for (size_t i = 0; i < kernel_.size(); i++) {
    std::cout << kernel_[i] << " ";
    }
    std::cout << std::endl;
    }
    };

    } // namespace gpu

    #endif // GAUSSIAN_BLUR_HPP
    304 changes: 304 additions & 0 deletions gaussian_blur_cuda.cu
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,304 @@
    /*
    * Separable Gaussian Blur - Pure CUDA Implementation
    *
    * This implementation uses:
    * - Shared memory for efficient data reuse
    * - Separate kernels for horizontal and vertical passes
    * - Optimized memory access patterns
    */

    #include <cuda_runtime.h>
    #include <device_launch_parameters.h>

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>  /* malloc/free, rand, exit, EXIT_FAILURE */

    #define KERNEL_RADIUS 5
    #define KERNEL_SIZE (2 * KERNEL_RADIUS + 1)
    #define BLOCK_SIZE 16

    // Error checking macro
    #define CUDA_CHECK(call) \
    do { \
    cudaError_t error = call; \
    if (error != cudaSuccess) { \
    fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
    cudaGetErrorString(error)); \
    exit(EXIT_FAILURE); \
    } \
    } while(0)

    /*
     * Generate a normalized 1D Gaussian kernel.
     *
     * kernel  out: array of 2*radius + 1 coefficients
     * radius  kernel radius (>= 0)
     * sigma   Gaussian standard deviation (> 0)
     *
     * Samples exp(-x^2 / (2*sigma^2)) at integer offsets and scales the
     * result to sum to 1, so the 1/sqrt(2*pi*sigma^2) factor cancels and
     * is omitted. The redundant __host__ qualifier was dropped (host is
     * the default), letting non-CUDA translation units reuse the helper.
     */
    void generateGaussianKernel(float* kernel, int radius, float sigma) {
        float sum = 0.0f;

        // Unnormalized Gaussian samples at offsets [-radius, radius].
        for (int i = -radius; i <= radius; i++) {
            float value = expf(-(i * i) / (2.0f * sigma * sigma));
            kernel[i + radius] = value;
            sum += value;
        }

        // Normalize so the filter preserves total image brightness.
        for (int i = 0; i < 2 * radius + 1; i++) {
            kernel[i] /= sum;
        }
    }

    /*
     * Horizontal Gaussian blur pass.
     *
     * One thread per output pixel; samples are read directly from global
     * memory with clamp-to-edge addressing.
     *
     * The previous shared-memory version was removed because it was broken
     * for 2D thread blocks: all blockDim.y rows of a block staged their
     * (different) rows into the same one-row sharedRow[] array, racing with
     * each other, and out-of-bounds threads returned before __syncthreads(),
     * which is undefined behavior when the image height is not a multiple of
     * blockDim.y. A correct tile would need blockDim.y * (blockDim.x +
     * 2*radius) floats of shared memory — more than the host allocates.
     */
    __global__ void gaussianBlurHorizontal(
        const float* input,
        float* output,
        const float* kernel,
        int width,
        int height,
        int radius
    ) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Safe now that there is no barrier below.
        if (x >= width || y >= height) return;

        const float* row = input + y * width;

        float sum = 0.0f;
        for (int k = -radius; k <= radius; k++) {
            int sx = x + k;
            // Clamp to the row edges (replicate border pixels).
            if (sx < 0) sx = 0;
            if (sx > width - 1) sx = width - 1;
            sum += row[sx] * kernel[k + radius];
        }

        output[y * width + x] = sum;
    }

    /*
     * Vertical Gaussian blur pass.
     *
     * One thread per output pixel; samples are read directly from global
     * memory with clamp-to-edge addressing.
     *
     * The previous shared-memory version was removed because it was broken
     * for 2D thread blocks: all blockDim.x columns of a block staged their
     * (different) columns into the same one-column sharedCol[] array, racing
     * with each other, and out-of-bounds threads returned before
     * __syncthreads(), which is undefined behavior when the image width is
     * not a multiple of blockDim.x. A correct tile would need blockDim.x *
     * (blockDim.y + 2*radius) floats of shared memory — more than the host
     * allocates.
     */
    __global__ void gaussianBlurVertical(
        const float* input,
        float* output,
        const float* kernel,
        int width,
        int height,
        int radius
    ) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Safe now that there is no barrier below.
        if (x >= width || y >= height) return;

        float sum = 0.0f;
        for (int k = -radius; k <= radius; k++) {
            int sy = y + k;
            // Clamp to the column edges (replicate border pixels).
            if (sy < 0) sy = 0;
            if (sy > height - 1) sy = height - 1;
            sum += input[sy * width + x] * kernel[k + radius];
        }

        output[y * width + x] = sum;
    }

    /*
    * Runs the two-pass separable blur on device buffers the caller has
    * already allocated and populated.
    *
    * d_input/d_output/d_temp - device images, width*height floats each
    * d_kernel                - device array of 2*radius + 1 coefficients
    *
    * Blocks until both passes finish; exits the process via CUDA_CHECK on
    * any launch or execution error.
    */
    void applySeparableGaussianBlur(
    const float* d_input,
    float* d_output,
    float* d_temp,
    const float* d_kernel,
    int width,
    int height,
    int radius
    ) {
    // One thread per pixel; grid rounded up to cover the whole image.
    dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridSize(
    (width + blockSize.x - 1) / blockSize.x,
    (height + blockSize.y - 1) / blockSize.y
    );

    // Dynamic shared-memory request forwarded to the kernels.
    // NOTE(review): this is sized for a single row segment plus halo
    // ((BLOCK_SIZE + 2*radius) floats); confirm it matches what the
    // kernels actually stage per 2D block.
    int sharedMemSize = (BLOCK_SIZE + 2 * radius) * sizeof(float);

    // Horizontal pass: input -> temp
    gaussianBlurHorizontal<<<gridSize, blockSize, sharedMemSize>>>(
    d_input, d_temp, d_kernel, width, height, radius
    );
    CUDA_CHECK(cudaGetLastError());

    // Vertical pass: temp -> output
    gaussianBlurVertical<<<gridSize, blockSize, sharedMemSize>>>(
    d_temp, d_output, d_kernel, width, height, radius
    );
    CUDA_CHECK(cudaGetLastError());

    // Wait so callers can time the call and rely on d_output being ready.
    CUDA_CHECK(cudaDeviceSynchronize());
    }

    /*
     * Demo driver: fills a 1920x1080 float image with pseudo-random data,
     * runs the separable blur on the GPU, and reports timing/throughput.
     */
    int main() {
        // Image dimensions
        const int width = 1920;
        const int height = 1080;
        // size_t arithmetic avoids int overflow for large dimensions.
        const size_t imageSize = (size_t)width * height * sizeof(float);

        // Gaussian parameters
        const int radius = KERNEL_RADIUS;
        const float sigma = 2.0f;
        const int kernelSize = 2 * radius + 1;

        printf("Separable Gaussian Blur (CUDA)\n");
        printf("Image size: %d x %d\n", width, height);
        printf("Kernel radius: %d (size: %d)\n", radius, kernelSize);
        printf("Sigma: %.2f\n\n", sigma);

        // Allocate host memory; malloc can fail, so check before writing
        // (the original dereferenced these pointers unchecked).
        float* h_input = (float*)malloc(imageSize);
        float* h_output = (float*)malloc(imageSize);
        float* h_kernel = (float*)malloc(kernelSize * sizeof(float));
        if (h_input == NULL || h_output == NULL || h_kernel == NULL) {
            fprintf(stderr, "Host memory allocation failed\n");
            free(h_input);
            free(h_output);
            free(h_kernel);
            return EXIT_FAILURE;
        }

        // Fill the input with a reproducible pseudo-random pattern
        // (rand() is intentionally left unseeded, so runs are deterministic).
        for (int i = 0; i < width * height; i++) {
            h_input[i] = (float)(rand() % 256) / 255.0f;
        }

        // Generate Gaussian kernel
        generateGaussianKernel(h_kernel, radius, sigma);

        printf("Gaussian kernel coefficients:\n");
        for (int i = 0; i < kernelSize; i++) {
            printf("%.6f ", h_kernel[i]);
        }
        printf("\n\n");

        // Device buffers: input, output, intermediate, and coefficients.
        float *d_input, *d_output, *d_temp, *d_kernel;
        CUDA_CHECK(cudaMalloc(&d_input, imageSize));
        CUDA_CHECK(cudaMalloc(&d_output, imageSize));
        CUDA_CHECK(cudaMalloc(&d_temp, imageSize));
        CUDA_CHECK(cudaMalloc(&d_kernel, kernelSize * sizeof(float)));

        // Copy data to device
        CUDA_CHECK(cudaMemcpy(d_input, h_input, imageSize, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_kernel, h_kernel, kernelSize * sizeof(float), cudaMemcpyHostToDevice));

        // CUDA events give device-side timestamps for accurate GPU timing.
        cudaEvent_t start, stop;
        CUDA_CHECK(cudaEventCreate(&start));
        CUDA_CHECK(cudaEventCreate(&stop));

        // Warm-up run so one-time initialization is excluded from the timing.
        applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius);

        // Timed run
        CUDA_CHECK(cudaEventRecord(start));
        applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius);
        CUDA_CHECK(cudaEventRecord(stop));
        CUDA_CHECK(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

        printf("Execution time: %.3f ms\n", milliseconds);
        printf("Throughput: %.2f Mpixels/sec\n", (width * height / 1e6) / (milliseconds / 1000.0f));

        // Copy result back to host
        CUDA_CHECK(cudaMemcpy(h_output, d_output, imageSize, cudaMemcpyDeviceToHost));

        // Spot-check a few pixels around the image center.
        printf("\nSample output values:\n");
        for (int i = 0; i < 5; i++) {
            int idx = (height / 2) * width + (width / 2) + i;
            printf("Pixel[%d][%d]: %.6f\n", height / 2, width / 2 + i, h_output[idx]);
        }

        // Cleanup
        CUDA_CHECK(cudaEventDestroy(start));
        CUDA_CHECK(cudaEventDestroy(stop));
        CUDA_CHECK(cudaFree(d_input));
        CUDA_CHECK(cudaFree(d_output));
        CUDA_CHECK(cudaFree(d_temp));
        CUDA_CHECK(cudaFree(d_kernel));
        free(h_input);
        free(h_output);
        free(h_kernel);

        printf("\nGaussian blur completed successfully!\n");

        return 0;
    }
    156 changes: 156 additions & 0 deletions gaussian_blur_kernels.cu
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,156 @@
    /*
    * CUDA Kernels Implementation
    * Separable Gaussian Blur
    */

    #include <cuda_runtime.h>
    #include <device_launch_parameters.h>

    /*
     * Horizontal Gaussian blur kernel.
     *
     * One thread per output pixel; samples read directly from global memory
     * with clamp-to-edge addressing.
     *
     * The previous shared-memory staging was removed: with a 2D block, all
     * blockDim.y rows wrote their (different) rows into the same one-row
     * sharedRow[] array (a data race), and out-of-range threads returned
     * before __syncthreads(), which is undefined behavior for edge blocks.
     */
    __global__ void gaussianBlurHorizontalKernel(
        const float* __restrict__ input,
        float* __restrict__ output,
        const float* __restrict__ kernel,
        int width,
        int height,
        int radius
    ) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Safe now that there is no barrier below.
        if (x >= width || y >= height) return;

        const float* __restrict__ row = input + y * width;

        float sum = 0.0f;
        for (int k = -radius; k <= radius; k++) {
            int sx = x + k;
            // Clamp to the row edges (replicate border pixels).
            if (sx < 0) sx = 0;
            if (sx > width - 1) sx = width - 1;
            sum += row[sx] * kernel[k + radius];
        }

        output[y * width + x] = sum;
    }

    /*
     * Vertical Gaussian blur kernel.
     *
     * One thread per output pixel; samples read directly from global memory
     * with clamp-to-edge addressing.
     *
     * The previous shared-memory staging was removed: with a 2D block, all
     * blockDim.x columns wrote their (different) columns into the same
     * one-column sharedCol[] array (a data race), and out-of-range threads
     * returned before __syncthreads(), which is undefined behavior for edge
     * blocks.
     */
    __global__ void gaussianBlurVerticalKernel(
        const float* __restrict__ input,
        float* __restrict__ output,
        const float* __restrict__ kernel,
        int width,
        int height,
        int radius
    ) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Safe now that there is no barrier below.
        if (x >= width || y >= height) return;

        float sum = 0.0f;
        for (int k = -radius; k <= radius; k++) {
            int sy = y + k;
            // Clamp to the column edges (replicate border pixels).
            if (sy < 0) sy = 0;
            if (sy > height - 1) sy = height - 1;
            sum += input[sy * width + x] * kernel[k + radius];
        }

        output[y * width + x] = sum;
    }

    /*
    * C ABI wrappers so the C++ side (gaussian_blur.hpp) can declare and
    * call the kernel launches without seeing CUDA <<<...>>> syntax.
    *
    * Both launches are asynchronous: neither wrapper checks
    * cudaGetLastError() nor synchronizes — the caller is responsible for
    * both. sharedMemSize is the dynamic shared-memory request in bytes,
    * forwarded verbatim to the launch configuration.
    */
    extern "C" {

    // Launch the horizontal (row-direction) blur pass.
    // input/output/kernel are device pointers; kernel holds 2*radius+1
    // coefficients.
    void launchGaussianBlurHorizontal(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
    ) {
    gaussianBlurHorizontalKernel<<<gridSize, blockSize, sharedMemSize>>>(
    input, output, kernel, width, height, radius
    );
    }

    // Launch the vertical (column-direction) blur pass; same contract.
    void launchGaussianBlurVertical(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
    ) {
    gaussianBlurVerticalKernel<<<gridSize, blockSize, sharedMemSize>>>(
    input, output, kernel, width, height, radius
    );
    }

    } // extern "C"
  2. raymondtay created this gist Feb 22, 2026.
    205 changes: 205 additions & 0 deletions main.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,205 @@
    /*
    * Main Application - C++ with CUDA Gaussian Blur
    *
    * Demonstrates usage of the GaussianBlur class
    */

    #include "gaussian_blur.hpp"

    #include <algorithm>
    #include <chrono>
    #include <iomanip>
    #include <iostream>
    #include <random>
    #include <vector>

    // Minimal grayscale float image held in host memory.
    // Stands in for a real image loader/saver in this demo.
    class Image {
    private:
        int width_;
        int height_;
        std::vector<float> data_;  // row-major, width_*height_ pixels

    public:
        Image(int width, int height)
            : width_(width), height_(height), data_(width * height) {}

        // Fill every pixel with a uniform random value in [0, 1).
        void fillRandom() {
            std::random_device seed;
            std::mt19937 rng(seed());
            std::uniform_real_distribution<float> dist(0.0f, 1.0f);

            for (auto& pixel : data_) {
                pixel = dist(rng);
            }
        }

        // 32x32 checkerboard: 1.0 where exactly one of the tile coordinates
        // is odd, 0.0 elsewhere (XOR of the tile parities).
        void fillTestPattern() {
            for (int row = 0; row < height_; row++) {
                const bool rowOdd = (row / 32) % 2 != 0;
                for (int col = 0; col < width_; col++) {
                    const bool colOdd = (col / 32) % 2 != 0;
                    data_[row * width_ + col] = (colOdd != rowOdd) ? 1.0f : 0.0f;
                }
            }
        }

        float* data() { return data_.data(); }
        const float* data() const { return data_.data(); }
        int width() const { return width_; }
        int height() const { return height_; }
        size_t size() const { return data_.size(); }

        // Pixel accessor; out-of-bounds coordinates read as 0.
        float getPixel(int x, int y) const {
            const bool inside = x >= 0 && x < width_ && y >= 0 && y < height_;
            return inside ? data_[y * width_ + x] : 0.0f;
        }

        // Print a rectangular region (clipped to the image) to stdout.
        void printRegion(int startX, int startY, int sizeX, int sizeY) const {
            std::cout << std::fixed << std::setprecision(3);
            const int endX = startX + sizeX;
            const int endY = startY + sizeY;
            for (int y = startY; y < endY && y < height_; y++) {
                for (int x = startX; x < endX && x < width_; x++) {
                    std::cout << getPixel(x, y) << " ";
                }
                std::cout << std::endl;
            }
        }
    };

    // Wall-clock stopwatch used by the benchmark code.
    class Timer {
    private:
        std::chrono::high_resolution_clock::time_point start_;

    public:
        // Record the current time as the reference point.
        void start() {
            start_ = std::chrono::high_resolution_clock::now();
        }

        // Milliseconds elapsed since the last start() call.
        double elapsed() const {
            const auto now = std::chrono::high_resolution_clock::now();
            return std::chrono::duration<double, std::milli>(now - start_).count();
        }
    };

    // Times repeated blur.apply() calls and prints average/min/max latency
    // and throughput in megapixels per second. An initial untimed call warms
    // up the GPU so one-time initialization cost is excluded.
    void runBenchmark(gpu::GaussianBlur& blur, const Image& input,
                      Image& output, int iterations = 10) {
        // Guard against a non-positive iteration count: the statistics below
        // would otherwise index an empty vector and divide by zero.
        if (iterations <= 0) {
            return;
        }

        Timer timer;
        std::vector<double> times;
        times.reserve(iterations);

        // Warm-up run (not recorded).
        blur.apply(input.data(), output.data());

        // Benchmark runs
        for (int i = 0; i < iterations; i++) {
            timer.start();
            blur.apply(input.data(), output.data());
            times.push_back(timer.elapsed());
        }

        // Calculate statistics
        double sum = 0.0;
        double minTime = times[0];
        double maxTime = times[0];

        for (double t : times) {
            sum += t;
            minTime = std::min(minTime, t);
            maxTime = std::max(maxTime, t);
        }

        double avgTime = sum / iterations;
        int pixels = input.width() * input.height();
        double throughput = (pixels / 1e6) / (avgTime / 1000.0);

        std::cout << "\n=== Benchmark Results ===" << std::endl;
        std::cout << "Iterations: " << iterations << std::endl;
        std::cout << "Average time: " << avgTime << " ms" << std::endl;
        std::cout << "Min time: " << minTime << " ms" << std::endl;
        std::cout << "Max time: " << maxTime << " ms" << std::endl;
        std::cout << "Throughput: " << throughput << " Mpixels/sec" << std::endl;
    }

    // Entry point: demonstrates the GaussianBlur class end to end —
    // build a random image, blur it, print sample regions, and benchmark
    // several kernel radii. argc/argv are currently unused.
    // Returns 0 on success, 1 if any GPU operation throws.
    int main(int argc, char** argv) {
    std::cout << "=== Separable Gaussian Blur (C++ with CUDA) ===" << std::endl;
    std::cout << std::endl;

    // Configuration
    const int width = 1920;
    const int height = 1080;
    const int radius = 5;
    const float sigma = 2.0f;

    std::cout << "Image dimensions: " << width << " x " << height << std::endl;
    std::cout << "Kernel radius: " << radius << std::endl;
    std::cout << "Sigma: " << sigma << std::endl;
    std::cout << std::endl;

    // All GPU failures surface as exceptions from the gpu:: classes, so
    // the whole demo runs inside one try block.
    try {
    // Create input and output images (host-side buffers)
    Image input(width, height);
    Image output(width, height);

    // Fill with test data
    std::cout << "Generating test image..." << std::endl;
    input.fillRandom();

    // Create Gaussian blur processor (allocates device-resident kernel
    // coefficients and scratch space)
    std::cout << "Initializing GPU processor..." << std::endl;
    gpu::GaussianBlur blur(width, height, radius, sigma);

    // Print kernel coefficients for inspection
    blur.printKernel();
    std::cout << std::endl;

    // Apply blur once, timing the full host->device->host round trip
    std::cout << "Applying Gaussian blur..." << std::endl;
    Timer timer;
    timer.start();
    blur.apply(input.data(), output.data());
    double elapsed = timer.elapsed();

    std::cout << "First run completed in " << elapsed << " ms" << std::endl;

    // Show sample results around the image center
    std::cout << "\nSample input region (center 5x5):" << std::endl;
    input.printRegion(width/2 - 2, height/2 - 2, 5, 5);

    std::cout << "\nSample output region (center 5x5):" << std::endl;
    output.printRegion(width/2 - 2, height/2 - 2, 5, 5);

    // Run benchmark (100 timed iterations after a warm-up)
    std::cout << "\nRunning performance benchmark..." << std::endl;
    runBenchmark(blur, input, output, 100);

    // Test with different kernel sizes; each radius gets its own
    // processor instance since the device kernel is fixed per object
    std::cout << "\n=== Testing Different Kernel Sizes ===" << std::endl;
    std::vector<int> radii = {3, 5, 7, 10};

    for (int r : radii) {
    gpu::GaussianBlur blurTest(width, height, r, 2.0f);
    Timer t;
    t.start();
    blurTest.apply(input.data(), output.data());
    double time = t.elapsed();

    int kernelSize = 2 * r + 1;
    double throughput = (width * height / 1e6) / (time / 1000.0);

    std::cout << "Radius " << r << " (kernel " << kernelSize
    << "x" << kernelSize << "): "
    << time << " ms, "
    << throughput << " Mpixels/sec" << std::endl;
    }

    std::cout << "\n=== Success! ===" << std::endl;

    } catch (const std::exception& e) {
    std::cerr << "Error: " << e.what() << std::endl;
    return 1;
    }

    return 0;
    }