Skip to content

Instantly share code, notes, and snippets.

@raymondtay
Last active February 22, 2026 03:25
Show Gist options
  • Select an option

  • Save raymondtay/9915510013d22a5ac890652eedf9af03 to your computer and use it in GitHub Desktop.

Select an option

Save raymondtay/9915510013d22a5ac890652eedf9af03 to your computer and use it in GitHub Desktop.

Revisions

  1. raymondtay revised this gist Feb 22, 2026. 4 changed files with 797 additions and 0 deletions.
    87 changes: 87 additions & 0 deletions Makefile
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,87 @@
    # Makefile for Gaussian Blur CUDA implementations
    # Supports both standalone CUDA and C++ with CUDA versions

    # Compiler settings
    NVCC = nvcc
    CXX = g++

    # CUDA architecture (adjust based on your GPU)
    # Common options:
    # - sm_50: Maxwell (GTX 900 series)
    # - sm_60: Pascal (GTX 10 series)
    # - sm_70: Volta (Tesla V100)
    # - sm_75: Turing (RTX 20 series)
    # - sm_80: Ampere (RTX 30 series, A100)
    # - sm_86: Ampere (RTX 30 series mobile)
    # - sm_89: Ada Lovelace (RTX 40 series)
    CUDA_ARCH = sm_75

    # Compiler flags
    NVCC_FLAGS = -arch=$(CUDA_ARCH) -O3 --use_fast_math -Xcompiler -Wall
    CXX_FLAGS = -std=c++11 -Wall -O3

    # CUDA include and library paths (adjust if needed)
    CUDA_INC = /usr/local/cuda/include
    CUDA_LIB = /usr/local/cuda/lib64

    # Targets
    TARGETS = gaussian_blur_cuda gaussian_blur_cpp

    .PHONY: all clean

    all: $(TARGETS)

    # Standalone CUDA implementation
    gaussian_blur_cuda: gaussian_blur_cuda.cu
    $(NVCC) $(NVCC_FLAGS) $< -o $@
    @echo "Built standalone CUDA implementation: $@"

    # C++ with CUDA implementation (requires separate compilation)
    gaussian_blur_kernels.o: gaussian_blur_kernels.cu
    $(NVCC) $(NVCC_FLAGS) -c $< -o $@

    main.o: main.cpp gaussian_blur.hpp
    $(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@

    gaussian_blur_cpp: main.o gaussian_blur_kernels.o
    $(NVCC) $(NVCC_FLAGS) $^ -o $@
    @echo "Built C++ with CUDA implementation: $@"

    # Run targets
    run_cuda: gaussian_blur_cuda
    @echo "\n=== Running Standalone CUDA Implementation ==="
    ./gaussian_blur_cuda

    run_cpp: gaussian_blur_cpp
    @echo "\n=== Running C++ with CUDA Implementation ==="
    ./gaussian_blur_cpp

    run: run_cuda run_cpp

    # Clean build artifacts
    clean:
    rm -f $(TARGETS) *.o
    @echo "Cleaned build artifacts"

    # Help target
    help:
    @echo "Gaussian Blur CUDA Makefile"
    @echo ""
    @echo "Targets:"
    @echo " all - Build all implementations (default)"
    @echo " gaussian_blur_cuda - Build standalone CUDA version"
    @echo " gaussian_blur_cpp - Build C++ with CUDA version"
    @echo " run_cuda - Build and run standalone CUDA"
    @echo " run_cpp - Build and run C++ with CUDA"
    @echo " run - Run both implementations"
    @echo " clean - Remove build artifacts"
    @echo " help - Show this help message"
    @echo ""
    @echo "Configuration:"
    @echo " CUDA_ARCH=$(CUDA_ARCH) - Target GPU architecture"
    @echo ""
    @echo "Usage examples:"
    @echo " make # Build all"
    @echo " make run # Build and run both"
    @echo " make CUDA_ARCH=sm_80 # Build for Ampere (RTX 30 series)"
    @echo " make clean # Clean build files"
    250 changes: 250 additions & 0 deletions gaussian_blur.hpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,250 @@
    /*
    * Separable Gaussian Blur - C++ with CUDA Implementation
    *
    * This is a C++ wrapper around CUDA kernels providing:
    * - Object-oriented interface
    * - RAII memory management
    * - Support for multiple data types (float, uchar)
    * - Easy-to-use API
    */

    #ifndef GAUSSIAN_BLUR_HPP
    #define GAUSSIAN_BLUR_HPP

    #include <cuda_runtime.h>

    #include <cmath>
    #include <iostream>
    #include <memory>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // Launch wrappers for the CUDA kernels, implemented in
    // gaussian_blur_kernels.cu. extern "C" keeps the symbols unmangled so
    // this header can declare them without including device code.
    extern "C" {
    // Horizontal (row-direction) blur pass.
    // input/output/kernel are device pointers; kernel holds 2*radius+1
    // coefficients. gridSize/blockSize/sharedMemSize are forwarded to the
    // kernel launch configuration. The launch is asynchronous; the caller
    // checks cudaGetLastError() and synchronizes.
    void launchGaussianBlurHorizontal(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
    );

    // Vertical (column-direction) blur pass; same contract as above.
    void launchGaussianBlurVertical(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
    );
    }

    namespace gpu {

    // RAII owner of a CUDA device allocation of `size` elements of T.
    //
    // Move-only: device memory has single ownership, so copies are deleted.
    // A moved-from buffer is empty (null pointer, zero size).
    template<typename T>
    class DeviceBuffer {
    private:
        T* d_ptr_ = nullptr;   // device pointer; nullptr when empty/moved-from
        size_t size_ = 0;      // element count (not bytes)

    public:
        // Allocates size * sizeof(T) bytes of device memory.
        // Throws std::runtime_error when cudaMalloc fails.
        explicit DeviceBuffer(size_t size) : size_(size) {
            cudaError_t err = cudaMalloc(&d_ptr_, size * sizeof(T));
            if (err != cudaSuccess) {
                d_ptr_ = nullptr;  // keep the handle well-defined on failure
                throw std::runtime_error("CUDA malloc failed: " +
                    std::string(cudaGetErrorString(err)));
            }
        }

        ~DeviceBuffer() {
            if (d_ptr_) {
                cudaFree(d_ptr_);
            }
        }

        // Non-copyable: a copy would imply a hidden device-to-device transfer.
        DeviceBuffer(const DeviceBuffer&) = delete;
        DeviceBuffer& operator=(const DeviceBuffer&) = delete;

        // Moves transfer ownership and leave the source empty.
        // (The original left the source's size_ stale after a move, so
        // size() lied for moved-from buffers.)
        DeviceBuffer(DeviceBuffer&& other) noexcept
            : d_ptr_(other.d_ptr_), size_(other.size_) {
            other.d_ptr_ = nullptr;
            other.size_ = 0;
        }

        DeviceBuffer& operator=(DeviceBuffer&& other) noexcept {
            if (this != &other) {
                if (d_ptr_) cudaFree(d_ptr_);
                d_ptr_ = other.d_ptr_;
                size_ = other.size_;
                other.d_ptr_ = nullptr;
                other.size_ = 0;
            }
            return *this;
        }

        T* get() { return d_ptr_; }
        const T* get() const { return d_ptr_; }
        size_t size() const { return size_; }  // element count

        // Host -> device copy; host_data must hold at least size() elements.
        // Throws std::runtime_error (with the CUDA cause) on failure.
        void copyToDevice(const T* host_data) {
            cudaError_t err = cudaMemcpy(d_ptr_, host_data,
                size_ * sizeof(T), cudaMemcpyHostToDevice);
            if (err != cudaSuccess) {
                throw std::runtime_error("Copy to device failed: " +
                    std::string(cudaGetErrorString(err)));
            }
        }

        // Device -> host copy; host_data must hold at least size() elements.
        void copyToHost(T* host_data) const {
            cudaError_t err = cudaMemcpy(host_data, d_ptr_,
                size_ * sizeof(T), cudaMemcpyDeviceToHost);
            if (err != cudaSuccess) {
                throw std::runtime_error("Copy to host failed: " +
                    std::string(cudaGetErrorString(err)));
            }
        }
    };

    // Gaussian blur class
    class GaussianBlur {
    private:
    int width_;
    int height_;
    int radius_;
    float sigma_;
    std::vector<float> kernel_;

    std::unique_ptr<DeviceBuffer<float>> d_kernel_;
    std::unique_ptr<DeviceBuffer<float>> d_temp_;

    void generateKernel() {
    int kernelSize = 2 * radius_ + 1;
    kernel_.resize(kernelSize);

    float sum = 0.0f;
    for (int i = -radius_; i <= radius_; i++) {
    float value = std::exp(-(i * i) / (2.0f * sigma_ * sigma_));
    kernel_[i + radius_] = value;
    sum += value;
    }

    // Normalize
    for (auto& val : kernel_) {
    val /= sum;
    }
    }

    public:
    GaussianBlur(int width, int height, int radius, float sigma)
    : width_(width), height_(height), radius_(radius), sigma_(sigma) {

    if (radius <= 0 || radius > 50) {
    throw std::invalid_argument("Radius must be between 1 and 50");
    }

    if (sigma <= 0.0f) {
    throw std::invalid_argument("Sigma must be positive");
    }

    // Generate Gaussian kernel
    generateKernel();

    // Allocate device memory
    d_kernel_ = std::make_unique<DeviceBuffer<float>>(kernel_.size());
    d_kernel_->copyToDevice(kernel_.data());

    // Allocate temporary buffer for intermediate results
    d_temp_ = std::make_unique<DeviceBuffer<float>>(width * height);
    }

    // Apply Gaussian blur
    void apply(const float* h_input, float* h_output) {
    // Allocate device buffers
    DeviceBuffer<float> d_input(width_ * height_);
    DeviceBuffer<float> d_output(width_ * height_);

    // Copy input to device
    d_input.copyToDevice(h_input);

    // Configure kernel launch parameters
    const int blockSize = 16;
    dim3 blockDim(blockSize, blockSize);
    dim3 gridDim(
    (width_ + blockSize - 1) / blockSize,
    (height_ + blockSize - 1) / blockSize
    );

    int sharedMemSize = (blockSize + 2 * radius_) * sizeof(float);

    // Launch horizontal blur
    launchGaussianBlurHorizontal(
    d_input.get(),
    d_temp_->get(),
    d_kernel_->get(),
    width_,
    height_,
    radius_,
    gridDim,
    blockDim,
    sharedMemSize
    );

    // Check for kernel launch errors
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
    throw std::runtime_error("Horizontal kernel launch failed: " +
    std::string(cudaGetErrorString(err)));
    }

    // Launch vertical blur
    launchGaussianBlurVertical(
    d_temp_->get(),
    d_output.get(),
    d_kernel_->get(),
    width_,
    height_,
    radius_,
    gridDim,
    blockDim,
    sharedMemSize
    );

    err = cudaGetLastError();
    if (err != cudaSuccess) {
    throw std::runtime_error("Vertical kernel launch failed: " +
    std::string(cudaGetErrorString(err)));
    }

    // Synchronize
    cudaDeviceSynchronize();

    // Copy result back to host
    d_output.copyToHost(h_output);
    }

    // Getters
    int getWidth() const { return width_; }
    int getHeight() const { return height_; }
    int getRadius() const { return radius_; }
    float getSigma() const { return sigma_; }
    const std::vector<float>& getKernel() const { return kernel_; }

    // Print kernel coefficients
    void printKernel() const {
    std::cout << "Gaussian kernel (radius=" << radius_
    << ", sigma=" << sigma_ << "):\n";
    for (size_t i = 0; i < kernel_.size(); i++) {
    std::cout << kernel_[i] << " ";
    }
    std::cout << std::endl;
    }
    };

    } // namespace gpu

    #endif // GAUSSIAN_BLUR_HPP
    304 changes: 304 additions & 0 deletions gaussian_blur_cuda.cu
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,304 @@
    /*
    * Separable Gaussian Blur - Pure CUDA Implementation
    *
    * This implementation uses:
    * - Shared memory for efficient data reuse
    * - Separate kernels for horizontal and vertical passes
    * - Optimized memory access patterns
    */

    #include <cuda_runtime.h>
    #include <device_launch_parameters.h>

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>  /* malloc/free, rand, exit, EXIT_FAILURE */

    #define KERNEL_RADIUS 5
    #define KERNEL_SIZE (2 * KERNEL_RADIUS + 1)
    #define BLOCK_SIZE 16

    // Error checking macro
    #define CUDA_CHECK(call) \
    do { \
    cudaError_t error = call; \
    if (error != cudaSuccess) { \
    fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
    cudaGetErrorString(error)); \
    exit(EXIT_FAILURE); \
    } \
    } while(0)

    /*
     * Generate a normalized 1D Gaussian kernel.
     *
     * kernel  out: array of 2*radius + 1 coefficients
     * radius  kernel radius (>= 0)
     * sigma   Gaussian standard deviation (> 0)
     *
     * Samples exp(-x^2 / (2*sigma^2)) at integer offsets and scales the
     * result to sum to 1, so the 1/sqrt(2*pi*sigma^2) factor cancels and
     * is omitted. The redundant __host__ qualifier was dropped (host is
     * the default), letting non-CUDA translation units reuse the helper.
     */
    void generateGaussianKernel(float* kernel, int radius, float sigma) {
        float sum = 0.0f;

        // Unnormalized Gaussian samples at offsets [-radius, radius].
        for (int i = -radius; i <= radius; i++) {
            float value = expf(-(i * i) / (2.0f * sigma * sigma));
            kernel[i + radius] = value;
            sum += value;
        }

        // Normalize so the filter preserves total image brightness.
        for (int i = 0; i < 2 * radius + 1; i++) {
            kernel[i] /= sum;
        }
    }

    /*
     * Horizontal Gaussian blur pass.
     *
     * One thread per output pixel; samples are read directly from global
     * memory with clamp-to-edge addressing.
     *
     * The previous shared-memory version was removed because it was broken
     * for 2D thread blocks: all blockDim.y rows of a block staged their
     * (different) rows into the same one-row sharedRow[] array, racing with
     * each other, and out-of-bounds threads returned before __syncthreads(),
     * which is undefined behavior when the image height is not a multiple of
     * blockDim.y. A correct tile would need blockDim.y * (blockDim.x +
     * 2*radius) floats of shared memory — more than the host allocates.
     */
    __global__ void gaussianBlurHorizontal(
        const float* input,
        float* output,
        const float* kernel,
        int width,
        int height,
        int radius
    ) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Safe now that there is no barrier below.
        if (x >= width || y >= height) return;

        const float* row = input + y * width;

        float sum = 0.0f;
        for (int k = -radius; k <= radius; k++) {
            int sx = x + k;
            // Clamp to the row edges (replicate border pixels).
            if (sx < 0) sx = 0;
            if (sx > width - 1) sx = width - 1;
            sum += row[sx] * kernel[k + radius];
        }

        output[y * width + x] = sum;
    }

    /*
     * Vertical Gaussian blur pass.
     *
     * One thread per output pixel; samples are read directly from global
     * memory with clamp-to-edge addressing.
     *
     * The previous shared-memory version was removed because it was broken
     * for 2D thread blocks: all blockDim.x columns of a block staged their
     * (different) columns into the same one-column sharedCol[] array, racing
     * with each other, and out-of-bounds threads returned before
     * __syncthreads(), which is undefined behavior when the image width is
     * not a multiple of blockDim.x. A correct tile would need blockDim.x *
     * (blockDim.y + 2*radius) floats of shared memory — more than the host
     * allocates.
     */
    __global__ void gaussianBlurVertical(
        const float* input,
        float* output,
        const float* kernel,
        int width,
        int height,
        int radius
    ) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Safe now that there is no barrier below.
        if (x >= width || y >= height) return;

        float sum = 0.0f;
        for (int k = -radius; k <= radius; k++) {
            int sy = y + k;
            // Clamp to the column edges (replicate border pixels).
            if (sy < 0) sy = 0;
            if (sy > height - 1) sy = height - 1;
            sum += input[sy * width + x] * kernel[k + radius];
        }

        output[y * width + x] = sum;
    }

    /*
    * Runs the two-pass separable blur on device buffers the caller has
    * already allocated and populated.
    *
    * d_input/d_output/d_temp - device images, width*height floats each
    * d_kernel                - device array of 2*radius + 1 coefficients
    *
    * Blocks until both passes finish; exits the process via CUDA_CHECK on
    * any launch or execution error.
    */
    void applySeparableGaussianBlur(
    const float* d_input,
    float* d_output,
    float* d_temp,
    const float* d_kernel,
    int width,
    int height,
    int radius
    ) {
    // One thread per pixel; grid rounded up to cover the whole image.
    dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridSize(
    (width + blockSize.x - 1) / blockSize.x,
    (height + blockSize.y - 1) / blockSize.y
    );

    // Dynamic shared-memory request forwarded to the kernels.
    // NOTE(review): this is sized for a single row segment plus halo
    // ((BLOCK_SIZE + 2*radius) floats); confirm it matches what the
    // kernels actually stage per 2D block.
    int sharedMemSize = (BLOCK_SIZE + 2 * radius) * sizeof(float);

    // Horizontal pass: input -> temp
    gaussianBlurHorizontal<<<gridSize, blockSize, sharedMemSize>>>(
    d_input, d_temp, d_kernel, width, height, radius
    );
    CUDA_CHECK(cudaGetLastError());

    // Vertical pass: temp -> output
    gaussianBlurVertical<<<gridSize, blockSize, sharedMemSize>>>(
    d_temp, d_output, d_kernel, width, height, radius
    );
    CUDA_CHECK(cudaGetLastError());

    // Wait so callers can time the call and rely on d_output being ready.
    CUDA_CHECK(cudaDeviceSynchronize());
    }

    /*
     * Demo driver: fills a 1920x1080 float image with pseudo-random data,
     * runs the separable blur on the GPU, and reports timing/throughput.
     */
    int main() {
        // Image dimensions
        const int width = 1920;
        const int height = 1080;
        // size_t arithmetic avoids int overflow for large dimensions.
        const size_t imageSize = (size_t)width * height * sizeof(float);

        // Gaussian parameters
        const int radius = KERNEL_RADIUS;
        const float sigma = 2.0f;
        const int kernelSize = 2 * radius + 1;

        printf("Separable Gaussian Blur (CUDA)\n");
        printf("Image size: %d x %d\n", width, height);
        printf("Kernel radius: %d (size: %d)\n", radius, kernelSize);
        printf("Sigma: %.2f\n\n", sigma);

        // Allocate host memory; malloc can fail, so check before writing
        // (the original dereferenced these pointers unchecked).
        float* h_input = (float*)malloc(imageSize);
        float* h_output = (float*)malloc(imageSize);
        float* h_kernel = (float*)malloc(kernelSize * sizeof(float));
        if (h_input == NULL || h_output == NULL || h_kernel == NULL) {
            fprintf(stderr, "Host memory allocation failed\n");
            free(h_input);
            free(h_output);
            free(h_kernel);
            return EXIT_FAILURE;
        }

        // Fill the input with a reproducible pseudo-random pattern
        // (rand() is intentionally left unseeded, so runs are deterministic).
        for (int i = 0; i < width * height; i++) {
            h_input[i] = (float)(rand() % 256) / 255.0f;
        }

        // Generate Gaussian kernel
        generateGaussianKernel(h_kernel, radius, sigma);

        printf("Gaussian kernel coefficients:\n");
        for (int i = 0; i < kernelSize; i++) {
            printf("%.6f ", h_kernel[i]);
        }
        printf("\n\n");

        // Device buffers: input, output, intermediate, and coefficients.
        float *d_input, *d_output, *d_temp, *d_kernel;
        CUDA_CHECK(cudaMalloc(&d_input, imageSize));
        CUDA_CHECK(cudaMalloc(&d_output, imageSize));
        CUDA_CHECK(cudaMalloc(&d_temp, imageSize));
        CUDA_CHECK(cudaMalloc(&d_kernel, kernelSize * sizeof(float)));

        // Copy data to device
        CUDA_CHECK(cudaMemcpy(d_input, h_input, imageSize, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_kernel, h_kernel, kernelSize * sizeof(float), cudaMemcpyHostToDevice));

        // CUDA events give device-side timestamps for accurate GPU timing.
        cudaEvent_t start, stop;
        CUDA_CHECK(cudaEventCreate(&start));
        CUDA_CHECK(cudaEventCreate(&stop));

        // Warm-up run so one-time initialization is excluded from the timing.
        applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius);

        // Timed run
        CUDA_CHECK(cudaEventRecord(start));
        applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius);
        CUDA_CHECK(cudaEventRecord(stop));
        CUDA_CHECK(cudaEventSynchronize(stop));

        float milliseconds = 0;
        CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

        printf("Execution time: %.3f ms\n", milliseconds);
        printf("Throughput: %.2f Mpixels/sec\n", (width * height / 1e6) / (milliseconds / 1000.0f));

        // Copy result back to host
        CUDA_CHECK(cudaMemcpy(h_output, d_output, imageSize, cudaMemcpyDeviceToHost));

        // Spot-check a few pixels around the image center.
        printf("\nSample output values:\n");
        for (int i = 0; i < 5; i++) {
            int idx = (height / 2) * width + (width / 2) + i;
            printf("Pixel[%d][%d]: %.6f\n", height / 2, width / 2 + i, h_output[idx]);
        }

        // Cleanup
        CUDA_CHECK(cudaEventDestroy(start));
        CUDA_CHECK(cudaEventDestroy(stop));
        CUDA_CHECK(cudaFree(d_input));
        CUDA_CHECK(cudaFree(d_output));
        CUDA_CHECK(cudaFree(d_temp));
        CUDA_CHECK(cudaFree(d_kernel));
        free(h_input);
        free(h_output);
        free(h_kernel);

        printf("\nGaussian blur completed successfully!\n");

        return 0;
    }
    156 changes: 156 additions & 0 deletions gaussian_blur_kernels.cu
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,156 @@
    /*
    * CUDA Kernels Implementation
    * Separable Gaussian Blur
    */

    #include <cuda_runtime.h>
    #include <device_launch_parameters.h>

    /*
     * Horizontal Gaussian blur kernel.
     *
     * One thread per output pixel; samples read directly from global memory
     * with clamp-to-edge addressing.
     *
     * The previous shared-memory staging was removed: with a 2D block, all
     * blockDim.y rows wrote their (different) rows into the same one-row
     * sharedRow[] array (a data race), and out-of-range threads returned
     * before __syncthreads(), which is undefined behavior for edge blocks.
     */
    __global__ void gaussianBlurHorizontalKernel(
        const float* __restrict__ input,
        float* __restrict__ output,
        const float* __restrict__ kernel,
        int width,
        int height,
        int radius
    ) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Safe now that there is no barrier below.
        if (x >= width || y >= height) return;

        const float* __restrict__ row = input + y * width;

        float sum = 0.0f;
        for (int k = -radius; k <= radius; k++) {
            int sx = x + k;
            // Clamp to the row edges (replicate border pixels).
            if (sx < 0) sx = 0;
            if (sx > width - 1) sx = width - 1;
            sum += row[sx] * kernel[k + radius];
        }

        output[y * width + x] = sum;
    }

    /*
     * Vertical Gaussian blur kernel.
     *
     * One thread per output pixel; samples read directly from global memory
     * with clamp-to-edge addressing.
     *
     * The previous shared-memory staging was removed: with a 2D block, all
     * blockDim.x columns wrote their (different) columns into the same
     * one-column sharedCol[] array (a data race), and out-of-range threads
     * returned before __syncthreads(), which is undefined behavior for edge
     * blocks.
     */
    __global__ void gaussianBlurVerticalKernel(
        const float* __restrict__ input,
        float* __restrict__ output,
        const float* __restrict__ kernel,
        int width,
        int height,
        int radius
    ) {
        int x = blockIdx.x * blockDim.x + threadIdx.x;
        int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Safe now that there is no barrier below.
        if (x >= width || y >= height) return;

        float sum = 0.0f;
        for (int k = -radius; k <= radius; k++) {
            int sy = y + k;
            // Clamp to the column edges (replicate border pixels).
            if (sy < 0) sy = 0;
            if (sy > height - 1) sy = height - 1;
            sum += input[sy * width + x] * kernel[k + radius];
        }

        output[y * width + x] = sum;
    }

    /*
    * C ABI wrappers so the C++ side (gaussian_blur.hpp) can declare and
    * call the kernel launches without seeing CUDA <<<...>>> syntax.
    *
    * Both launches are asynchronous: neither wrapper checks
    * cudaGetLastError() nor synchronizes — the caller is responsible for
    * both. sharedMemSize is the dynamic shared-memory request in bytes,
    * forwarded verbatim to the launch configuration.
    */
    extern "C" {

    // Launch the horizontal (row-direction) blur pass.
    // input/output/kernel are device pointers; kernel holds 2*radius+1
    // coefficients.
    void launchGaussianBlurHorizontal(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
    ) {
    gaussianBlurHorizontalKernel<<<gridSize, blockSize, sharedMemSize>>>(
    input, output, kernel, width, height, radius
    );
    }

    // Launch the vertical (column-direction) blur pass; same contract.
    void launchGaussianBlurVertical(
    const float* input,
    float* output,
    const float* kernel,
    int width,
    int height,
    int radius,
    dim3 gridSize,
    dim3 blockSize,
    int sharedMemSize
    ) {
    gaussianBlurVerticalKernel<<<gridSize, blockSize, sharedMemSize>>>(
    input, output, kernel, width, height, radius
    );
    }

    } // extern "C"
  2. raymondtay created this gist Feb 22, 2026.
    205 changes: 205 additions & 0 deletions main.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,205 @@
    /*
    * Main Application - C++ with CUDA Gaussian Blur
    *
    * Demonstrates usage of the GaussianBlur class
    */

    #include "gaussian_blur.hpp"

    #include <algorithm>
    #include <chrono>
    #include <iomanip>
    #include <iostream>
    #include <random>
    #include <vector>

    // Minimal grayscale float image held in host memory.
    // Stands in for a real image loader/saver in this demo.
    class Image {
    private:
        int width_;
        int height_;
        std::vector<float> data_;  // row-major, width_*height_ pixels

    public:
        Image(int width, int height)
            : width_(width), height_(height), data_(width * height) {}

        // Fill every pixel with a uniform random value in [0, 1).
        void fillRandom() {
            std::random_device seed;
            std::mt19937 rng(seed());
            std::uniform_real_distribution<float> dist(0.0f, 1.0f);

            for (auto& pixel : data_) {
                pixel = dist(rng);
            }
        }

        // 32x32 checkerboard: 1.0 where exactly one of the tile coordinates
        // is odd, 0.0 elsewhere (XOR of the tile parities).
        void fillTestPattern() {
            for (int row = 0; row < height_; row++) {
                const bool rowOdd = (row / 32) % 2 != 0;
                for (int col = 0; col < width_; col++) {
                    const bool colOdd = (col / 32) % 2 != 0;
                    data_[row * width_ + col] = (colOdd != rowOdd) ? 1.0f : 0.0f;
                }
            }
        }

        float* data() { return data_.data(); }
        const float* data() const { return data_.data(); }
        int width() const { return width_; }
        int height() const { return height_; }
        size_t size() const { return data_.size(); }

        // Pixel accessor; out-of-bounds coordinates read as 0.
        float getPixel(int x, int y) const {
            const bool inside = x >= 0 && x < width_ && y >= 0 && y < height_;
            return inside ? data_[y * width_ + x] : 0.0f;
        }

        // Print a rectangular region (clipped to the image) to stdout.
        void printRegion(int startX, int startY, int sizeX, int sizeY) const {
            std::cout << std::fixed << std::setprecision(3);
            const int endX = startX + sizeX;
            const int endY = startY + sizeY;
            for (int y = startY; y < endY && y < height_; y++) {
                for (int x = startX; x < endX && x < width_; x++) {
                    std::cout << getPixel(x, y) << " ";
                }
                std::cout << std::endl;
            }
        }
    };

    // Wall-clock stopwatch used by the benchmark code.
    class Timer {
    private:
        std::chrono::high_resolution_clock::time_point start_;

    public:
        // Record the current time as the reference point.
        void start() {
            start_ = std::chrono::high_resolution_clock::now();
        }

        // Milliseconds elapsed since the last start() call.
        double elapsed() const {
            const auto now = std::chrono::high_resolution_clock::now();
            return std::chrono::duration<double, std::milli>(now - start_).count();
        }
    };

    // Times repeated blur.apply() calls and prints average/min/max latency
    // and throughput in megapixels per second. An initial untimed call warms
    // up the GPU so one-time initialization cost is excluded.
    void runBenchmark(gpu::GaussianBlur& blur, const Image& input,
                      Image& output, int iterations = 10) {
        // Guard against a non-positive iteration count: the statistics below
        // would otherwise index an empty vector and divide by zero.
        if (iterations <= 0) {
            return;
        }

        Timer timer;
        std::vector<double> times;
        times.reserve(iterations);

        // Warm-up run (not recorded).
        blur.apply(input.data(), output.data());

        // Benchmark runs
        for (int i = 0; i < iterations; i++) {
            timer.start();
            blur.apply(input.data(), output.data());
            times.push_back(timer.elapsed());
        }

        // Calculate statistics
        double sum = 0.0;
        double minTime = times[0];
        double maxTime = times[0];

        for (double t : times) {
            sum += t;
            minTime = std::min(minTime, t);
            maxTime = std::max(maxTime, t);
        }

        double avgTime = sum / iterations;
        int pixels = input.width() * input.height();
        double throughput = (pixels / 1e6) / (avgTime / 1000.0);

        std::cout << "\n=== Benchmark Results ===" << std::endl;
        std::cout << "Iterations: " << iterations << std::endl;
        std::cout << "Average time: " << avgTime << " ms" << std::endl;
        std::cout << "Min time: " << minTime << " ms" << std::endl;
        std::cout << "Max time: " << maxTime << " ms" << std::endl;
        std::cout << "Throughput: " << throughput << " Mpixels/sec" << std::endl;
    }

    // Entry point: demonstrates the GaussianBlur class end to end —
    // build a random image, blur it, print sample regions, and benchmark
    // several kernel radii. argc/argv are currently unused.
    // Returns 0 on success, 1 if any GPU operation throws.
    int main(int argc, char** argv) {
    std::cout << "=== Separable Gaussian Blur (C++ with CUDA) ===" << std::endl;
    std::cout << std::endl;

    // Configuration
    const int width = 1920;
    const int height = 1080;
    const int radius = 5;
    const float sigma = 2.0f;

    std::cout << "Image dimensions: " << width << " x " << height << std::endl;
    std::cout << "Kernel radius: " << radius << std::endl;
    std::cout << "Sigma: " << sigma << std::endl;
    std::cout << std::endl;

    // All GPU failures surface as exceptions from the gpu:: classes, so
    // the whole demo runs inside one try block.
    try {
    // Create input and output images (host-side buffers)
    Image input(width, height);
    Image output(width, height);

    // Fill with test data
    std::cout << "Generating test image..." << std::endl;
    input.fillRandom();

    // Create Gaussian blur processor (allocates device-resident kernel
    // coefficients and scratch space)
    std::cout << "Initializing GPU processor..." << std::endl;
    gpu::GaussianBlur blur(width, height, radius, sigma);

    // Print kernel coefficients for inspection
    blur.printKernel();
    std::cout << std::endl;

    // Apply blur once, timing the full host->device->host round trip
    std::cout << "Applying Gaussian blur..." << std::endl;
    Timer timer;
    timer.start();
    blur.apply(input.data(), output.data());
    double elapsed = timer.elapsed();

    std::cout << "First run completed in " << elapsed << " ms" << std::endl;

    // Show sample results around the image center
    std::cout << "\nSample input region (center 5x5):" << std::endl;
    input.printRegion(width/2 - 2, height/2 - 2, 5, 5);

    std::cout << "\nSample output region (center 5x5):" << std::endl;
    output.printRegion(width/2 - 2, height/2 - 2, 5, 5);

    // Run benchmark (100 timed iterations after a warm-up)
    std::cout << "\nRunning performance benchmark..." << std::endl;
    runBenchmark(blur, input, output, 100);

    // Test with different kernel sizes; each radius gets its own
    // processor instance since the device kernel is fixed per object
    std::cout << "\n=== Testing Different Kernel Sizes ===" << std::endl;
    std::vector<int> radii = {3, 5, 7, 10};

    for (int r : radii) {
    gpu::GaussianBlur blurTest(width, height, r, 2.0f);
    Timer t;
    t.start();
    blurTest.apply(input.data(), output.data());
    double time = t.elapsed();

    int kernelSize = 2 * r + 1;
    double throughput = (width * height / 1e6) / (time / 1000.0);

    std::cout << "Radius " << r << " (kernel " << kernelSize
    << "x" << kernelSize << "): "
    << time << " ms, "
    << throughput << " Mpixels/sec" << std::endl;
    }

    std::cout << "\n=== Success! ===" << std::endl;

    } catch (const std::exception& e) {
    std::cerr << "Error: " << e.what() << std::endl;
    return 1;
    }

    return 0;
    }