Last active
February 22, 2026 03:25
-
-
Save raymondtay/9915510013d22a5ac890652eedf9af03 to your computer and use it in GitHub Desktop.
Revisions
-
raymondtay revised this gist
Feb 22, 2026 . 4 changed files with 797 additions and 0 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,87 @@ # Makefile for Gaussian Blur CUDA implementations # Supports both standalone CUDA and C++ with CUDA versions # Compiler settings NVCC = nvcc CXX = g++ # CUDA architecture (adjust based on your GPU) # Common options: # - sm_50: Maxwell (GTX 900 series) # - sm_60: Pascal (GTX 10 series) # - sm_70: Volta (Tesla V100) # - sm_75: Turing (RTX 20 series) # - sm_80: Ampere (RTX 30 series, A100) # - sm_86: Ampere (RTX 30 series mobile) # - sm_89: Ada Lovelace (RTX 40 series) CUDA_ARCH = sm_75 # Compiler flags NVCC_FLAGS = -arch=$(CUDA_ARCH) -O3 --use_fast_math -Xcompiler -Wall CXX_FLAGS = -std=c++11 -Wall -O3 # CUDA include and library paths (adjust if needed) CUDA_INC = /usr/local/cuda/include CUDA_LIB = /usr/local/cuda/lib64 # Targets TARGETS = gaussian_blur_cuda gaussian_blur_cpp .PHONY: all clean all: $(TARGETS) # Standalone CUDA implementation gaussian_blur_cuda: gaussian_blur_cuda.cu $(NVCC) $(NVCC_FLAGS) $< -o $@ @echo "Built standalone CUDA implementation: $@" # C++ with CUDA implementation (requires separate compilation) gaussian_blur_kernels.o: gaussian_blur_kernels.cu $(NVCC) $(NVCC_FLAGS) -c $< -o $@ main.o: main.cpp gaussian_blur.hpp $(NVCC) $(NVCC_FLAGS) -x cu -c $< -o $@ gaussian_blur_cpp: main.o gaussian_blur_kernels.o $(NVCC) $(NVCC_FLAGS) $^ -o $@ @echo "Built C++ with CUDA implementation: $@" # Run targets run_cuda: gaussian_blur_cuda @echo "\n=== Running Standalone CUDA Implementation ===" ./gaussian_blur_cuda run_cpp: gaussian_blur_cpp @echo "\n=== Running C++ with CUDA Implementation ===" ./gaussian_blur_cpp run: run_cuda run_cpp # Clean build artifacts clean: rm -f $(TARGETS) *.o @echo "Cleaned build artifacts" # Help target help: @echo "Gaussian Blur CUDA Makefile" @echo "" @echo "Targets:" @echo " all - Build all implementations (default)" @echo " gaussian_blur_cuda - Build standalone CUDA version" @echo " gaussian_blur_cpp - Build C++ with CUDA version" @echo " run_cuda - Build and run standalone CUDA" @echo " run_cpp - Build and run C++ with CUDA" @echo " run - Run both implementations" @echo " clean - Remove build artifacts" @echo " help - Show this help message" @echo "" @echo "Configuration:" @echo " CUDA_ARCH=$(CUDA_ARCH) - Target GPU architecture" @echo "" @echo "Usage examples:" @echo " make # Build all" @echo " make run # Build and run both" @echo " make CUDA_ARCH=sm_80 # Build for Ampere (RTX 30 series)" @echo " make clean # Clean build files" This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,250 @@ /* * Separable Gaussian Blur - C++ with CUDA Implementation * * This is a C++ wrapper around CUDA kernels providing: * - Object-oriented interface * - RAII memory management * - Support for multiple data types (float, uchar) * - Easy-to-use API */ #ifndef GAUSSIAN_BLUR_HPP #define GAUSSIAN_BLUR_HPP #include <cuda_runtime.h> #include <vector> #include <memory> #include <stdexcept> #include <cmath> #include <iostream> // CUDA kernel declarations extern "C" { void launchGaussianBlurHorizontal( const float* input, float* output, const float* kernel, int width, int height, int radius, dim3 gridSize, dim3 blockSize, int sharedMemSize ); void launchGaussianBlurVertical( const float* input, float* output, const float* kernel, int width, int height, int radius, dim3 gridSize, dim3 blockSize, int sharedMemSize ); } namespace gpu { // RAII wrapper for CUDA device memory template<typename T> class DeviceBuffer { private: T* d_ptr_; size_t size_; public: DeviceBuffer(size_t size) : size_(size) { cudaError_t err = cudaMalloc(&d_ptr_, size * sizeof(T)); if (err != cudaSuccess) { throw std::runtime_error("CUDA malloc failed: " + std::string(cudaGetErrorString(err))); } } ~DeviceBuffer() { if (d_ptr_) { cudaFree(d_ptr_); } } // Delete copy constructor and assignment DeviceBuffer(const DeviceBuffer&) = delete; DeviceBuffer& operator=(const DeviceBuffer&) = delete; // Move constructor and assignment DeviceBuffer(DeviceBuffer&& other) noexcept : d_ptr_(other.d_ptr_), size_(other.size_) { other.d_ptr_ = nullptr; } DeviceBuffer& operator=(DeviceBuffer&& other) noexcept { if (this != &other) { if (d_ptr_) cudaFree(d_ptr_); d_ptr_ = other.d_ptr_; size_ = other.size_; other.d_ptr_ = nullptr; } return *this; } T* get() { return d_ptr_; } const T* get() const { return d_ptr_; } size_t size() const { return size_; } void copyToDevice(const T* host_data) { cudaError_t err = cudaMemcpy(d_ptr_, host_data, size_ * sizeof(T), cudaMemcpyHostToDevice); if (err != cudaSuccess) { throw std::runtime_error("Copy to device failed"); } } void copyToHost(T* host_data) const { cudaError_t err = cudaMemcpy(host_data, d_ptr_, size_ * sizeof(T), cudaMemcpyDeviceToHost); if (err != cudaSuccess) { throw std::runtime_error("Copy to host failed"); } } }; // Gaussian blur class class GaussianBlur { private: int width_; int height_; int radius_; float sigma_; std::vector<float> kernel_; std::unique_ptr<DeviceBuffer<float>> d_kernel_; std::unique_ptr<DeviceBuffer<float>> d_temp_; void generateKernel() { int kernelSize = 2 * radius_ + 1; kernel_.resize(kernelSize); float sum = 0.0f; for (int i = -radius_; i <= radius_; i++) { float value = std::exp(-(i * i) / (2.0f * sigma_ * sigma_)); kernel_[i + radius_] = value; sum += value; } // Normalize for (auto& val : kernel_) { val /= sum; } } public: GaussianBlur(int width, int height, int radius, float sigma) : width_(width), height_(height), radius_(radius), sigma_(sigma) { if (radius <= 0 || radius > 50) { throw std::invalid_argument("Radius must be between 1 and 50"); } if (sigma <= 0.0f) { throw std::invalid_argument("Sigma must be positive"); } // Generate Gaussian kernel generateKernel(); // Allocate device memory d_kernel_ = std::make_unique<DeviceBuffer<float>>(kernel_.size()); d_kernel_->copyToDevice(kernel_.data()); // Allocate temporary buffer for intermediate results d_temp_ = std::make_unique<DeviceBuffer<float>>(width * height); } // Apply Gaussian blur void apply(const float* h_input, float* h_output) { // Allocate device buffers DeviceBuffer<float> d_input(width_ * height_); DeviceBuffer<float> d_output(width_ * height_); // Copy input to device d_input.copyToDevice(h_input); // Configure kernel launch parameters const int blockSize = 16; dim3 blockDim(blockSize, blockSize); dim3 gridDim( (width_ + blockSize - 1) / blockSize, (height_ + blockSize - 1) / blockSize ); int sharedMemSize = (blockSize + 2 * radius_) * sizeof(float); // Launch horizontal blur launchGaussianBlurHorizontal( d_input.get(), d_temp_->get(), d_kernel_->get(), width_, height_, radius_, gridDim, blockDim, sharedMemSize ); // Check for kernel launch errors cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { throw std::runtime_error("Horizontal kernel launch failed: " + std::string(cudaGetErrorString(err))); } // Launch vertical blur launchGaussianBlurVertical( d_temp_->get(), d_output.get(), d_kernel_->get(), width_, height_, radius_, gridDim, blockDim, sharedMemSize ); err = cudaGetLastError(); if (err != cudaSuccess) { throw std::runtime_error("Vertical kernel launch failed: " + std::string(cudaGetErrorString(err))); } // Synchronize cudaDeviceSynchronize(); // Copy result back to host d_output.copyToHost(h_output); } // Getters int getWidth() const { return width_; } int getHeight() const { return height_; } int getRadius() const { return radius_; } float getSigma() const { return sigma_; } const std::vector<float>& getKernel() const { return kernel_; } // Print kernel coefficients void printKernel() const { std::cout << "Gaussian kernel (radius=" << radius_ << ", sigma=" << sigma_ << "):\n"; for (size_t i = 0; i < kernel_.size(); i++) { std::cout << kernel_[i] << " "; } std::cout << std::endl; } }; } // namespace gpu #endif // GAUSSIAN_BLUR_HPP This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,304 @@ /* * Separable Gaussian Blur - Pure CUDA Implementation * * This implementation uses: * - Shared memory for efficient data reuse * - Separate kernels for horizontal and vertical passes * - Optimized memory access patterns */ #include <cuda_runtime.h> #include <device_launch_parameters.h> #include <stdio.h> #include <math.h> #define KERNEL_RADIUS 5 #define KERNEL_SIZE (2 * KERNEL_RADIUS + 1) #define BLOCK_SIZE 16 // Error checking macro #define CUDA_CHECK(call) \ do { \ cudaError_t error = call; \ if (error != cudaSuccess) { \ fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \ cudaGetErrorString(error)); \ exit(EXIT_FAILURE); \ } \ } while(0) /* * Generate 1D Gaussian kernel * Formula: G(x) = (1/sqrt(2*pi*sigma^2)) * exp(-x^2 / (2*sigma^2)) */ __host__ void generateGaussianKernel(float* kernel, int radius, float sigma) { float sum = 0.0f; for (int i = -radius; i <= radius; i++) { float value = expf(-(i * i) / (2.0f * sigma * sigma)); kernel[i + radius] = value; sum += value; } // Normalize for (int i = 0; i < 2 * radius + 1; i++) { kernel[i] /= sum; } } /* * Horizontal Gaussian blur kernel * Each thread processes one pixel, reading from shared memory */ __global__ void gaussianBlurHorizontal( const float* input, float* output, const float* kernel, int width, int height, int radius ) { // Shared memory for the row (includes halo region) extern __shared__ float sharedRow[]; int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (y >= height) return; // Load data into shared memory with halo int sharedIdx = threadIdx.x + radius; // Load main data if (x < width) { sharedRow[sharedIdx] = input[y * width + x]; } // Load left halo if (threadIdx.x < radius) { int leftX = blockIdx.x * blockDim.x - radius + threadIdx.x; if (leftX < 0) { sharedRow[threadIdx.x] = input[y * width + 0]; // Clamp } else { sharedRow[threadIdx.x] = input[y * width + leftX]; } } // Load right halo if (threadIdx.x >= blockDim.x - radius) { int rightX = blockIdx.x * blockDim.x + threadIdx.x + radius; int sharedRightIdx = threadIdx.x + 2 * radius; if (rightX >= width) { sharedRow[sharedRightIdx] = input[y * width + width - 1]; // Clamp } else { sharedRow[sharedRightIdx] = input[y * width + rightX]; } } __syncthreads(); // Perform convolution if (x < width) { float sum = 0.0f; for (int k = -radius; k <= radius; k++) { sum += sharedRow[sharedIdx + k] * kernel[k + radius]; } output[y * width + x] = sum; } } /* * Vertical Gaussian blur kernel * Each thread processes one pixel, reading columns */ __global__ void gaussianBlurVertical( const float* input, float* output, const float* kernel, int width, int height, int radius ) { // Shared memory for the column (includes halo region) extern __shared__ float sharedCol[]; int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= width) return; // Load data into shared memory with halo int sharedIdx = threadIdx.y + radius; // Load main data if (y < height) { sharedCol[sharedIdx] = input[y * width + x]; } // Load top halo if (threadIdx.y < radius) { int topY = blockIdx.y * blockDim.y - radius + threadIdx.y; if (topY < 0) { sharedCol[threadIdx.y] = input[0 * width + x]; // Clamp } else { sharedCol[threadIdx.y] = input[topY * width + x]; } } // Load bottom halo if (threadIdx.y >= blockDim.y - radius) { int bottomY = blockIdx.y * blockDim.y + threadIdx.y + radius; int sharedBottomIdx = threadIdx.y + 2 * radius; if (bottomY >= height) { sharedCol[sharedBottomIdx] = input[(height - 1) * width + x]; // Clamp } else { sharedCol[sharedBottomIdx] = input[bottomY * width + x]; } } __syncthreads(); // Perform convolution if (y < height) { float sum = 0.0f; for (int k = -radius; k <= radius; k++) { sum += sharedCol[sharedIdx + k] * kernel[k + radius]; } output[y * width + x] = sum; } } /* * Apply separable Gaussian blur */ void applySeparableGaussianBlur( const float* d_input, float* d_output, float* d_temp, const float* d_kernel, int width, int height, int radius ) { dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE); dim3 gridSize( (width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y ); // Shared memory size (block size + 2 * radius for halo) int sharedMemSize = (BLOCK_SIZE + 2 * radius) * sizeof(float); // Horizontal pass gaussianBlurHorizontal<<<gridSize, blockSize, sharedMemSize>>>( d_input, d_temp, d_kernel, width, height, radius ); CUDA_CHECK(cudaGetLastError()); // Vertical pass gaussianBlurVertical<<<gridSize, blockSize, sharedMemSize>>>( d_temp, d_output, d_kernel, width, height, radius ); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaDeviceSynchronize()); } /* * Main function demonstrating usage */ int main() { // Image dimensions const int width = 1920; const int height = 1080; const int imageSize = width * height * sizeof(float); // Gaussian parameters const int radius = KERNEL_RADIUS; const float sigma = 2.0f; const int kernelSize = 2 * radius + 1; printf("Separable Gaussian Blur (CUDA)\n"); printf("Image size: %d x %d\n", width, height); printf("Kernel radius: %d (size: %d)\n", radius, kernelSize); printf("Sigma: %.2f\n\n", sigma); // Allocate host memory float* h_input = (float*)malloc(imageSize); float* h_output = (float*)malloc(imageSize); float* h_kernel = (float*)malloc(kernelSize * sizeof(float)); // Initialize input with test pattern for (int i = 0; i < width * height; i++) { h_input[i] = (float)(rand() % 256) / 255.0f; } // Generate Gaussian kernel generateGaussianKernel(h_kernel, radius, sigma); printf("Gaussian kernel coefficients:\n"); for (int i = 0; i < kernelSize; i++) { printf("%.6f ", h_kernel[i]); } printf("\n\n"); // Allocate device memory float *d_input, *d_output, *d_temp, *d_kernel; CUDA_CHECK(cudaMalloc(&d_input, imageSize)); CUDA_CHECK(cudaMalloc(&d_output, imageSize)); CUDA_CHECK(cudaMalloc(&d_temp, imageSize)); CUDA_CHECK(cudaMalloc(&d_kernel, kernelSize * sizeof(float))); // Copy data to device CUDA_CHECK(cudaMemcpy(d_input, h_input, imageSize, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_kernel, h_kernel, kernelSize * sizeof(float), cudaMemcpyHostToDevice)); // Create CUDA events for timing cudaEvent_t start, stop; CUDA_CHECK(cudaEventCreate(&start)); CUDA_CHECK(cudaEventCreate(&stop)); // Warm-up run applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius); // Timed run CUDA_CHECK(cudaEventRecord(start)); applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius); CUDA_CHECK(cudaEventRecord(stop)); CUDA_CHECK(cudaEventSynchronize(stop)); float milliseconds = 0; CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop)); printf("Execution time: %.3f ms\n", milliseconds); printf("Throughput: %.2f Mpixels/sec\n", (width * height / 1e6) / (milliseconds / 1000.0f)); // Copy result back to host CUDA_CHECK(cudaMemcpy(h_output, d_output, imageSize, cudaMemcpyDeviceToHost)); // Verify result (check a few pixels) printf("\nSample output values:\n"); for (int i = 0; i < 5; i++) { int idx = (height / 2) * width + (width / 2) + i; printf("Pixel[%d][%d]: %.6f\n", height / 2, width / 2 + i, h_output[idx]); } // Cleanup CUDA_CHECK(cudaEventDestroy(start)); CUDA_CHECK(cudaEventDestroy(stop)); CUDA_CHECK(cudaFree(d_input)); CUDA_CHECK(cudaFree(d_output)); CUDA_CHECK(cudaFree(d_temp)); CUDA_CHECK(cudaFree(d_kernel)); free(h_input); free(h_output); free(h_kernel); printf("\nGaussian blur completed successfully!\n"); return 0; } This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,156 @@ /* * CUDA Kernels Implementation * Separable Gaussian Blur */ #include <cuda_runtime.h> #include <device_launch_parameters.h> /* * Horizontal Gaussian blur kernel with shared memory optimization */ __global__ void gaussianBlurHorizontalKernel( const float* __restrict__ input, float* __restrict__ output, const float* __restrict__ kernel, int width, int height, int radius ) { extern __shared__ float sharedRow[]; int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (y >= height) return; int sharedIdx = threadIdx.x + radius; // Load center data if (x < width) { sharedRow[sharedIdx] = input[y * width + x]; } // Load left halo if (threadIdx.x < radius) { int leftX = blockIdx.x * blockDim.x - radius + threadIdx.x; sharedRow[threadIdx.x] = (leftX < 0) ? input[y * width] : input[y * width + leftX]; } // Load right halo if (threadIdx.x >= blockDim.x - radius) { int rightX = blockIdx.x * blockDim.x + threadIdx.x + radius; int sharedRightIdx = threadIdx.x + 2 * radius; sharedRow[sharedRightIdx] = (rightX >= width) ? input[y * width + width - 1] : input[y * width + rightX]; } __syncthreads(); // Perform convolution if (x < width) { float sum = 0.0f; #pragma unroll for (int k = -radius; k <= radius; k++) { sum += sharedRow[sharedIdx + k] * kernel[k + radius]; } output[y * width + x] = sum; } } /* * Vertical Gaussian blur kernel with shared memory optimization */ __global__ void gaussianBlurVerticalKernel( const float* __restrict__ input, float* __restrict__ output, const float* __restrict__ kernel, int width, int height, int radius ) { extern __shared__ float sharedCol[]; int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= width) return; int sharedIdx = threadIdx.y + radius; // Load center data if (y < height) { sharedCol[sharedIdx] = input[y * width + x]; } // Load top halo if (threadIdx.y < radius) { int topY = blockIdx.y * blockDim.y - radius + threadIdx.y; sharedCol[threadIdx.y] = (topY < 0) ? input[x] : input[topY * width + x]; } // Load bottom halo if (threadIdx.y >= blockDim.y - radius) { int bottomY = blockIdx.y * blockDim.y + threadIdx.y + radius; int sharedBottomIdx = threadIdx.y + 2 * radius; sharedCol[sharedBottomIdx] = (bottomY >= height) ? input[(height - 1) * width + x] : input[bottomY * width + x]; } __syncthreads(); // Perform convolution if (y < height) { float sum = 0.0f; #pragma unroll for (int k = -radius; k <= radius; k++) { sum += sharedCol[sharedIdx + k] * kernel[k + radius]; } output[y * width + x] = sum; } } /* * C-style wrapper functions for C++ code */ extern "C" { void launchGaussianBlurHorizontal( const float* input, float* output, const float* kernel, int width, int height, int radius, dim3 gridSize, dim3 blockSize, int sharedMemSize ) { gaussianBlurHorizontalKernel<<<gridSize, blockSize, sharedMemSize>>>( input, output, kernel, width, height, radius ); } void launchGaussianBlurVertical( const float* input, float* output, const float* kernel, int width, int height, int radius, dim3 gridSize, dim3 blockSize, int sharedMemSize ) { gaussianBlurVerticalKernel<<<gridSize, blockSize, sharedMemSize>>>( input, output, kernel, width, height, radius ); } } // extern "C" -
raymondtay created this gist
Feb 22, 2026 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,205 @@ /* * Main Application - C++ with CUDA Gaussian Blur * * Demonstrates usage of the GaussianBlur class */ #include "gaussian_blur.hpp" #include <iostream> #include <vector> #include <chrono> #include <random> #include <iomanip> // Simple image loader/saver simulation class Image { private: int width_; int height_; std::vector<float> data_; public: Image(int width, int height) : width_(width), height_(height), data_(width * height) {} void fillRandom() { std::random_device rd; std::mt19937 gen(rd()); std::uniform_real_distribution<float> dis(0.0f, 1.0f); for (auto& pixel : data_) { pixel = dis(gen); } } void fillTestPattern() { for (int y = 0; y < height_; y++) { for (int x = 0; x < width_; x++) { // Create a checkerboard pattern float value = ((x / 32) % 2) ^ ((y / 32) % 2) ? 1.0f : 0.0f; data_[y * width_ + x] = value; } } } float* data() { return data_.data(); } const float* data() const { return data_.data(); } int width() const { return width_; } int height() const { return height_; } size_t size() const { return data_.size(); } float getPixel(int x, int y) const { if (x < 0 || x >= width_ || y < 0 || y >= height_) { return 0.0f; } return data_[y * width_ + x]; } void printRegion(int startX, int startY, int sizeX, int sizeY) const { std::cout << std::fixed << std::setprecision(3); for (int y = startY; y < startY + sizeY && y < height_; y++) { for (int x = startX; x < startX + sizeX && x < width_; x++) { std::cout << getPixel(x, y) << " "; } std::cout << std::endl; } } }; // Benchmark helper class Timer { private: std::chrono::high_resolution_clock::time_point start_; public: void start() { start_ = std::chrono::high_resolution_clock::now(); } double elapsed() const { auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration<double, std::milli> duration = end - start_; return duration.count(); } }; // Run benchmark void runBenchmark(gpu::GaussianBlur& blur, const Image& input, Image& output, int iterations = 10) { Timer timer; std::vector<double> times; // Warm-up run blur.apply(input.data(), output.data()); // Benchmark runs for (int i = 0; i < iterations; i++) { timer.start(); blur.apply(input.data(), output.data()); times.push_back(timer.elapsed()); } // Calculate statistics double sum = 0.0; double minTime = times[0]; double maxTime = times[0]; for (double t : times) { sum += t; minTime = std::min(minTime, t); maxTime = std::max(maxTime, t); } double avgTime = sum / iterations; int pixels = input.width() * input.height(); double throughput = (pixels / 1e6) / (avgTime / 1000.0); std::cout << "\n=== Benchmark Results ===" << std::endl; std::cout << "Iterations: " << iterations << std::endl; std::cout << "Average time: " << avgTime << " ms" << std::endl; std::cout << "Min time: " << minTime << " ms" << std::endl; std::cout << "Max time: " << maxTime << " ms" << std::endl; std::cout << "Throughput: " << throughput << " Mpixels/sec" << std::endl; } int main(int argc, char** argv) { std::cout << "=== Separable Gaussian Blur (C++ with CUDA) ===" << std::endl; std::cout << std::endl; // Configuration const int width = 1920; const int height = 1080; const int radius = 5; const float sigma = 2.0f; std::cout << "Image dimensions: " << width << " x " << height << std::endl; std::cout << "Kernel radius: " << radius << std::endl; std::cout << "Sigma: " << sigma << std::endl; std::cout << std::endl; try { // Create input and output images Image input(width, height); Image output(width, height); // Fill with test data std::cout << "Generating test image..." << std::endl; input.fillRandom(); // Create Gaussian blur processor std::cout << "Initializing GPU processor..." << std::endl; gpu::GaussianBlur blur(width, height, radius, sigma); // Print kernel blur.printKernel(); std::cout << std::endl; // Apply blur std::cout << "Applying Gaussian blur..." << std::endl; Timer timer; timer.start(); blur.apply(input.data(), output.data()); double elapsed = timer.elapsed(); std::cout << "First run completed in " << elapsed << " ms" << std::endl; // Show sample results std::cout << "\nSample input region (center 5x5):" << std::endl; input.printRegion(width/2 - 2, height/2 - 2, 5, 5); std::cout << "\nSample output region (center 5x5):" << std::endl; output.printRegion(width/2 - 2, height/2 - 2, 5, 5); // Run benchmark std::cout << "\nRunning performance benchmark..." << std::endl; runBenchmark(blur, input, output, 100); // Test with different kernel sizes std::cout << "\n=== Testing Different Kernel Sizes ===" << std::endl; std::vector<int> radii = {3, 5, 7, 10}; for (int r : radii) { gpu::GaussianBlur blurTest(width, height, r, 2.0f); Timer t; t.start(); blurTest.apply(input.data(), output.data()); double time = t.elapsed(); int kernelSize = 2 * r + 1; double throughput = (width * height / 1e6) / (time / 1000.0); std::cout << "Radius " << r << " (kernel " << kernelSize << "x" << kernelSize << "): " << time << " ms, " << throughput << " Mpixels/sec" << std::endl; } std::cout << "\n=== Success! ===" << std::endl; } catch (const std::exception& e) { std::cerr << "Error: " << e.what() << std::endl; return 1; } return 0; }