/* * Separable Gaussian Blur - Pure CUDA Implementation * * This implementation uses: * - Shared memory for efficient data reuse * - Separate kernels for horizontal and vertical passes * - Optimized memory access patterns */ #include #include #include #include #define KERNEL_RADIUS 5 #define KERNEL_SIZE (2 * KERNEL_RADIUS + 1) #define BLOCK_SIZE 16 // Error checking macro #define CUDA_CHECK(call) \ do { \ cudaError_t error = call; \ if (error != cudaSuccess) { \ fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \ cudaGetErrorString(error)); \ exit(EXIT_FAILURE); \ } \ } while(0) /* * Generate 1D Gaussian kernel * Formula: G(x) = (1/sqrt(2*pi*sigma^2)) * exp(-x^2 / (2*sigma^2)) */ __host__ void generateGaussianKernel(float* kernel, int radius, float sigma) { float sum = 0.0f; for (int i = -radius; i <= radius; i++) { float value = expf(-(i * i) / (2.0f * sigma * sigma)); kernel[i + radius] = value; sum += value; } // Normalize for (int i = 0; i < 2 * radius + 1; i++) { kernel[i] /= sum; } } /* * Horizontal Gaussian blur kernel * Each thread processes one pixel, reading from shared memory */ __global__ void gaussianBlurHorizontal( const float* input, float* output, const float* kernel, int width, int height, int radius ) { // Shared memory for the row (includes halo region) extern __shared__ float sharedRow[]; int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (y >= height) return; // Load data into shared memory with halo int sharedIdx = threadIdx.x + radius; // Load main data if (x < width) { sharedRow[sharedIdx] = input[y * width + x]; } // Load left halo if (threadIdx.x < radius) { int leftX = blockIdx.x * blockDim.x - radius + threadIdx.x; if (leftX < 0) { sharedRow[threadIdx.x] = input[y * width + 0]; // Clamp } else { sharedRow[threadIdx.x] = input[y * width + leftX]; } } // Load right halo if (threadIdx.x >= blockDim.x - radius) { int rightX = blockIdx.x * blockDim.x + threadIdx.x + radius; int sharedRightIdx = threadIdx.x + 2 * radius; if (rightX >= width) { sharedRow[sharedRightIdx] = input[y * width + width - 1]; // Clamp } else { sharedRow[sharedRightIdx] = input[y * width + rightX]; } } __syncthreads(); // Perform convolution if (x < width) { float sum = 0.0f; for (int k = -radius; k <= radius; k++) { sum += sharedRow[sharedIdx + k] * kernel[k + radius]; } output[y * width + x] = sum; } } /* * Vertical Gaussian blur kernel * Each thread processes one pixel, reading columns */ __global__ void gaussianBlurVertical( const float* input, float* output, const float* kernel, int width, int height, int radius ) { // Shared memory for the column (includes halo region) extern __shared__ float sharedCol[]; int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x >= width) return; // Load data into shared memory with halo int sharedIdx = threadIdx.y + radius; // Load main data if (y < height) { sharedCol[sharedIdx] = input[y * width + x]; } // Load top halo if (threadIdx.y < radius) { int topY = blockIdx.y * blockDim.y - radius + threadIdx.y; if (topY < 0) { sharedCol[threadIdx.y] = input[0 * width + x]; // Clamp } else { sharedCol[threadIdx.y] = input[topY * width + x]; } } // Load bottom halo if (threadIdx.y >= blockDim.y - radius) { int bottomY = blockIdx.y * blockDim.y + threadIdx.y + radius; int sharedBottomIdx = threadIdx.y + 2 * radius; if (bottomY >= height) { sharedCol[sharedBottomIdx] = input[(height - 1) * width + x]; // Clamp } else { sharedCol[sharedBottomIdx] = input[bottomY * width + x]; } } __syncthreads(); // Perform convolution if (y < height) { float sum = 0.0f; for (int k = -radius; k <= radius; k++) { sum += sharedCol[sharedIdx + k] * kernel[k + radius]; } output[y * width + x] = sum; } } /* * Apply separable Gaussian blur */ void applySeparableGaussianBlur( const float* d_input, float* d_output, float* d_temp, const float* d_kernel, int width, int height, int radius ) { dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE); dim3 gridSize( (width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y ); // Shared memory size (block size + 2 * radius for halo) int sharedMemSize = (BLOCK_SIZE + 2 * radius) * sizeof(float); // Horizontal pass gaussianBlurHorizontal<<>>( d_input, d_temp, d_kernel, width, height, radius ); CUDA_CHECK(cudaGetLastError()); // Vertical pass gaussianBlurVertical<<>>( d_temp, d_output, d_kernel, width, height, radius ); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaDeviceSynchronize()); } /* * Main function demonstrating usage */ int main() { // Image dimensions const int width = 1920; const int height = 1080; const int imageSize = width * height * sizeof(float); // Gaussian parameters const int radius = KERNEL_RADIUS; const float sigma = 2.0f; const int kernelSize = 2 * radius + 1; printf("Separable Gaussian Blur (CUDA)\n"); printf("Image size: %d x %d\n", width, height); printf("Kernel radius: %d (size: %d)\n", radius, kernelSize); printf("Sigma: %.2f\n\n", sigma); // Allocate host memory float* h_input = (float*)malloc(imageSize); float* h_output = (float*)malloc(imageSize); float* h_kernel = (float*)malloc(kernelSize * sizeof(float)); // Initialize input with test pattern for (int i = 0; i < width * height; i++) { h_input[i] = (float)(rand() % 256) / 255.0f; } // Generate Gaussian kernel generateGaussianKernel(h_kernel, radius, sigma); printf("Gaussian kernel coefficients:\n"); for (int i = 0; i < kernelSize; i++) { printf("%.6f ", h_kernel[i]); } printf("\n\n"); // Allocate device memory float *d_input, *d_output, *d_temp, *d_kernel; CUDA_CHECK(cudaMalloc(&d_input, imageSize)); CUDA_CHECK(cudaMalloc(&d_output, imageSize)); CUDA_CHECK(cudaMalloc(&d_temp, imageSize)); CUDA_CHECK(cudaMalloc(&d_kernel, kernelSize * sizeof(float))); // Copy data to device CUDA_CHECK(cudaMemcpy(d_input, h_input, imageSize, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_kernel, h_kernel, kernelSize * sizeof(float), cudaMemcpyHostToDevice)); // Create CUDA events for timing cudaEvent_t start, stop; CUDA_CHECK(cudaEventCreate(&start)); CUDA_CHECK(cudaEventCreate(&stop)); // Warm-up run applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius); // Timed run CUDA_CHECK(cudaEventRecord(start)); applySeparableGaussianBlur(d_input, d_output, d_temp, d_kernel, width, height, radius); CUDA_CHECK(cudaEventRecord(stop)); CUDA_CHECK(cudaEventSynchronize(stop)); float milliseconds = 0; CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop)); printf("Execution time: %.3f ms\n", milliseconds); printf("Throughput: %.2f Mpixels/sec\n", (width * height / 1e6) / (milliseconds / 1000.0f)); // Copy result back to host CUDA_CHECK(cudaMemcpy(h_output, d_output, imageSize, cudaMemcpyDeviceToHost)); // Verify result (check a few pixels) printf("\nSample output values:\n"); for (int i = 0; i < 5; i++) { int idx = (height / 2) * width + (width / 2) + i; printf("Pixel[%d][%d]: %.6f\n", height / 2, width / 2 + i, h_output[idx]); } // Cleanup CUDA_CHECK(cudaEventDestroy(start)); CUDA_CHECK(cudaEventDestroy(stop)); CUDA_CHECK(cudaFree(d_input)); CUDA_CHECK(cudaFree(d_output)); CUDA_CHECK(cudaFree(d_temp)); CUDA_CHECK(cudaFree(d_kernel)); free(h_input); free(h_output); free(h_kernel); printf("\nGaussian blur completed successfully!\n"); return 0; }