qfgaohao · October 17, 2017 04:13 · Oct 17, 2017 · Oct 11, 2017
diff --git a/bank_conflicts_test.cu b/bank_conflicts_test.cu
@@ -49,6 +49,9 @@ int main() {
         printf("%d ", h_time[i]);
     }
     printf("%s", "\n");
+
+    cudaFree(d_time);
+    free(h_time);
 
     return 0;
 }
diff --git a/bank_conflicts_test.cu b/bank_conflicts_test.cu
@@ -0,0 +1,54 @@
+#include <stdio.h>
+
+#define N (32)
+// Timing kernel: for each stride 1..32, records how many clock() ticks one strided shared-memory update takes.
+// `time` must have room for 32 ints; only thread 0 writes to it, one entry per stride.
+__global__ void increment(int* time) {
+    __shared__ float s[1024];  // scratch array that the strided accesses below hit
+    for (int i = 0; i < 1024; i++) {  // not indexed by threadIdx: every thread fills the whole array with 1.0f
+        s[i] = 1.0f;
+    }
+    __syncthreads();  // block-wide barrier so initialization is complete before any timed access
+
+    for (int i = 0; i < 32; i++) {  // iteration i measures stride i + 1
+        int start = clock();  // clock() value just before the access
+        // enable broadcast by accessing the same element in a bank:
+        // s[threadIdx.x * (i + 1) % 32] += 1.0f;
+        s[threadIdx.x * (i + 1)] += 1.0f;  // stride: i + 1; index stays below 1024 only while threadIdx.x <= 31
+        int end = clock();  // clock() value just after the access
+        if (threadIdx.x == 0) {  // only thread 0 reports its own measurement
+            time[i] = end - start;
+        }
+    }
+}
+
+// Host driver: launches increment with one block of N threads, then prints the 32 tick counts it reported.
+int main() {
+    int *h_time;  // host-side copy of the measurements
+    int* d_time;  // device buffer the kernel writes into
+
+    // NOTE(review): host buffer is sized with a literal 32, device buffer with N; they match only because N is 32.
+    h_time = (int*)malloc(32 * sizeof(int));
+    cudaMalloc(&d_time, N * sizeof(int));  // NOTE(review): return value is not checked
+
+    // launch the kernel: a single block of N threads
+    increment<<<1, N>>>(d_time);
+    cudaError_t ierrSync = cudaGetLastError();  // picks up errors from the launch itself
+    if(ierrSync != cudaSuccess) { 
+        printf("Sync error: %s\n", cudaGetErrorString(ierrSync)); 
+    }
+
+    // wait for the kernel to finish; errors raised while it was running are returned here
+    cudaError_t ierrAsync = cudaDeviceSynchronize();
+    if(ierrAsync != cudaSuccess) {
+        printf("Async error: %s\n", cudaGetErrorString(ierrAsync)); 
+    }
+
+    cudaMemcpy(h_time, d_time, 32 * sizeof(int), cudaMemcpyDeviceToHost);  // copy results to the host (return value unchecked)
+    for (int i = 0; i < 32; i++) {  // print all 32 tick counts on one line, space-separated
+        printf("%d ", h_time[i]);
+    }
+    printf("%s", "\n");
+    // NOTE(review): execution reaches here even after an error was printed above; d_time and h_time are not released in this hunk.
+    return 0;
+}
No results found