Skip to content

Instantly share code, notes, and snippets.

@qfgaohao
Last active October 17, 2017 04:13
Show Gist options
  • Select an option

  • Save qfgaohao/ce5420faef59dd0fc627fc12c54fac4a to your computer and use it in GitHub Desktop.

Select an option

Save qfgaohao/ce5420faef59dd0fc627fc12c54fac4a to your computer and use it in GitHub Desktop.

Revisions

  1. qfgaohao revised this gist Oct 17, 2017. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions bank_conflicts_test.cu
    Original file line number Diff line number Diff line change
    @@ -49,6 +49,9 @@ int main() {
    printf("%d ", h_time[i]);
    }
    printf("%s", "\n");

    cudaFree(d_time);
    free(h_time);

    return 0;
    }
  2. qfgaohao created this gist Oct 11, 2017.
    54 changes: 54 additions & 0 deletions bank_conflicts_test.cu
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,54 @@
    #include <stdio.h>
    #include <stdlib.h>

    #define N (32)


    /*
     * Micro-benchmark kernel: times one shared-memory read-modify-write per
     * thread for each stride from 1 to 32 and reports the cycle counts seen by
     * thread 0.
     *
     * Parameters:
     *   time - device buffer with room for at least 32 ints; time[i] receives
     *          the clock() delta measured by thread 0 for stride i + 1.
     *
     * Launch expectations:
     *   - a single block (every block would write the same time[] slots);
     *   - blockDim.x <= 32, so the largest index touched, 31 * 32 = 992, stays
     *     inside the 1024-element shared array;
     *   - 4 KiB of static shared memory, no dynamic shared memory.
     */
    __global__ void increment(int* time) {
        const int kSharedElems = 1024;
        const int kStrides = 32;
        __shared__ float s[kSharedElems];

        // Cooperative fill: each thread initialises its own disjoint slice so
        // that no element is written by more than one thread. This happens
        // before the barrier and is therefore outside every timed region.
        for (int i = threadIdx.x; i < kSharedElems; i += blockDim.x) {
            s[i] = 1.0f;
        }
        __syncthreads();

        for (int i = 0; i < kStrides; i++) {
            int start = clock();
            // enable broadcast by accessing the same element in a bank:
            // s[threadIdx.x * (i + 1) % 32] += 1.0f;
            s[threadIdx.x * (i + 1)] += 1.0f; // stride: i + 1
            int end = clock();
            // Only thread 0 publishes its measurement for this stride.
            if (threadIdx.x == 0) {
                time[i] = end - start;
            }
        }
    }


    // Prints "<label>: <CUDA error string>" when err is not cudaSuccess.
    // Returns 1 on success and 0 on failure so calls can be chained with &&.
    static int reportCuda(const char* label, cudaError_t err) {
        if (err == cudaSuccess) {
            return 1;
        }
        printf("%s: %s\n", label, cudaGetErrorString(err));
        return 0;
    }

    /*
     * Host driver: launches increment() with one block of N threads, copies
     * the N per-stride timings back and prints them on a single line.
     *
     * Returns 0 when every step succeeded and 1 otherwise. Host and device
     * buffers are released on every path.
     */
    int main() {
        const size_t bytes = N * sizeof(int);
        int* h_time = (int*)malloc(bytes);
        int* d_time = NULL;
        int ok = 1;

        if (h_time == NULL) {
            printf("Host allocation failed\n");
            ok = 0;
        }

        ok = ok && reportCuda("Alloc error", cudaMalloc(&d_time, bytes));

        if (ok) {
            // set up the kernel; launch-configuration problems surface here
            increment<<<1, N>>>(d_time);
            ok = reportCuda("Sync error", cudaGetLastError());
        }

        // run the kernel; faults raised during execution surface here
        ok = ok && reportCuda("Async error", cudaDeviceSynchronize());

        ok = ok && reportCuda("Copy error",
                              cudaMemcpy(h_time, d_time, bytes, cudaMemcpyDeviceToHost));

        // Print only when the timings are known to be valid.
        if (ok) {
            for (int i = 0; i < N; i++) {
                printf("%d ", h_time[i]);
            }
            printf("%s", "\n");
        }

        // Both calls accept a null pointer, so they are safe on failure paths.
        cudaFree(d_time);
        free(h_time);

        return ok ? 0 : 1;
    }