Skip to content

Instantly share code, notes, and snippets.

@eruffaldi
Forked from allanmac/assert_cuda.c
Last active May 27, 2016 04:02
Show Gist options
  • Select an option

  • Save eruffaldi/f2cf1eee58100f092ec3 to your computer and use it in GitHub Desktop.

Select an option

Save eruffaldi/f2cf1eee58100f092ec3 to your computer and use it in GitHub Desktop.

Revisions

  1. eruffaldi revised this gist Feb 27, 2016. 3 changed files with 79 additions and 10 deletions.
    10 changes: 10 additions & 0 deletions CMakeLists.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,10 @@
    #sudo pip install --upgrade git+https://github.com/dav1dde/glad.git#egg=glad
    #python main.py --api gl=3.3 --generator=c --out-path=GL
    find_package(CUDA)

    find_package(GLFW)
    include_directories(${GLFW_INCLUDE_DIRS})
    include_directories(GL/include)

    cuda_add_executable(main main.c assert_cuda.c interop.c GL/src/glad.c kernel.cu)
    target_link_libraries(main ${GLFW_LIBRARY})
    47 changes: 46 additions & 1 deletion interop.c
    Original file line number Diff line number Diff line change
    @@ -7,7 +7,7 @@
    #include <GLFW/glfw3.h>
    #include <cuda_gl_interop.h>
    #include <stdlib.h>

    #include <stdio.h>
    //
    //
    //
    @@ -60,20 +60,42 @@ pxl_interop_create(const bool multi_gpu, const int fbo_count)
    interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
    interop->ca = calloc(fbo_count,sizeof(*(interop->ca)));

    fprintf(stderr,"creating buffers\n");
    #ifdef Opengl4_5
    // render buffer object w/a color buffer
    glCreateRenderbuffers(fbo_count,interop->rb);

    // frame buffer object
    glCreateFramebuffers(fbo_count,interop->fb);
    #else
    glGenRenderbuffers(fbo_count,interop->rb);
    glGenFramebuffers(fbo_count,interop->fb);
    #endif

    fprintf(stderr,"created buffers\n");

    // attach rbo to fbo
    for (int index=0; index<fbo_count; index++)
    {
    #ifdef Opengl4_5
    glNamedFramebufferRenderbuffer(interop->fb[index],
    GL_COLOR_ATTACHMENT0,
    GL_RENDERBUFFER,
    interop->rb[index]);

    #else
    glBindFramebuffer(GL_FRAMEBUFFER,interop->fb[index]);
    glFramebufferRenderbuffer(GL_FRAMEBUFFER,
    GL_COLOR_ATTACHMENT0,
    GL_RENDERBUFFER,
    interop->rb[index]);
    glBindFramebuffer(GL_FRAMEBUFFER,0);
    #endif
    }
    fprintf(stderr,"bound buffers\n");




    // return it
    return interop;
    @@ -128,8 +150,14 @@ pxl_interop_size_set(struct pxl_interop* const interop, const int width, const i
    if (interop->cgr[index] != NULL)
    cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));

    #ifdef Opengl4_5
    // resize rbo
    glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);
    #else
    glBindRenderbuffer(GL_RENDERBUFFER,interop->rb[index]);
    glRenderbufferStorage(GL_RENDERBUFFER,GL_RGBA8,width,height);
    glBindRenderbuffer(GL_RENDERBUFFER,0);
    #endif

    // probe fbo status
    // glCheckNamedFramebufferStatus(interop->fb[index],0);
    @@ -244,7 +272,13 @@ pxl_interop_clear(struct pxl_interop* const interop)
    */

    const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f };
    #ifdef Opengl4_5
    glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color);
    #else
    glBindFramebuffer(GL_FRAMEBUFFER,interop->fb[interop->index]);
    glClearBufferfv(GL_COLOR,0,clear_color);
    glBindFramebuffer(GL_FRAMEBUFFER,0);
    #endif
    }

    //
    @@ -254,11 +288,22 @@ pxl_interop_clear(struct pxl_interop* const interop)
    void
    pxl_interop_blit(struct pxl_interop* const interop)
    {
    #ifdef Opengl4_5
    glBlitNamedFramebuffer(interop->fb[interop->index],0,
    0,0, interop->width,interop->height,
    0,interop->height,interop->width,0,
    GL_COLOR_BUFFER_BIT,
    GL_NEAREST);
    #else
    glBindFramebuffer(GL_READ_FRAMEBUFFER,interop->fb[interop->index]);
    glBlitFramebuffer(
    0,0, interop->width,interop->height,
    0,interop->height,interop->width,0,
    GL_COLOR_BUFFER_BIT,
    GL_NEAREST);
    glBindFramebuffer(GL_READ_FRAMEBUFFER,0);

    #endif
    }

    //
    32 changes: 23 additions & 9 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -55,7 +55,7 @@ pxl_glfw_fps(GLFWwindow* window)

    glfwGetFramebufferSize(window,&width,&height);

    sprintf_s(tmp,64,"(%u x %u) - FPS: %.2f",width,height,fps);
    sprintf(tmp,"(%u x %u) - FPS: %.2f",width,height,fps);

    glfwSetWindowTitle(window,tmp);

    @@ -100,12 +100,17 @@ pxl_glfw_init(GLFWwindow** window, const int width, const int height)
    glfwWindowHint(GLFW_DEPTH_BITS, 0);
    glfwWindowHint(GLFW_STENCIL_BITS, 0);

    #ifdef Opengl_45
    glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE);

    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5);

    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
    #else
    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
    glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
    #endif

    #ifdef PXL_FULLSCREEN
    GLFWmonitor* monitor = glfwGetPrimaryMonitor();
    @@ -175,16 +180,21 @@ main(int argc, char* argv[])

    pxl_glfw_init(&window,1024,1024);


    //
    // INIT CUDA
    //
    cudaError_t cuda_err;

    int gl_device_id,gl_device_count;
    int gl_device_id=0,gl_device_count=0,cuda_device_id=0;
    #ifdef Opengl_45
    cuda_err = cuda(GLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll));

    int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
    cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
    cuda_err = cuda(SetDevice(cuda_device_id));
    #else
    cudaGLSetGLDevice(0);
    #endif


    //
    // MULTI-GPU?
    @@ -208,14 +218,17 @@ main(int argc, char* argv[])
    cudaStream_t stream;
    cudaEvent_t event;

    fprintf(stderr,"before create\n");
    cuda_err = cuda(StreamCreateWithFlags(&stream,cudaStreamDefault)); // optionally ignore default stream behavior
    cuda_err = cuda(EventCreateWithFlags(&event,cudaEventBlockingSync)); // | cudaEventDisableTiming);

    //
    // CREATE INTEROP
    //
    // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
    fprintf(stderr,"pxl_interop_create\n");
    struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2);
    fprintf(stderr,"after pxl_interop_create\n");

    //
    // RESIZE INTEROP
    @@ -236,6 +249,7 @@ main(int argc, char* argv[])
    glfwSetKeyCallback (window,pxl_glfw_key_callback);
    glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);

    fprintf(stderr,"loop\n");
    //
    // LOOP UNTIL DONE
    //
  2. Allan MacKinnon revised this gist Nov 27, 2015. 2 changed files with 3 additions and 10 deletions.
    5 changes: 0 additions & 5 deletions assert_cuda.c
    Original file line number Diff line number Diff line change
    @@ -1,8 +1,3 @@
    /*
    * Copyright 2015 Allan MacKinnon. All rights reserved.
    *
    */

    //
    //
    //
    8 changes: 3 additions & 5 deletions assert_cuda.h
    Original file line number Diff line number Diff line change
    @@ -1,8 +1,6 @@

    /*
    * Copyright 2015 Allan MacKinnon. All rights reserved.
    *
    */
    //
    //
    //

    #pragma once

  3. Allan MacKinnon revised this gist Nov 27, 2015. 2 changed files with 69 additions and 0 deletions.
    42 changes: 42 additions & 0 deletions assert_cuda.c
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,42 @@
    /*
    * Copyright 2015 Allan MacKinnon. All rights reserved.
    *
    */

    //
    //
    //

    #include <stdlib.h>
    #include <stdio.h>

    //
    //
    //

    #include "assert_cuda.h"

    //
    //
    //

    cudaError_t
    cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
    {
    if (code != cudaSuccess)
    {
    fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line);

    if (abort)
    {
    cudaDeviceReset();
    exit(code);
    }
    }

    return code;
    }

    //
    //
    //
    27 changes: 27 additions & 0 deletions assert_cuda.h
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,27 @@

    /*
    * Copyright 2015 Allan MacKinnon. All rights reserved.
    *
    */

    #pragma once

    //
    //
    //

    #include <cuda_runtime.h>
    #include <stdbool.h>

    //
    // Beware that NVCC doesn't work with C files and __VA_ARGS__
    //

    cudaError_t cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort);

    #define cuda(...) cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true);

    //
    //
    //

  4. Allan MacKinnon revised this gist Nov 27, 2015. 3 changed files with 51 additions and 61 deletions.
    35 changes: 18 additions & 17 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -12,6 +12,7 @@
    //
    //

    #include "assert_cuda.h"
    #include "interop.h"

    //
    @@ -88,7 +89,7 @@ pxl_interop_destroy(struct pxl_interop* const interop)
    for (int index=0; index<interop->count; index++)
    {
    if (interop->cgr[index] != NULL)
    cuda_err = cudaGraphicsUnregisterResource(interop->cgr[index]);
    cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));
    }

    // delete rbo's
    @@ -125,7 +126,7 @@ pxl_interop_size_set(struct pxl_interop* const interop, const int width, const i
    {
    // unregister resource
    if (interop->cgr[index] != NULL)
    cuda_err = cudaGraphicsUnregisterResource(interop->cgr[index]);
    cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));

    // resize rbo
    glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);
    @@ -134,26 +135,26 @@ pxl_interop_size_set(struct pxl_interop* const interop, const int width, const i
    // glCheckNamedFramebufferStatus(interop->fb[index],0);

    // register rbo
    cuda_err = cudaGraphicsGLRegisterImage(&interop->cgr[index],
    interop->rb[index],
    GL_RENDERBUFFER,
    cudaGraphicsRegisterFlagsSurfaceLoadStore |
    cudaGraphicsRegisterFlagsWriteDiscard);
    cuda_err = cuda(GraphicsGLRegisterImage(&interop->cgr[index],
    interop->rb[index],
    GL_RENDERBUFFER,
    cudaGraphicsRegisterFlagsSurfaceLoadStore |
    cudaGraphicsRegisterFlagsWriteDiscard));
    }

    // map graphics resources
    cuda_err = cudaGraphicsMapResources(interop->count,interop->cgr,0);
    cuda_err = cuda(GraphicsMapResources(interop->count,interop->cgr,0));

    // get CUDA Array refernces
    for (int index=0; index<interop->count; index++)
    {
    cuda_err = cudaGraphicsSubResourceGetMappedArray(&interop->ca[index],
    interop->cgr[index],
    0,0);
    cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[index],
    interop->cgr[index],
    0,0));
    }

    // unmap graphics resources
    cuda_err = cudaGraphicsUnmapResources(interop->count,interop->cgr,0);
    cuda_err = cuda(GraphicsUnmapResources(interop->count,interop->cgr,0));

    return cuda_err;
    }
    @@ -176,7 +177,7 @@ pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
    return cudaSuccess;

    // map graphics resources
    return cudaGraphicsMapResources(1,&interop->cgr[interop->index],stream);
    return cuda(GraphicsMapResources(1,&interop->cgr[interop->index],stream));
    }

    cudaError_t
    @@ -185,7 +186,7 @@ pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
    if (!interop->multi_gpu)
    return cudaSuccess;

    return cudaGraphicsUnmapResources(1,&interop->cgr[interop->index],stream);
    return cuda(GraphicsUnmapResources(1,&interop->cgr[interop->index],stream));
    }

    cudaError_t
    @@ -198,9 +199,9 @@ pxl_interop_array_map(struct pxl_interop* const interop)
    cudaError_t cuda_err;

    // get a CUDA Array
    cuda_err = cudaGraphicsSubResourceGetMappedArray(&interop->ca[interop->index],
    interop->cgr[interop->index],
    0,0);
    cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[interop->index],
    interop->cgr[interop->index],
    0,0));
    return cuda_err;
    }

    26 changes: 22 additions & 4 deletions kernel.cu
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,24 @@
    // -*- compile-command: "nvcc -m 32 -arch sm_30 -Xptxas=-v -cubin kernel.cu"; -*-
    // -*- compile-command: "nvcc arch sm_50 -Xptxas=-v -cubin kernel.cu"; -*-

    //
    //
    //

    #ifdef __cplusplus
    extern "C" {
    #endif

    #include "assert_cuda.h"

    #ifdef __cplusplus
    }
    #endif

    //
    //
    //

    #define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor

    //
    //
    @@ -69,8 +89,6 @@ pxl_kernel(const int width, const int height)
    //
    //

    #define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor

    extern "C"
    cudaError_t
    pxl_kernel_launcher(cudaArray_const_t array,
    @@ -83,7 +101,7 @@ pxl_kernel_launcher(cudaArray_const_t array,

    // cuda_err = cudaEventRecord(event,stream);

    cuda_err = cudaBindSurfaceToArray(surf,array);
    cuda_err = cuda(BindSurfaceToArray(surf,array));

    if (cuda_err)
    return cuda_err;
    51 changes: 11 additions & 40 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -23,6 +23,7 @@
    //
    //

    #include "assert_cuda.h"
    #include "interop.h"

    //
    @@ -170,39 +171,35 @@ main(int argc, char* argv[])
    //
    // INIT GLFW
    //

    GLFWwindow* window;

    pxl_glfw_init(&window,1024,1024);

    //
    // INIT CUDA
    //

    cudaError_t cuda_err;

    int gl_device_id,gl_device_count;
    cuda_err = cudaGLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll);
    cuda_err = cuda(GLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll));

    int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
    cuda_err = cudaSetDevice(cuda_device_id);
    cuda_err = cuda(SetDevice(cuda_device_id));

    //
    // MULTI-GPU?
    //

    const bool multi_gpu = gl_device_id != cuda_device_id;

    //
    // INFO
    //

    struct cudaDeviceProp props;

    cuda_err = cudaGetDeviceProperties(&props,gl_device_id);
    cuda_err = cuda(GetDeviceProperties(&props,gl_device_id));
    printf("GL : %-24s (%2d)\n",props.name,props.multiProcessorCount);

    cuda_err = cudaGetDeviceProperties(&props,cuda_device_id);
    cuda_err = cuda(GetDeviceProperties(&props,cuda_device_id));
    printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);

    //
    @@ -211,32 +208,14 @@ main(int argc, char* argv[])
    cudaStream_t stream;
    cudaEvent_t event;

    cuda_err = cudaStreamCreateWithFlags(&stream,cudaStreamDefault); // optionally ignore default stream behavior
    cuda_err = cudaEventCreateWithFlags(&event,cudaEventBlockingSync); // | cudaEventDisableTiming);

    //
    // UPDATE DEVICE LIMITS
    //

    // #ifdef _DEBUG
    // check printf() FIFO limit
    size_t fifo;

    cuda_err = cudaDeviceGetLimit(&fifo,cudaLimitPrintfFifoSize);

    printf("fifo = %Iu\n",fifo);

    cuda_err = cudaDeviceSetLimit(cudaLimitPrintfFifoSize,fifo*16);
    cuda_err = cudaDeviceGetLimit(&fifo,cudaLimitPrintfFifoSize);

    printf("fifo = %Iu\n",fifo);
    // #endif
    cuda_err = cuda(StreamCreateWithFlags(&stream,cudaStreamDefault)); // optionally ignore default stream behavior
    cuda_err = cuda(EventCreateWithFlags(&event,cudaEventBlockingSync)); // | cudaEventDisableTiming);

    //
    // CREATE INTEROP
    //

    struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2); // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
    // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
    struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2);

    //
    // RESIZE INTEROP
    @@ -253,27 +232,23 @@ main(int argc, char* argv[])
    //
    // SET USER POINTER AND CALLBACKS
    //

    glfwSetWindowUserPointer (window,interop);
    glfwSetKeyCallback (window,pxl_glfw_key_callback);
    glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);

    //
    // LOOP UNTIL DONE
    //

    while (!glfwWindowShouldClose(window))
    {
    //
    // MONITOR FPS
    //

    pxl_glfw_fps(window);

    //
    // EXECUTE CUDA KERNEL ON RENDER BUFFER
    //

    int width,height;
    cudaArray_t cuda_array;

    @@ -292,38 +267,34 @@ main(int argc, char* argv[])
    //
    // BLIT & SWAP FBO
    //

    pxl_interop_blit(interop);
    // pxl_interop_clear(interop);
    pxl_interop_swap(interop);

    //
    // SWAP WINDOW
    //

    glfwSwapBuffers(window);

    //
    // PUMP/POLL/WAIT
    //

    glfwPollEvents(); // glfwWaitEvents();
    }

    //
    // CLEANUP
    //

    pxl_interop_destroy(interop);

    glfwDestroyWindow(window);

    glfwTerminate();

    cudaDeviceReset();
    cuda(DeviceReset());

    // missing some clean up here

    exit(EXIT_SUCCESS);
    }

  5. Allan MacKinnon revised this gist Nov 17, 2015. 1 changed file with 18 additions and 2 deletions.
    20 changes: 18 additions & 2 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -182,11 +182,9 @@ main(int argc, char* argv[])
    cudaError_t cuda_err;

    int gl_device_id,gl_device_count;

    cuda_err = cudaGLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll);

    int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;

    cuda_err = cudaSetDevice(cuda_device_id);

    //
    @@ -216,6 +214,24 @@ main(int argc, char* argv[])
    cuda_err = cudaStreamCreateWithFlags(&stream,cudaStreamDefault); // optionally ignore default stream behavior
    cuda_err = cudaEventCreateWithFlags(&event,cudaEventBlockingSync); // | cudaEventDisableTiming);

    //
    // UPDATE DEVICE LIMITS
    //

    // #ifdef _DEBUG
    // check printf() FIFO limit
    size_t fifo;

    cuda_err = cudaDeviceGetLimit(&fifo,cudaLimitPrintfFifoSize);

    printf("fifo = %Iu\n",fifo);

    cuda_err = cudaDeviceSetLimit(cudaLimitPrintfFifoSize,fifo*16);
    cuda_err = cudaDeviceGetLimit(&fifo,cudaLimitPrintfFifoSize);

    printf("fifo = %Iu\n",fifo);
    // #endif

    //
    // CREATE INTEROP
    //
  6. Allan MacKinnon revised this gist Jul 23, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion main.c
    Original file line number Diff line number Diff line change
    @@ -220,7 +220,7 @@ main(int argc, char* argv[])
    // CREATE INTEROP
    //

    struct pxl_interop* const interop = pxl_interop_create(false /*multi_gpu*/,2); // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
    struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2); // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE

    //
    // RESIZE INTEROP
  7. Allan MacKinnon revised this gist May 16, 2015. 3 changed files with 32 additions and 14 deletions.
    2 changes: 1 addition & 1 deletion interop.c
    Original file line number Diff line number Diff line change
    @@ -49,7 +49,7 @@ pxl_interop_create(const bool multi_gpu, const int fbo_count)
    {
    struct pxl_interop* const interop = calloc(1,sizeof(*interop));

    interop->multi_gpu = true; // multi_gpu;
    interop->multi_gpu = multi_gpu;
    interop->count = fbo_count;
    interop->index = 0;

    18 changes: 13 additions & 5 deletions kernel.cu
    Original file line number Diff line number Diff line change
    @@ -29,7 +29,7 @@ union pxl_rgbx_24
    extern "C"
    __global__
    void
    pxl_kernel(const int width, const int height, const int index)
    pxl_kernel(const int width, const int height)
    {
    // pixel coordinates
    const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
    @@ -54,7 +54,7 @@ pxl_kernel(const int width, const int height, const int index)
    #else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT
    const bool border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1);
    union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : index==0 ? 0xFF000000 : 0xFFFFFFFF };
    union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : 0xFF000000 };
    #endif

    @@ -76,19 +76,27 @@ cudaError_t
    pxl_kernel_launcher(cudaArray_const_t array,
    const int width,
    const int height,
    const int index,
    cudaEvent_t event,
    cudaStream_t stream)
    {
    cudaError_t cuda_err = cudaBindSurfaceToArray(surf,array);
    cudaError_t cuda_err;

    // cuda_err = cudaEventRecord(event,stream);

    cuda_err = cudaBindSurfaceToArray(surf,array);

    if (cuda_err)
    return cuda_err;

    const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;

    // cuda_err = cudaEventRecord(event,stream);

    if (blocks > 0)
    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height,index);
    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height);

    // cuda_err = cudaStreamWaitEvent(stream,event,0);

    return cudaSuccess;
    }

    26 changes: 18 additions & 8 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -157,7 +157,7 @@ cudaError_t
    pxl_kernel_launcher(cudaArray_const_t array,
    const int width,
    const int height,
    const int index,
    cudaEvent_t event,
    cudaStream_t stream);

    //
    @@ -207,11 +207,20 @@ main(int argc, char* argv[])
    cuda_err = cudaGetDeviceProperties(&props,cuda_device_id);
    printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);

    //
    // CREATE CUDA STREAM & EVENT
    //
    cudaStream_t stream;
    cudaEvent_t event;

    cuda_err = cudaStreamCreateWithFlags(&stream,cudaStreamDefault); // optionally ignore default stream behavior
    cuda_err = cudaEventCreateWithFlags(&event,cudaEventBlockingSync); // | cudaEventDisableTiming);

    //
    // CREATE INTEROP
    //

    struct pxl_interop* const interop = pxl_interop_create(multi_gpu,2);
    struct pxl_interop* const interop = pxl_interop_create(false /*multi_gpu*/,2); // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE

    //
    // RESIZE INTEROP
    @@ -254,21 +263,22 @@ main(int argc, char* argv[])

    pxl_interop_size_get(interop,&width,&height);

    cuda_err = pxl_interop_map(interop,0);
    cuda_err = pxl_interop_map(interop,stream);

    cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
    width,height,
    pxl_interop_index_get(interop),
    0);
    width,
    height,
    event,
    stream);

    cuda_err = pxl_interop_unmap(interop,0);
    cuda_err = pxl_interop_unmap(interop,stream);

    //
    // BLIT & SWAP FBO
    //

    pxl_interop_blit(interop);
    pxl_interop_clear(interop);
    // pxl_interop_clear(interop);
    pxl_interop_swap(interop);

    //
  8. Allan MacKinnon revised this gist May 14, 2015. 2 changed files with 6 additions and 9 deletions.
    14 changes: 5 additions & 9 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -49,7 +49,7 @@ pxl_interop_create(const bool multi_gpu, const int fbo_count)
    {
    struct pxl_interop* const interop = calloc(1,sizeof(*interop));

    interop->multi_gpu = multi_gpu;
    interop->multi_gpu = true; // multi_gpu;
    interop->count = fbo_count;
    interop->index = 0;

    @@ -238,16 +238,12 @@ void
    pxl_interop_clear(struct pxl_interop* const interop)
    {
    /*
    const GLenum draw_buffer[] = { GL_COLOR_ATTACHMENT0 };
    const GLuint clear_color[] = { 255, 0, 0, 255 };
    glNamedFramebufferDrawBuffers(interop->fb0,1,draw_buffer);
    glClearNamedFramebufferuiv(interop->fb0,GL_COLOR,0,clear_color);
    */

    static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };

    glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments);
    */

    const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f };
    glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color);
    }

    //
    1 change: 1 addition & 0 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -268,6 +268,7 @@ main(int argc, char* argv[])
    //

    pxl_interop_blit(interop);
    pxl_interop_clear(interop);
    pxl_interop_swap(interop);

    //
  9. Allan MacKinnon revised this gist May 13, 2015. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -47,8 +47,6 @@ struct pxl_interop
    struct pxl_interop*
    pxl_interop_create(const bool multi_gpu, const int fbo_count)
    {
    cudaError_t cuda_err;

    struct pxl_interop* const interop = calloc(1,sizeof(*interop));

    interop->multi_gpu = multi_gpu;
  10. Allan MacKinnon revised this gist May 13, 2015. 2 changed files with 4 additions and 2 deletions.
    4 changes: 4 additions & 0 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -193,6 +193,10 @@ pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
    cudaError_t
    pxl_interop_array_map(struct pxl_interop* const interop)
    {
    //
    // FIXME -- IS THIS EVEN NEEDED?
    //

    cudaError_t cuda_err;

    // get a CUDA Array
    2 changes: 0 additions & 2 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -256,8 +256,6 @@ main(int argc, char* argv[])

    cuda_err = pxl_interop_map(interop,0);

    // cuda_err = pxl_interop_array_map(interop); // NOT NEEDED ANYMORE?

    cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
    width,height,
    pxl_interop_index_get(interop),
  11. Allan MacKinnon revised this gist May 13, 2015. 3 changed files with 22 additions and 26 deletions.
    28 changes: 15 additions & 13 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -20,6 +20,9 @@

    struct pxl_interop
    {
    // split GPUs?
    bool multi_gpu;

    // number of fbo's
    int count;
    int index;
    @@ -42,14 +45,15 @@ struct pxl_interop
    //

    struct pxl_interop*
    pxl_interop_create(const int fbo_count)
    pxl_interop_create(const bool multi_gpu, const int fbo_count)
    {
    cudaError_t cuda_err;

    struct pxl_interop* const interop = calloc(1,sizeof(*interop));

    interop->count = fbo_count;
    interop->index = 0;
    interop->multi_gpu = multi_gpu;
    interop->count = fbo_count;
    interop->index = 0;

    // allocate arrays
    interop->fb = calloc(fbo_count,sizeof(*(interop->fb )));
    @@ -170,22 +174,20 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
    {
    cudaError_t cuda_err;

    if (!interop->multi_gpu)
    return cudaSuccess;

    // map graphics resources
    cuda_err = cudaGraphicsMapResources(1,&interop->cgr[interop->index],stream);

    return cuda_err;
    return cudaGraphicsMapResources(1,&interop->cgr[interop->index],stream);
    }

    cudaError_t
    pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
    {
    cudaError_t cuda_err;

    cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr[interop->index],stream);

    return cuda_err;
    if (!interop->multi_gpu)
    return cudaSuccess;

    return cudaGraphicsUnmapResources(1,&interop->cgr[interop->index],stream);
    }

    cudaError_t
    3 changes: 2 additions & 1 deletion interop.h
    Original file line number Diff line number Diff line change
    @@ -10,13 +10,14 @@
    //

    #include <cuda_runtime.h>
    #include <stdbool.h>

    //
    //
    //

    struct pxl_interop*
    pxl_interop_create(const int fbo_count);
    pxl_interop_create(const bool multi_gpu, const int fbo_count);

    void
    pxl_interop_destroy(struct pxl_interop* const interop);
    17 changes: 5 additions & 12 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -211,7 +211,7 @@ main(int argc, char* argv[])
    // CREATE INTEROP
    //

    struct pxl_interop* const interop = pxl_interop_create(2);
    struct pxl_interop* const interop = pxl_interop_create(multi_gpu,2);

    //
    // RESIZE INTEROP
    @@ -237,8 +237,6 @@ main(int argc, char* argv[])
    // LOOP UNTIL DONE
    //

    int step = 0;

    while (!glfwWindowShouldClose(window))
    {
    //
    @@ -256,21 +254,16 @@ main(int argc, char* argv[])

    pxl_interop_size_get(interop,&width,&height);

    if (multi_gpu)
    {
    cuda_err = pxl_interop_map(interop,0);
    // cuda_err = pxl_interop_array_map(interop); // NOT NEEDED?
    }
    cuda_err = pxl_interop_map(interop,0);

    // cuda_err = pxl_interop_array_map(interop); // NOT NEEDED ANYMORE?

    cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
    width,height,
    pxl_interop_index_get(interop),
    0);

    if (multi_gpu)
    {
    cuda_err = pxl_interop_unmap(interop,0);
    }
    cuda_err = pxl_interop_unmap(interop,0);

    //
    // BLIT & SWAP FBO
  12. Allan MacKinnon revised this gist May 13, 2015. 3 changed files with 14 additions and 33 deletions.
    33 changes: 8 additions & 25 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -35,9 +35,6 @@ struct pxl_interop
    // CUDA resources
    cudaGraphicsResource_t* cgr;
    cudaArray_t* ca;

    // CUDA streams
    cudaStream_t* stream;
    };

    //
    @@ -55,11 +52,10 @@ pxl_interop_create(const int fbo_count)
    interop->index = 0;

    // allocate arrays
    interop->fb = calloc(fbo_count,sizeof(*(interop->fb )));
    interop->rb = calloc(fbo_count,sizeof(*(interop->rb )));
    interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
    interop->ca = calloc(fbo_count,sizeof(*(interop->ca)));
    interop->stream = calloc(fbo_count,sizeof(*(interop->stream)));
    interop->fb = calloc(fbo_count,sizeof(*(interop->fb )));
    interop->rb = calloc(fbo_count,sizeof(*(interop->rb )));
    interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
    interop->ca = calloc(fbo_count,sizeof(*(interop->ca)));

    // render buffer object w/a color buffer
    glCreateRenderbuffers(fbo_count,interop->rb);
    @@ -74,8 +70,6 @@ pxl_interop_create(const int fbo_count)
    GL_COLOR_ATTACHMENT0,
    GL_RENDERBUFFER,
    interop->rb[index]);

    cuda_err = cudaStreamCreate(&interop->stream[index]);
    }

    // return it
    @@ -93,8 +87,6 @@ pxl_interop_destroy(struct pxl_interop* const interop)
    {
    if (interop->cgr[index] != NULL)
    cuda_err = cudaGraphicsUnregisterResource(interop->cgr[index]);

    cuda_err = cudaStreamDestroy(interop->stream[index]);
    }

    // delete rbo's
    @@ -108,7 +100,6 @@ pxl_interop_destroy(struct pxl_interop* const interop)
    free(interop->rb);
    free(interop->cgr);
    free(interop->ca);
    free(interop->stream);

    // free interop
    free(interop);
    @@ -177,24 +168,22 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
    //

    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop)
    pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
    {
    cudaError_t cuda_err;

    // map graphics resources
    cuda_err = cudaGraphicsMapResources(1,&interop->cgr[interop->index],
    interop->stream[interop->index]);
    cuda_err = cudaGraphicsMapResources(1,&interop->cgr[interop->index],stream);

    return cuda_err;
    }

    cudaError_t
    pxl_interop_unmap(struct pxl_interop* const interop)
    pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
    {
    cudaError_t cuda_err;

    cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr[interop->index],
    interop->stream[interop->index]);
    cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr[interop->index],stream);

    return cuda_err;
    }
    @@ -221,12 +210,6 @@ pxl_interop_array_get(struct pxl_interop* const interop)
    return interop->ca[interop->index];
    }

    cudaStream_t
    pxl_interop_stream_get(struct pxl_interop* const interop)
    {
    return interop->stream[interop->index];
    }

    int
    pxl_interop_index_get(struct pxl_interop* const interop)
    {
    6 changes: 3 additions & 3 deletions interop.h
    Original file line number Diff line number Diff line change
    @@ -36,13 +36,13 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
    //

    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop);
    pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream);

    cudaError_t
    pxl_interop_array_map(struct pxl_interop* const interop);
    pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);

    cudaError_t
    pxl_interop_unmap(struct pxl_interop* const interop);
    pxl_interop_array_map(struct pxl_interop* const interop);

    //
    //
    8 changes: 3 additions & 5 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -258,27 +258,25 @@ main(int argc, char* argv[])

    if (multi_gpu)
    {
    cuda_err = pxl_interop_map(interop);
    cuda_err = pxl_interop_map(interop,0);
    // cuda_err = pxl_interop_array_map(interop); // NOT NEEDED?
    }

    cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
    width,height,
    pxl_interop_index_get(interop),
    pxl_interop_stream_get(interop));
    0);

    if (multi_gpu)
    {
    cuda_err = pxl_interop_unmap(interop);
    // cuda_err = cudaStreamSynchronize(pxl_interop_stream_get(interop)); // NOT NEEDED?
    cuda_err = pxl_interop_unmap(interop,0);
    }

    //
    // BLIT & SWAP FBO
    //

    pxl_interop_blit(interop);

    pxl_interop_swap(interop);

    //
  13. Allan MacKinnon revised this gist May 13, 2015. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions kernel.cu
    Original file line number Diff line number Diff line change
    @@ -37,6 +37,7 @@ pxl_kernel(const int width, const int height, const int index)
    const int y = idx / width;

    #if 1

    // pixel color
    const int t = (unsigned int)clock() / 1100000; // 1.1 GHz
    const int xt = (idx + t) % width;
  14. Allan MacKinnon revised this gist May 13, 2015. 4 changed files with 30 additions and 25 deletions.
    22 changes: 12 additions & 10 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -176,16 +176,6 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
    //
    //

    cudaStream_t
    pxl_interop_stream_get(struct pxl_interop* const interop)
    {
    return interop->stream[interop->index];
    }

    //
    //
    //

    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop)
    {
    @@ -231,6 +221,18 @@ pxl_interop_array_get(struct pxl_interop* const interop)
    return interop->ca[interop->index];
    }

    cudaStream_t
    pxl_interop_stream_get(struct pxl_interop* const interop)
    {
    return interop->stream[interop->index];
    }

    int
    pxl_interop_index_get(struct pxl_interop* const interop)
    {
    return interop->index;
    }

    //
    //
    //
    13 changes: 6 additions & 7 deletions interop.h
    Original file line number Diff line number Diff line change
    @@ -35,13 +35,6 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
    //
    //

    cudaStream_t
    pxl_interop_stream_get(struct pxl_interop* const interop);

    //
    //
    //

    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop);

    @@ -58,6 +51,12 @@ pxl_interop_unmap(struct pxl_interop* const interop);
    cudaArray_const_t
    pxl_interop_array_get(struct pxl_interop* const interop);

    cudaStream_t
    pxl_interop_stream_get(struct pxl_interop* const interop);

    int
    pxl_interop_index_get(struct pxl_interop* const interop);

    //
    //
    //
    9 changes: 5 additions & 4 deletions kernel.cu
    Original file line number Diff line number Diff line change
    @@ -29,7 +29,7 @@ union pxl_rgbx_24
    extern "C"
    __global__
    void
    pxl_kernel(const int width, const int height)
    pxl_kernel(const int width, const int height, const int index)
    {
    // pixel coordinates
    const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
    @@ -38,7 +38,7 @@ pxl_kernel(const int width, const int height)

    #if 1
    // pixel color
    const int t = (unsigned int)clock() / 1250000; // 1.25 GHz
    const int t = (unsigned int)clock() / 1100000; // 1.1 GHz
    const int xt = (idx + t) % width;
    const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f);
    const unsigned int bar = ((y + t) / 32) & 3;
    @@ -53,7 +53,7 @@ pxl_kernel(const int width, const int height)
    #else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT
    const bool border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1);
    union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : 0xFF000000 };
    union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : index==0 ? 0xFF000000 : 0xFFFFFFFF };
    #endif

    @@ -75,6 +75,7 @@ cudaError_t
    pxl_kernel_launcher(cudaArray_const_t array,
    const int width,
    const int height,
    const int index,
    cudaStream_t stream)
    {
    cudaError_t cuda_err = cudaBindSurfaceToArray(surf,array);
    @@ -85,7 +86,7 @@ pxl_kernel_launcher(cudaArray_const_t array,
    const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;

    if (blocks > 0)
    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height);
    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height,index);

    return cudaSuccess;
    }
    11 changes: 7 additions & 4 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -157,6 +157,7 @@ cudaError_t
    pxl_kernel_launcher(cudaArray_const_t array,
    const int width,
    const int height,
    const int index,
    cudaStream_t stream);

    //
    @@ -258,28 +259,30 @@ main(int argc, char* argv[])
    if (multi_gpu)
    {
    cuda_err = pxl_interop_map(interop);
    // cuda_err = pxl_interop_array_map(interop);
    // cuda_err = pxl_interop_array_map(interop); // NOT NEEDED?
    }

    cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
    width,height,
    pxl_interop_index_get(interop),
    pxl_interop_stream_get(interop));

    if (multi_gpu)
    {
    cuda_err = pxl_interop_unmap(interop);
    // cuda_err = cudaStreamSynchronize(pxl_interop_stream_get(interop));
    // cuda_err = cudaStreamSynchronize(pxl_interop_stream_get(interop)); // NOT NEEDED?
    }

    //
    // BLIT
    // BLIT & SWAP FBO
    //

    pxl_interop_blit(interop);

    pxl_interop_swap(interop);

    //
    // SWAP
    // SWAP WINDOW
    //

    glfwSwapBuffers(window);
  15. Allan MacKinnon revised this gist May 13, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion main.c
    Original file line number Diff line number Diff line change
    @@ -268,7 +268,7 @@ main(int argc, char* argv[])
    if (multi_gpu)
    {
    cuda_err = pxl_interop_unmap(interop);
    cuda_err = cudaStreamSynchronize(pxl_interop_stream_get(interop));
    // cuda_err = cudaStreamSynchronize(pxl_interop_stream_get(interop));
    }

    //
  16. Allan MacKinnon revised this gist May 13, 2015. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion main.c
    Original file line number Diff line number Diff line change
    @@ -258,7 +258,7 @@ main(int argc, char* argv[])
    if (multi_gpu)
    {
    cuda_err = pxl_interop_map(interop);
    cuda_err = pxl_interop_array_map(interop);
    // cuda_err = pxl_interop_array_map(interop);
    }

    cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
  17. Allan MacKinnon revised this gist May 13, 2015. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -20,6 +20,7 @@

    struct pxl_interop
    {
    // number of fbo's
    int count;
    int index;

  18. Allan MacKinnon revised this gist May 13, 2015. 3 changed files with 132 additions and 45 deletions.
    118 changes: 86 additions & 32 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -35,8 +35,8 @@ struct pxl_interop
    cudaGraphicsResource_t* cgr;
    cudaArray_t* ca;

    // CUDA stream
    cudaStream_t stream;
    // CUDA streams
    cudaStream_t* stream;
    };

    //
    @@ -46,33 +46,36 @@ struct pxl_interop
    struct pxl_interop*
    pxl_interop_create(const int fbo_count)
    {
    cudaError_t cuda_err;

    struct pxl_interop* const interop = calloc(1,sizeof(*interop));

    interop->count = fbo_count;
    interop->index = 0;

    // allocate arrays
    interop->fb = calloc(fbo_count,sizeof(*(interop->fb )));
    interop->rb = calloc(fbo_count,sizeof(*(interop->rb )));
    interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
    interop->ca = calloc(fbo_count,sizeof(*(interop->ca)));
    interop->fb = calloc(fbo_count,sizeof(*(interop->fb )));
    interop->rb = calloc(fbo_count,sizeof(*(interop->rb )));
    interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
    interop->ca = calloc(fbo_count,sizeof(*(interop->ca)));
    interop->stream = calloc(fbo_count,sizeof(*(interop->stream)));

    // render buffer object w/a color buffer
    glCreateRenderbuffers(fbo_count,interop->rb);

    // frame buffer object
    glCreateFramebuffers(fbo_count,interop->fb);

    // attach rb0 to the fb0
    // attach rbo to fbo
    for (int index=0; index<fbo_count; index++)
    {
    glNamedFramebufferRenderbuffer(interop->fb[index],
    GL_COLOR_ATTACHMENT0,
    GL_RENDERBUFFER,
    interop->rb[index]);
    }

    // create an interop stream
    cudaError_t cuda_err = cudaStreamCreate(&interop->stream);
    cuda_err = cudaStreamCreate(&interop->stream[index]);
    }

    // return it
    return interop;
    @@ -82,23 +85,30 @@ pxl_interop_create(const int fbo_count)
    void
    pxl_interop_destroy(struct pxl_interop* const interop)
    {
    // resize color buffer
    cudaError_t cuda_err;

    // unregister CUDA resources
    for (int index=0; index<interop->count; index++)
    {
    // unregister CUDA resource
    if (interop->cgr[index] != NULL)
    cudaGraphicsUnregisterResource(interop->cgr[index]);
    }
    cuda_err = cudaGraphicsUnregisterResource(interop->cgr[index]);

    // destroy stream
    cudaStreamDestroy(interop->stream);
    cuda_err = cudaStreamDestroy(interop->stream[index]);
    }

    // delete rbo's
    glDeleteRenderbuffers(interop->count,interop->rb);

    // delete fbo's
    glDeleteFramebuffers(interop->count,interop->fb);

    // free buffers and resources
    free(interop->fb);
    free(interop->rb);
    free(interop->cgr);
    free(interop->ca);
    free(interop->stream);

    // free interop
    free(interop);
    }
    @@ -126,29 +136,30 @@ pxl_interop_size_set(struct pxl_interop* const interop, const int width, const i
    // resize rbo
    glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);

    // cudaGraphicsResource_t cgr;
    // probe fbo status
    // glCheckNamedFramebufferStatus(interop->fb[index],0);

    // register rbo
    cuda_err = cudaGraphicsGLRegisterImage(&interop->cgr[index],
    interop->rb[index],
    GL_RENDERBUFFER,
    cudaGraphicsRegisterFlagsSurfaceLoadStore |
    cudaGraphicsRegisterFlagsWriteDiscard);
    }

    // map graphics resource
    cuda_err = cudaGraphicsMapResources(1,&interop->cgr[index],interop->stream);
    // map graphics resources
    cuda_err = cudaGraphicsMapResources(interop->count,interop->cgr,0);

    // get a CUDA Array
    // get CUDA Array refernces
    for (int index=0; index<interop->count; index++)
    {
    cuda_err = cudaGraphicsSubResourceGetMappedArray(&interop->ca[index],
    interop->cgr[index],
    0,0);

    // unmap graphics resource
    cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr[index],interop->stream);

    // probe fbo status
    glCheckNamedFramebufferStatus(interop->fb[index],0);
    }

    // unmap graphics resources
    cuda_err = cudaGraphicsUnmapResources(interop->count,interop->cgr,0);

    return cuda_err;
    }
    @@ -164,16 +175,59 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
    //
    //

    cudaArray_const_t
    pxl_interop_array_get(struct pxl_interop* const interop)
    cudaStream_t
    pxl_interop_stream_get(struct pxl_interop* const interop)
    {
    return interop->ca[interop->index];
    return interop->stream[interop->index];
    }

    cudaStream_t
    pxl_interop_stream_get(struct pxl_interop* const interop)
    //
    //
    //

    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop)
    {
    return interop->stream;
    cudaError_t cuda_err;

    // map graphics resources
    cuda_err = cudaGraphicsMapResources(1,&interop->cgr[interop->index],
    interop->stream[interop->index]);

    return cuda_err;
    }

    cudaError_t
    pxl_interop_unmap(struct pxl_interop* const interop)
    {
    cudaError_t cuda_err;

    cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr[interop->index],
    interop->stream[interop->index]);

    return cuda_err;
    }

    cudaError_t
    pxl_interop_array_map(struct pxl_interop* const interop)
    {
    cudaError_t cuda_err;

    // get a CUDA Array
    cuda_err = cudaGraphicsSubResourceGetMappedArray(&interop->ca[interop->index],
    interop->cgr[interop->index],
    0,0);
    return cuda_err;
    }

    //
    //
    //

    cudaArray_const_t
    pxl_interop_array_get(struct pxl_interop* const interop)
    {
    return interop->ca[interop->index];
    }

    //
    17 changes: 17 additions & 0 deletions interop.h
    Original file line number Diff line number Diff line change
    @@ -38,6 +38,23 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
    cudaStream_t
    pxl_interop_stream_get(struct pxl_interop* const interop);

    //
    //
    //

    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop);

    cudaError_t
    pxl_interop_array_map(struct pxl_interop* const interop);

    cudaError_t
    pxl_interop_unmap(struct pxl_interop* const interop);

    //
    //
    //

    cudaArray_const_t
    pxl_interop_array_get(struct pxl_interop* const interop);

    42 changes: 29 additions & 13 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -11,6 +11,7 @@

    #include <stdlib.h>
    #include <stdio.h>
    #include <stdbool.h>

    //
    //
    @@ -153,7 +154,10 @@ pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height)
    //

    cudaError_t
    pxl_kernel_launcher(cudaArray_const_t array, const int width, const int height, cudaStream_t stream);
    pxl_kernel_launcher(cudaArray_const_t array,
    const int width,
    const int height,
    cudaStream_t stream);

    //
    //
    @@ -173,6 +177,7 @@ main(int argc, char* argv[])
    //
    // INIT CUDA
    //

    cudaError_t cuda_err;

    int gl_device_id,gl_device_count;
    @@ -183,9 +188,16 @@ main(int argc, char* argv[])

    cuda_err = cudaSetDevice(cuda_device_id);

    //
    // MULTI-GPU?
    //

    const bool multi_gpu = gl_device_id != cuda_device_id;

    //
    // INFO
    //

    struct cudaDeviceProp props;

    cuda_err = cudaGetDeviceProperties(&props,gl_device_id);
    @@ -194,14 +206,6 @@ main(int argc, char* argv[])
    cuda_err = cudaGetDeviceProperties(&props,cuda_device_id);
    printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);

    //
    // CREATE A CUDA STREAM
    //

    cudaStream_t stream;

    cuda_err = cudaStreamCreate(&stream);

    //
    // CREATE INTEROP
    //
    @@ -231,6 +235,8 @@ main(int argc, char* argv[])
    //
    // LOOP UNTIL DONE
    //

    int step = 0;

    while (!glfwWindowShouldClose(window))
    {
    @@ -244,27 +250,37 @@ main(int argc, char* argv[])
    // EXECUTE CUDA KERNEL ON RENDER BUFFER
    //

    int width,height;
    int width,height;
    cudaArray_t cuda_array;

    pxl_interop_size_get(interop,&width,&height);

    if (multi_gpu)
    {
    cuda_err = pxl_interop_map(interop);
    cuda_err = pxl_interop_array_map(interop);
    }

    cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
    width,height,
    pxl_interop_stream_get(interop));

    // cuda_err = cudaStreamSynchronize(stream);
    if (multi_gpu)
    {
    cuda_err = pxl_interop_unmap(interop);
    cuda_err = cudaStreamSynchronize(pxl_interop_stream_get(interop));
    }

    //
    // BLIT
    //

    pxl_interop_blit(interop);
    pxl_interop_swap(interop);

    //
    // SWAP
    //

    pxl_interop_swap(interop);

    glfwSwapBuffers(window);

  19. Allan MacKinnon revised this gist May 13, 2015. 3 changed files with 122 additions and 92 deletions.
    169 changes: 101 additions & 68 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -20,38 +20,59 @@

    struct pxl_interop
    {
    int count;
    int index;

    // w x h
    int width;
    int height;
    int width;
    int height;

    // GL buffers
    GLuint fb0;
    GLuint rb0;
    GLuint* fb;
    GLuint* rb;

    // CUDA resources
    cudaGraphicsResource_t cgr0;
    cudaGraphicsResource_t* cgr;
    cudaArray_t* ca;

    // CUDA stream
    cudaStream_t stream;
    };

    //
    //
    //

    struct pxl_interop*
    pxl_interop_create()
    pxl_interop_create(const int fbo_count)
    {
    struct pxl_interop* const interop = (struct pxl_interop*)malloc(sizeof(struct pxl_interop));
    struct pxl_interop* const interop = calloc(1,sizeof(*interop));

    interop->count = fbo_count;

    // init cuda graphics resource
    interop->cgr0 = NULL;
    // allocate arrays
    interop->fb = calloc(fbo_count,sizeof(*(interop->fb )));
    interop->rb = calloc(fbo_count,sizeof(*(interop->rb )));
    interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
    interop->ca = calloc(fbo_count,sizeof(*(interop->ca)));

    // render buffer object w/a color buffer
    glCreateRenderbuffers(1,&interop->rb0);
    glCreateRenderbuffers(fbo_count,interop->rb);

    // frame buffer object
    glCreateFramebuffers(1,&interop->fb0);
    glCreateFramebuffers(fbo_count,interop->fb);

    // attach rb0 to the fb0
    glNamedFramebufferRenderbuffer(interop->fb0,GL_COLOR_ATTACHMENT0,GL_RENDERBUFFER,interop->rb0);
    for (int index=0; index<fbo_count; index++)
    {
    glNamedFramebufferRenderbuffer(interop->fb[index],
    GL_COLOR_ATTACHMENT0,
    GL_RENDERBUFFER,
    interop->rb[index]);
    }

    // create an interop stream
    cudaError_t cuda_err = cudaStreamCreate(&interop->stream);

    // return it
    return interop;
    @@ -61,15 +82,22 @@ pxl_interop_create()
    void
    pxl_interop_destroy(struct pxl_interop* const interop)
    {
    // unregister CUDA resource
    if (interop->cgr0 != NULL)
    cudaGraphicsUnregisterResource(interop->cgr0);
    // resize color buffer
    for (int index=0; index<interop->count; index++)
    {
    // unregister CUDA resource
    if (interop->cgr[index] != NULL)
    cudaGraphicsUnregisterResource(interop->cgr[index]);
    }

    // render buffer object w/a color buffer
    glDeleteRenderbuffers(1,&interop->rb0);
    // destroy stream
    cudaStreamDestroy(interop->stream);

    // frame buffer object
    glDeleteFramebuffers(1,&interop->fb0);
    // delete rbo's
    glDeleteRenderbuffers(interop->count,interop->rb);

    // delete fbo's
    glDeleteFramebuffers(interop->count,interop->fb);

    // free interop
    free(interop);
    @@ -80,39 +108,53 @@ pxl_interop_destroy(struct pxl_interop* const interop)
    //

    cudaError_t
    pxl_interop_resize(struct pxl_interop* const interop, const int width, const int height)
    pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height)
    {
    cudaError_t cuda_err = cudaSuccess;

    // save new size
    interop->width = width;
    interop->height = height;

    // resize color buffer
    glNamedRenderbufferStorage(interop->rb0,GL_RGBA8,width,height);

    //
    // REGISTER RBO WITH CUDA
    //
    for (int index=0; index<interop->count; index++)
    {
    // unregister resource
    if (interop->cgr[index] != NULL)
    cuda_err = cudaGraphicsUnregisterResource(interop->cgr[index]);

    // resize rbo
    glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);

    // cudaGraphicsResource_t cgr;

    // register rbo
    cuda_err = cudaGraphicsGLRegisterImage(&interop->cgr[index],
    interop->rb[index],
    GL_RENDERBUFFER,
    cudaGraphicsRegisterFlagsSurfaceLoadStore |
    cudaGraphicsRegisterFlagsWriteDiscard);

    // map graphics resource
    cuda_err = cudaGraphicsMapResources(1,&interop->cgr[index],interop->stream);

    // get a CUDA Array
    cuda_err = cudaGraphicsSubResourceGetMappedArray(&interop->ca[index],
    interop->cgr[index],
    0,0);

    // unmap graphics resource
    cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr[index],interop->stream);

    // probe fbo status
    glCheckNamedFramebufferStatus(interop->fb[index],0);
    }

    cudaError_t cuda_err;

    // unregister
    if (interop->cgr0 != NULL)
    cuda_err = cudaGraphicsUnregisterResource(interop->cgr0);

    // register image
    cuda_err = cudaGraphicsGLRegisterImage(&interop->cgr0,
    interop->rb0,
    GL_RENDERBUFFER,
    cudaGraphicsRegisterFlagsSurfaceLoadStore | cudaGraphicsRegisterFlagsWriteDiscard);

    // diddle some flags
    cuda_err = cudaGraphicsResourceSetMapFlags(interop->cgr0,cudaGraphicsMapFlagsWriteDiscard);

    return cuda_err;
    }

    void
    pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* const height)
    pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height)
    {
    *width = interop->width;
    *height = interop->height;
    @@ -122,38 +164,28 @@ pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* c
    //
    //

    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
    cudaArray_const_t
    pxl_interop_array_get(struct pxl_interop* const interop)
    {
    cudaError_t cuda_err;

    // map graphics resources
    cuda_err = cudaGraphicsMapResources(1,&interop->cgr0,stream);

    return cuda_err;
    return interop->ca[interop->index];
    }

    cudaError_t
    pxl_interop_get(struct pxl_interop* const interop, cudaArray_t* cuda_array)
    cudaStream_t
    pxl_interop_stream_get(struct pxl_interop* const interop)
    {
    cudaError_t cuda_err;

    // get a CUDA Array
    cuda_err = cudaGraphicsSubResourceGetMappedArray(cuda_array,interop->cgr0,0,0);

    return cuda_err;
    return interop->stream;
    }

    cudaError_t
    pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
    {
    cudaError_t cuda_err;

    cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr0,stream);
    //
    //
    //

    return cuda_err;
    void
    pxl_interop_swap(struct pxl_interop* const interop)
    {
    interop->index = (interop->index + 1) % interop->count;
    }

    //
    //
    //
    @@ -169,8 +201,9 @@ pxl_interop_clear(struct pxl_interop* const interop)
    glClearNamedFramebufferuiv(interop->fb0,GL_COLOR,0,clear_color);
    */

    const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
    glInvalidateNamedFramebufferData(interop->fb0,1,attachments);
    static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };

    glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments);
    }

    //
    @@ -180,7 +213,7 @@ pxl_interop_clear(struct pxl_interop* const interop)
    void
    pxl_interop_blit(struct pxl_interop* const interop)
    {
    glBlitNamedFramebuffer(interop->fb0,0,
    glBlitNamedFramebuffer(interop->fb[interop->index],0,
    0,0, interop->width,interop->height,
    0,interop->height,interop->width,0,
    GL_COLOR_BUFFER_BIT,
    20 changes: 10 additions & 10 deletions interop.h
    Original file line number Diff line number Diff line change
    @@ -16,7 +16,7 @@
    //

    struct pxl_interop*
    pxl_interop_create();
    pxl_interop_create(const int fbo_count);

    void
    pxl_interop_destroy(struct pxl_interop* const interop);
    @@ -26,28 +26,28 @@ pxl_interop_destroy(struct pxl_interop* const interop);
    //

    cudaError_t
    pxl_interop_resize(struct pxl_interop* const interop, const int width, const int height);
    pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height);

    void
    pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* const height);
    pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height);

    //
    //
    //

    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream);

    cudaError_t
    pxl_interop_get(struct pxl_interop* const interop, cudaArray_t* cuda_array);
    cudaStream_t
    pxl_interop_stream_get(struct pxl_interop* const interop);

    cudaError_t
    pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);
    cudaArray_const_t
    pxl_interop_array_get(struct pxl_interop* const interop);

    //
    //
    //

    void
    pxl_interop_swap(struct pxl_interop* const interop);

    void
    pxl_interop_clear(struct pxl_interop* const interop);

    25 changes: 11 additions & 14 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -145,7 +145,7 @@ pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height)
    // get context
    struct pxl_interop* const interop = glfwGetWindowUserPointer(window);

    pxl_interop_resize(interop,width,height);
    pxl_interop_size_set(interop,width,height);
    }

    //
    @@ -206,7 +206,7 @@ main(int argc, char* argv[])
    // CREATE INTEROP
    //

    struct pxl_interop* const interop = pxl_interop_create();
    struct pxl_interop* const interop = pxl_interop_create(2);

    //
    // RESIZE INTEROP
    @@ -218,7 +218,7 @@ main(int argc, char* argv[])
    glfwGetFramebufferSize(window,&width,&height);

    // resize with initial window dimensions
    cuda_err = pxl_interop_resize(interop,width,height);
    cuda_err = pxl_interop_size_set(interop,width,height);

    //
    // SET USER POINTER AND CALLBACKS
    @@ -244,20 +244,15 @@ main(int argc, char* argv[])
    // EXECUTE CUDA KERNEL ON RENDER BUFFER
    //

    int width,height;
    cudaArray_t cuda_array;
    int width,height;

    pxl_interop_get_size(interop,&width,&height);
    pxl_interop_size_get(interop,&width,&height);

    cuda_err = pxl_interop_map(interop,stream);
    cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
    width,height,
    pxl_interop_stream_get(interop));

    cuda_err = pxl_interop_get(interop,&cuda_array);

    cuda_err = pxl_kernel_launcher(cuda_array,width,height,stream);

    cuda_err = pxl_interop_unmap(interop,stream);

    cuda_err = cudaStreamSynchronize(stream);
    // cuda_err = cudaStreamSynchronize(stream);

    //
    // BLIT
    @@ -269,6 +264,8 @@ main(int argc, char* argv[])
    // SWAP
    //

    pxl_interop_swap(interop);

    glfwSwapBuffers(window);

    //
  20. Allan MacKinnon revised this gist Apr 18, 2015. 1 changed file with 6 additions and 2 deletions.
    8 changes: 6 additions & 2 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -95,8 +95,6 @@ pxl_glfw_init(GLFWwindow** window, const int width, const int height)
    if (!glfwInit())
    exit(EXIT_FAILURE);

    const GLFWvidmode* mode = glfwGetVideoMode(glfwGetPrimaryMonitor());

    glfwWindowHint(GLFW_DEPTH_BITS, 0);
    glfwWindowHint(GLFW_STENCIL_BITS, 0);

    @@ -107,7 +105,13 @@ pxl_glfw_init(GLFWwindow** window, const int width, const int height)

    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);

    #ifdef PXL_FULLSCREEN
    GLFWmonitor* monitor = glfwGetPrimaryMonitor();
    const GLFWvidmode* mode = glfwGetVideoMode(monitor);
    *window = glfwCreateWindow(mode->width,mode->height,"GLFW / CUDA Interop",monitor,NULL);
    #else
    *window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL);
    #endif

    if (*window == NULL)
    {
  21. Allan MacKinnon revised this gist Apr 16, 2015. 1 changed file with 14 additions and 7 deletions.
    21 changes: 14 additions & 7 deletions kernel.cu
    Original file line number Diff line number Diff line change
    @@ -29,13 +29,14 @@ union pxl_rgbx_24
    extern "C"
    __global__
    void
    pxl_kernel(const int width)
    pxl_kernel(const int width, const int height)
    {
    // pixel coordinates
    const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
    const int x = idx % width;
    const int y = idx / width;

    #if 1
    // pixel color
    const int t = (unsigned int)clock() / 1250000; // 1.25 GHz
    const int xt = (idx + t) % width;
    @@ -49,12 +50,18 @@ pxl_kernel(const int width)
    rgbx.b = (bar == 0) || (bar == 3) ? ramp : 0;
    rgbx.na = 255;

    // cudaBoundaryModeZero squelches out-of-bound writes
    #else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT
    const bool border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1);
    union pxl_rgbx_24 rgbx = { border ? 0xFF0000FF : 0xFF000000 };
    #endif

    surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock()
    surf,
    x*sizeof(rgbx),
    y,
    cudaBoundaryModeZero);
    surf,
    x*sizeof(rgbx),
    y,
    cudaBoundaryModeZero); // squelches out-of-bound writes
    }

    //
    @@ -78,7 +85,7 @@ pxl_kernel_launcher(cudaArray_const_t array,
    const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;

    if (blocks > 0)
    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width);
    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height);

    return cudaSuccess;
    }
  22. Allan MacKinnon revised this gist Apr 15, 2015. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -6,6 +6,7 @@
    #include <glad/glad.h>
    #include <GLFW/glfw3.h>
    #include <cuda_gl_interop.h>
    #include <stdlib.h>

    //
    //
  23. Allan MacKinnon revised this gist Apr 15, 2015. 2 changed files with 4 additions and 4 deletions.
    6 changes: 4 additions & 2 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -3,13 +3,15 @@
    //
    //

    #include "interop.h"
    #include <glad/glad.h>
    #include <GLFW/glfw3.h>
    #include <cuda_gl_interop.h>

    //
    //
    //

    #include <cuda_gl_interop.h>
    #include "interop.h"

    //
    //
    2 changes: 0 additions & 2 deletions interop.h
    Original file line number Diff line number Diff line change
    @@ -9,8 +9,6 @@
    //
    //

    #include <glad/glad.h>
    #include <GLFW/glfw3.h>
    #include <cuda_runtime.h>

    //
  24. Allan MacKinnon revised this gist Apr 15, 2015. 3 changed files with 2 additions and 9 deletions.
    4 changes: 2 additions & 2 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -3,13 +3,13 @@
    //
    //

    #include <stdlib.h>
    #include "interop.h"

    //
    //
    //

    #include "interop.h"
    #include <cuda_gl_interop.h>

    //
    //
    6 changes: 0 additions & 6 deletions interop.h
    Original file line number Diff line number Diff line change
    @@ -11,13 +11,7 @@

    #include <glad/glad.h>
    #include <GLFW/glfw3.h>

    //
    //
    //

    #include <cuda_runtime.h>
    #include <cuda_gl_interop.h>

    //
    //
    1 change: 0 additions & 1 deletion main.c
    Original file line number Diff line number Diff line change
    @@ -16,7 +16,6 @@
    //
    //

    #include <cuda_runtime.h>
    #include <cuda_gl_interop.h>

    //
  25. Allan MacKinnon revised this gist Apr 15, 2015. 2 changed files with 32 additions and 43 deletions.
    46 changes: 18 additions & 28 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -34,18 +34,21 @@ struct pxl_interop
    //

    struct pxl_interop*
    pxl_interop_create(GLFWwindow* window)
    pxl_interop_create()
    {
    struct pxl_interop* const interop = (struct pxl_interop*)malloc(sizeof(struct pxl_interop));

    // init cuda graphics resource
    interop->cgr0 = NULL;

    // render buffer object w/a color buffer
    glGenRenderbuffers(1,&interop->rb0);
    glCreateRenderbuffers(1,&interop->rb0);

    // frame buffer object
    glGenFramebuffers(1,&interop->fb0);
    glCreateFramebuffers(1,&interop->fb0);

    // attach rb0 to the fb0
    glNamedFramebufferRenderbuffer(interop->fb0,GL_COLOR_ATTACHMENT0,GL_RENDERBUFFER,interop->rb0);

    // return it
    return interop;
    @@ -55,6 +58,17 @@ pxl_interop_create(GLFWwindow* window)
    void
    pxl_interop_destroy(struct pxl_interop* const interop)
    {
    // unregister CUDA resource
    if (interop->cgr0 != NULL)
    cudaGraphicsUnregisterResource(interop->cgr0);

    // render buffer object w/a color buffer
    glDeleteRenderbuffers(1,&interop->rb0);

    // frame buffer object
    glDeleteFramebuffers(1,&interop->fb0);

    // free interop
    free(interop);
    }

    @@ -69,26 +83,9 @@ pxl_interop_resize(struct pxl_interop* const interop, const int width, const int
    interop->width = width;
    interop->height = height;

    //
    // RESIZE FBO'S COLOR BUFFER
    //

    // bind rbo
    glBindRenderbuffer (GL_RENDERBUFFER,interop->rb0);
    // resize color buffer
    glRenderbufferStorage(GL_RENDERBUFFER,GL_RGBA8,width,height);

    // bind fbo to read
    glBindFramebuffer (GL_FRAMEBUFFER,interop->fb0);
    // attach rb0 to the fb0
    glFramebufferRenderbuffer(GL_FRAMEBUFFER,GL_COLOR_ATTACHMENT0,GL_RENDERBUFFER,interop->rb0);

    // unbind rbo
    glBindRenderbuffer(GL_RENDERBUFFER,0);
    glNamedRenderbufferStorage(interop->rb0,GL_RGBA8,width,height);

    // bind to default fb
    glBindFramebuffer(GL_FRAMEBUFFER,0);

    //
    // REGISTER RBO WITH CUDA
    //
    @@ -180,13 +177,6 @@ pxl_interop_clear(struct pxl_interop* const interop)
    void
    pxl_interop_blit(struct pxl_interop* const interop)
    {
    /*
    glBlitFramebuffer(0,0, interop->width,interop->height,
    0,interop->height,interop->width,0,
    GL_COLOR_BUFFER_BIT,
    GL_NEAREST);
    */

    glBlitNamedFramebuffer(interop->fb0,0,
    0,0, interop->width,interop->height,
    0,interop->height,interop->width,0,
    29 changes: 14 additions & 15 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -124,11 +124,11 @@ pxl_glfw_init(GLFWwindow** window, const int width, const int height)
    // ignore vsync for now
    glfwSwapInterval(0);

    // only copy r/g/b
    glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);

    // enable SRGB
    // glEnable(GL_FRAMEBUFFER_SRGB);

    // only copy r/g/b
    // glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
    }

    //
    @@ -200,22 +200,13 @@ main(int argc, char* argv[])
    cuda_err = cudaStreamCreate(&stream);

    //
    // CREATE AND SAVE INTEROP INSTANCE
    // CREATE INTEROP
    //

    struct pxl_interop* const interop = pxl_interop_create(window);

    glfwSetWindowUserPointer(window,interop);
    struct pxl_interop* const interop = pxl_interop_create();

    //
    // SET CALLBACKS
    //

    glfwSetKeyCallback (window,pxl_glfw_key_callback);
    glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);

    //
    // GET ACTUAL WINDOW SIZE
    // RESIZE INTEROP
    //

    int width, height;
    @@ -226,6 +217,14 @@ main(int argc, char* argv[])
    // resize with initial window dimensions
    cuda_err = pxl_interop_resize(interop,width,height);

    //
    // SET USER POINTER AND CALLBACKS
    //

    glfwSetWindowUserPointer (window,interop);
    glfwSetKeyCallback (window,pxl_glfw_key_callback);
    glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);

    //
    // LOOP UNTIL DONE
    //
  26. Allan MacKinnon revised this gist Apr 14, 2015. 1 changed file with 3 additions and 4 deletions.
    7 changes: 3 additions & 4 deletions interop.h
    Original file line number Diff line number Diff line change
    @@ -1,8 +1,7 @@

    /*
    * Copyright 2015 Allan MacKinnon. All rights reserved.
    *
    */
    //
    //
    //

    #pragma once

  27. Allan MacKinnon revised this gist Apr 14, 2015. 3 changed files with 18 additions and 5 deletions.
    12 changes: 10 additions & 2 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -123,13 +123,21 @@ pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* c
    //

    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop, cudaArray_t* cuda_array, cudaStream_t stream)
    pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
    {
    cudaError_t cuda_err;

    // map graphics resources
    cuda_err = cudaGraphicsMapResources(1,&interop->cgr0,stream);

    return cuda_err;
    }

    cudaError_t
    pxl_interop_get(struct pxl_interop* const interop, cudaArray_t* cuda_array)
    {
    cudaError_t cuda_err;

    // get a CUDA Array
    cuda_err = cudaGraphicsSubResourceGetMappedArray(cuda_array,interop->cgr0,0,0);

    @@ -188,4 +196,4 @@ pxl_interop_blit(struct pxl_interop* const interop)

    //
    //
    //
    //
    7 changes: 5 additions & 2 deletions interop.h
    Original file line number Diff line number Diff line change
    @@ -45,7 +45,10 @@ pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* c
    //

    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop, cudaArray_t* cuda_array, cudaStream_t stream);
    pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream);

    cudaError_t
    pxl_interop_get(struct pxl_interop* const interop, cudaArray_t* cuda_array);

    cudaError_t
    pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);
    @@ -62,4 +65,4 @@ pxl_interop_blit(struct pxl_interop* const interop);

    //
    //
    //
    //
    4 changes: 3 additions & 1 deletion main.c
    Original file line number Diff line number Diff line change
    @@ -247,7 +247,9 @@ main(int argc, char* argv[])

    pxl_interop_get_size(interop,&width,&height);

    cuda_err = pxl_interop_map(interop,&cuda_array,stream);
    cuda_err = pxl_interop_map(interop,stream);

    cuda_err = pxl_interop_get(interop,&cuda_array);

    cuda_err = pxl_kernel_launcher(cuda_array,width,height,stream);

  28. Allan MacKinnon revised this gist Apr 14, 2015. 2 changed files with 10 additions and 9 deletions.
    5 changes: 3 additions & 2 deletions kernel.cu
    Original file line number Diff line number Diff line change
    @@ -61,7 +61,7 @@ pxl_kernel(const int width)
    //
    //

    #define PXL_KERNEL_THREADS_PER_BLOCK 64
    #define PXL_KERNEL_THREADS_PER_BLOCK 256 // enough for 4Kx2 monitor

    extern "C"
    cudaError_t
    @@ -77,7 +77,8 @@ pxl_kernel_launcher(cudaArray_const_t array,

    const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;

    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width);
    if (blocks > 0)
    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width);

    return cudaSuccess;
    }
    14 changes: 7 additions & 7 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -167,13 +167,6 @@ main(int argc, char* argv[])

    pxl_glfw_init(&window,1024,1024);

    //
    // SET CALLBACKS
    //

    glfwSetKeyCallback (window,pxl_glfw_key_callback);
    glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);

    //
    // INIT CUDA
    //
    @@ -214,6 +207,13 @@ main(int argc, char* argv[])

    glfwSetWindowUserPointer(window,interop);

    //
    // SET CALLBACKS
    //

    glfwSetKeyCallback (window,pxl_glfw_key_callback);
    glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);

    //
    // GET ACTUAL WINDOW SIZE
    //
  29. Allan MacKinnon revised this gist Apr 14, 2015. 2 changed files with 9 additions and 6 deletions.
    9 changes: 6 additions & 3 deletions kernel.cu
    Original file line number Diff line number Diff line change
    @@ -26,6 +26,7 @@ union pxl_rgbx_24
    //
    //

    extern "C"
    __global__
    void
    pxl_kernel(const int width)
    @@ -36,8 +37,10 @@ pxl_kernel(const int width)
    const int y = idx / width;

    // pixel color
    const unsigned int ramp = (unsigned int)(((float)x / (float)(width-1)) * 255.0f + 0.5f);
    const unsigned int bar = (y / 32) & 3;
    const int t = (unsigned int)clock() / 1250000; // 1.25 GHz
    const int xt = (idx + t) % width;
    const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f);
    const unsigned int bar = ((y + t) / 32) & 3;

    union pxl_rgbx_24 rgbx;

    @@ -81,4 +84,4 @@ pxl_kernel_launcher(cudaArray_const_t array,

    //
    //
    //
    //
    6 changes: 3 additions & 3 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -125,10 +125,10 @@ pxl_glfw_init(GLFWwindow** window, const int width, const int height)
    glfwSwapInterval(0);

    // enable SRGB
    glEnable(GL_FRAMEBUFFER_SRGB);
    // glEnable(GL_FRAMEBUFFER_SRGB);

    // only copy r/g/b
    glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
    // glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
    }

    //
    @@ -293,4 +293,4 @@ main(int argc, char* argv[])

    //
    //
    //
    //
  30. Allan MacKinnon created this gist Apr 14, 2015.
    191 changes: 191 additions & 0 deletions interop.c
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,191 @@

    //
    //
    //

    #include <stdlib.h>

    //
    //
    //

    #include "interop.h"

    //
    //
    //

    struct pxl_interop
    {
    // w x h
    int width;
    int height;

    // GL buffers
    GLuint fb0;
    GLuint rb0;

    // CUDA resources
    cudaGraphicsResource_t cgr0;
    };

    //
    //
    //

    struct pxl_interop*
    pxl_interop_create(GLFWwindow* window)
    {
    struct pxl_interop* const interop = (struct pxl_interop*)malloc(sizeof(struct pxl_interop));

    // init cuda graphics resource
    interop->cgr0 = NULL;

    // render buffer object w/a color buffer
    glGenRenderbuffers(1,&interop->rb0);

    // frame buffer object
    glGenFramebuffers(1,&interop->fb0);

    // return it
    return interop;
    }


    void
    pxl_interop_destroy(struct pxl_interop* const interop)
    {
    free(interop);
    }

    //
    //
    //

    cudaError_t
    pxl_interop_resize(struct pxl_interop* const interop, const int width, const int height)
    {
    // save new size
    interop->width = width;
    interop->height = height;

    //
    // RESIZE FBO'S COLOR BUFFER
    //

    // bind rbo
    glBindRenderbuffer (GL_RENDERBUFFER,interop->rb0);
    // resize color buffer
    glRenderbufferStorage(GL_RENDERBUFFER,GL_RGBA8,width,height);

    // bind fbo to read
    glBindFramebuffer (GL_FRAMEBUFFER,interop->fb0);
    // attach rb0 to the fb0
    glFramebufferRenderbuffer(GL_FRAMEBUFFER,GL_COLOR_ATTACHMENT0,GL_RENDERBUFFER,interop->rb0);

    // unbind rbo
    glBindRenderbuffer(GL_RENDERBUFFER,0);

    // bind to default fb
    glBindFramebuffer(GL_FRAMEBUFFER,0);

    //
    // REGISTER RBO WITH CUDA
    //

    cudaError_t cuda_err;

    // unregister
    if (interop->cgr0 != NULL)
    cuda_err = cudaGraphicsUnregisterResource(interop->cgr0);

    // register image
    cuda_err = cudaGraphicsGLRegisterImage(&interop->cgr0,
    interop->rb0,
    GL_RENDERBUFFER,
    cudaGraphicsRegisterFlagsSurfaceLoadStore | cudaGraphicsRegisterFlagsWriteDiscard);

    // diddle some flags
    cuda_err = cudaGraphicsResourceSetMapFlags(interop->cgr0,cudaGraphicsMapFlagsWriteDiscard);

    return cuda_err;
    }

    void
    pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* const height)
    {
    *width = interop->width;
    *height = interop->height;
    }

    //
    //
    //

    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop, cudaArray_t* cuda_array, cudaStream_t stream)
    {
    cudaError_t cuda_err;

    // map graphics resources
    cuda_err = cudaGraphicsMapResources(1,&interop->cgr0,stream);

    // get a CUDA Array
    cuda_err = cudaGraphicsSubResourceGetMappedArray(cuda_array,interop->cgr0,0,0);

    return cuda_err;
    }

    cudaError_t
    pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
    {
    cudaError_t cuda_err;

    cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr0,stream);

    return cuda_err;
    }

    //
    //
    //

    void
    pxl_interop_clear(struct pxl_interop* const interop)
    {
    /*
    const GLenum draw_buffer[] = { GL_COLOR_ATTACHMENT0 };
    const GLuint clear_color[] = { 255, 0, 0, 255 };
    glNamedFramebufferDrawBuffers(interop->fb0,1,draw_buffer);
    glClearNamedFramebufferuiv(interop->fb0,GL_COLOR,0,clear_color);
    */

    const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
    glInvalidateNamedFramebufferData(interop->fb0,1,attachments);
    }

    //
    //
    //

    void
    pxl_interop_blit(struct pxl_interop* const interop)
    {
    /*
    glBlitFramebuffer(0,0, interop->width,interop->height,
    0,interop->height,interop->width,0,
    GL_COLOR_BUFFER_BIT,
    GL_NEAREST);
    */

    glBlitNamedFramebuffer(interop->fb0,0,
    0,0, interop->width,interop->height,
    0,interop->height,interop->width,0,
    GL_COLOR_BUFFER_BIT,
    GL_NEAREST);
    }

    //
    //
    //
    65 changes: 65 additions & 0 deletions interop.h
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,65 @@

    /*
    * Copyright 2015 Allan MacKinnon. All rights reserved.
    *
    */

    #pragma once

    //
    //
    //

    #include <glad/glad.h>
    #include <GLFW/glfw3.h>

    //
    //
    //

    #include <cuda_runtime.h>
    #include <cuda_gl_interop.h>

    //
    //
    //

    struct pxl_interop*
    pxl_interop_create();

    void
    pxl_interop_destroy(struct pxl_interop* const interop);

    //
    //
    //

    cudaError_t
    pxl_interop_resize(struct pxl_interop* const interop, const int width, const int height);

    void
    pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* const height);

    //
    //
    //

    cudaError_t
    pxl_interop_map(struct pxl_interop* const interop, cudaArray_t* cuda_array, cudaStream_t stream);

    cudaError_t
    pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);

    //
    //
    //

    void
    pxl_interop_clear(struct pxl_interop* const interop);

    void
    pxl_interop_blit(struct pxl_interop* const interop);

    //
    //
    //
    84 changes: 84 additions & 0 deletions kernel.cu
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,84 @@
    // -*- compile-command: "nvcc -m 32 -arch sm_30 -Xptxas=-v -cubin kernel.cu"; -*-

    //
    //
    //

    surface<void,cudaSurfaceType2D> surf;

    //
    //
    //

    union pxl_rgbx_24
    {
    uint1 b32;

    struct {
    unsigned r : 8;
    unsigned g : 8;
    unsigned b : 8;
    unsigned na : 8;
    };
    };

    //
    //
    //

    __global__
    void
    pxl_kernel(const int width)
    {
    // pixel coordinates
    const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
    const int x = idx % width;
    const int y = idx / width;

    // pixel color
    const unsigned int ramp = (unsigned int)(((float)x / (float)(width-1)) * 255.0f + 0.5f);
    const unsigned int bar = (y / 32) & 3;

    union pxl_rgbx_24 rgbx;

    rgbx.r = (bar == 0) || (bar == 1) ? ramp : 0;
    rgbx.g = (bar == 0) || (bar == 2) ? ramp : 0;
    rgbx.b = (bar == 0) || (bar == 3) ? ramp : 0;
    rgbx.na = 255;

    // cudaBoundaryModeZero squelches out-of-bound writes
    surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock()
    surf,
    x*sizeof(rgbx),
    y,
    cudaBoundaryModeZero);
    }

    //
    //
    //

    #define PXL_KERNEL_THREADS_PER_BLOCK 64

    extern "C"
    cudaError_t
    pxl_kernel_launcher(cudaArray_const_t array,
    const int width,
    const int height,
    cudaStream_t stream)
    {
    cudaError_t cuda_err = cudaBindSurfaceToArray(surf,array);

    if (cuda_err)
    return cuda_err;

    const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;

    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width);

    return cudaSuccess;
    }

    //
    //
    //
    296 changes: 296 additions & 0 deletions main.c
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,296 @@
    //
    //
    //

    #include <glad/glad.h>
    #include <GLFW/glfw3.h>

    //
    //
    //

    #include <stdlib.h>
    #include <stdio.h>

    //
    //
    //

    #include <cuda_runtime.h>
    #include <cuda_gl_interop.h>

    //
    //
    //

    #include "interop.h"

    //
    // FPS COUNTER FROM HERE:
    //
    // http://antongerdelan.net/opengl/glcontext2.html
    //

    static
    void
    pxl_glfw_fps(GLFWwindow* window)
    {
    // static fps counters
    static double stamp_prev = 0.0;
    static int frame_count = 0;

    // locals
    const double stamp_curr = glfwGetTime();
    const double elapsed = stamp_curr - stamp_prev;

    if (elapsed > 0.5)
    {
    stamp_prev = stamp_curr;

    const double fps = (double)frame_count / elapsed;

    int width, height;
    char tmp[64];

    glfwGetFramebufferSize(window,&width,&height);

    sprintf_s(tmp,64,"(%u x %u) - FPS: %.2f",width,height,fps);

    glfwSetWindowTitle(window,tmp);

    frame_count = 0;
    }

    frame_count++;
    }

    //
    //
    //

    static
    void
    pxl_glfw_error_callback(int error, const char* description)
    {
    fputs(description,stderr);
    }

    static
    void
    pxl_glfw_key_callback(GLFWwindow* window, int key, int scancode, int action, int mods)
    {
    if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS)
    glfwSetWindowShouldClose(window, GL_TRUE);
    }

    static
    void
    pxl_glfw_init(GLFWwindow** window, const int width, const int height)
    {
    //
    // INITIALIZE GLFW/GLAD
    //

    glfwSetErrorCallback(pxl_glfw_error_callback);

    if (!glfwInit())
    exit(EXIT_FAILURE);

    const GLFWvidmode* mode = glfwGetVideoMode(glfwGetPrimaryMonitor());

    glfwWindowHint(GLFW_DEPTH_BITS, 0);
    glfwWindowHint(GLFW_STENCIL_BITS, 0);

    glfwWindowHint(GLFW_SRGB_CAPABLE, GL_TRUE);

    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5);

    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);

    *window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL);

    if (*window == NULL)
    {
    glfwTerminate();
    exit(EXIT_FAILURE);
    }

    glfwMakeContextCurrent(*window);

    // set up GLAD
    gladLoadGLLoader((GLADloadproc)glfwGetProcAddress);

    // ignore vsync for now
    glfwSwapInterval(0);

    // enable SRGB
    glEnable(GL_FRAMEBUFFER_SRGB);

    // only copy r/g/b
    glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
    }

    //
    //
    //

    static
    void
    pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height)
    {
    // get context
    struct pxl_interop* const interop = glfwGetWindowUserPointer(window);

    pxl_interop_resize(interop,width,height);
    }

    //
    //
    //

    cudaError_t
    pxl_kernel_launcher(cudaArray_const_t array, const int width, const int height, cudaStream_t stream);

    //
    //
    //

    int
    main(int argc, char* argv[])
    {
    //
    // INIT GLFW
    //

    GLFWwindow* window;

    pxl_glfw_init(&window,1024,1024);

    //
    // SET CALLBACKS
    //

    glfwSetKeyCallback (window,pxl_glfw_key_callback);
    glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);

    //
    // INIT CUDA
    //
    cudaError_t cuda_err;

    int gl_device_id,gl_device_count;

    cuda_err = cudaGLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll);

    int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;

    cuda_err = cudaSetDevice(cuda_device_id);

    //
    // INFO
    //
    struct cudaDeviceProp props;

    cuda_err = cudaGetDeviceProperties(&props,gl_device_id);
    printf("GL : %-24s (%2d)\n",props.name,props.multiProcessorCount);

    cuda_err = cudaGetDeviceProperties(&props,cuda_device_id);
    printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);

    //
    // CREATE A CUDA STREAM
    //

    cudaStream_t stream;

    cuda_err = cudaStreamCreate(&stream);

    //
    // CREATE AND SAVE INTEROP INSTANCE
    //

    struct pxl_interop* const interop = pxl_interop_create(window);

    glfwSetWindowUserPointer(window,interop);

    //
    // GET ACTUAL WINDOW SIZE
    //

    int width, height;

    // get initial width/height
    glfwGetFramebufferSize(window,&width,&height);

    // resize with initial window dimensions
    cuda_err = pxl_interop_resize(interop,width,height);

    //
    // LOOP UNTIL DONE
    //

    while (!glfwWindowShouldClose(window))
    {
    //
    // MONITOR FPS
    //

    pxl_glfw_fps(window);

    //
    // EXECUTE CUDA KERNEL ON RENDER BUFFER
    //

    int width,height;
    cudaArray_t cuda_array;

    pxl_interop_get_size(interop,&width,&height);

    cuda_err = pxl_interop_map(interop,&cuda_array,stream);

    cuda_err = pxl_kernel_launcher(cuda_array,width,height,stream);

    cuda_err = pxl_interop_unmap(interop,stream);

    cuda_err = cudaStreamSynchronize(stream);

    //
    // BLIT
    //

    pxl_interop_blit(interop);

    //
    // SWAP
    //

    glfwSwapBuffers(window);

    //
    // PUMP/POLL/WAIT
    //

    glfwPollEvents(); // glfwWaitEvents();
    }

    //
    // CLEANUP
    //

    pxl_interop_destroy(interop);

    glfwDestroyWindow(window);

    glfwTerminate();

    cudaDeviceReset();

    // missing some clean up here

    exit(EXIT_SUCCESS);
    }

    //
    //
    //