eruffaldi · May 27, 2016 04:02 · Feb 27, 2016 · Nov 27, 2015 · Nov 27, 2015 · Nov 27, 2015
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -0,0 +1,10 @@
+#sudo pip install --upgrade git+https://github.com/dav1dde/glad.git#egg=glad
+#python main.py --api gl=3.3 --generator=c  --out-path=GL
+find_package(CUDA)
+
+find_package(GLFW)
+include_directories(${GLFW_INCLUDE_DIRS})
+include_directories(GL/include)
+
+cuda_add_executable(main main.c assert_cuda.c interop.c GL/src/glad.c kernel.cu)
+target_link_libraries(main ${GLFW_LIBRARY})
diff --git a/interop.c b/interop.c
@@ -7,7 +7,7 @@
 #include <GLFW/glfw3.h>
 #include <cuda_gl_interop.h>
 #include <stdlib.h>
-
+#include <stdio.h>
 //
 //
 //
@@ -60,20 +60,42 @@ pxl_interop_create(const bool multi_gpu, const int fbo_count)
   interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
   interop->ca  = calloc(fbo_count,sizeof(*(interop->ca)));
 
+  fprintf(stderr,"creating buffers\n");
+#ifdef Opengl4_5
   // render buffer object w/a color buffer
   glCreateRenderbuffers(fbo_count,interop->rb);
 
   // frame buffer object
   glCreateFramebuffers(fbo_count,interop->fb);
+#else
+  glGenRenderbuffers(fbo_count,interop->rb);
+  glGenFramebuffers(fbo_count,interop->fb);
+#endif
+
+  fprintf(stderr,"created buffers\n");
 
   // attach rbo to fbo
   for (int index=0; index<fbo_count; index++)  
     {
+#ifdef Opengl4_5
       glNamedFramebufferRenderbuffer(interop->fb[index],
                                      GL_COLOR_ATTACHMENT0,
                                      GL_RENDERBUFFER,
                                      interop->rb[index]);
+
+#else
+glBindFramebuffer(GL_FRAMEBUFFER,interop->fb[index]);
+      glFramebufferRenderbuffer(GL_FRAMEBUFFER,
+                                     GL_COLOR_ATTACHMENT0,
+                                     GL_RENDERBUFFER,
+                                     interop->rb[index]);
+glBindFramebuffer(GL_FRAMEBUFFER,0);
+#endif
     }
+  fprintf(stderr,"bound buffers\n");
+
+
+
 
   // return it
   return interop;
@@ -128,8 +150,14 @@ pxl_interop_size_set(struct pxl_interop* const interop, const int width, const i
       if (interop->cgr[index] != NULL)
         cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));
 
+#ifdef Opengl4_5
       // resize rbo
       glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);
+#else
+      glBindRenderbuffer(GL_RENDERBUFFER,interop->rb[index]);
+      glRenderbufferStorage(GL_RENDERBUFFER,GL_RGBA8,width,height);
+      glBindRenderbuffer(GL_RENDERBUFFER,0);
+#endif
 
       // probe fbo status
       // glCheckNamedFramebufferStatus(interop->fb[index],0);
@@ -244,7 +272,13 @@ pxl_interop_clear(struct pxl_interop* const interop)
   */
 
   const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f };
+#ifdef Opengl4_5
   glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color);
+#else
+    glBindFramebuffer(GL_FRAMEBUFFER,interop->fb[interop->index]);
+  glClearBufferfv(GL_COLOR,0,clear_color);  
+  glBindFramebuffer(GL_FRAMEBUFFER,0);
+#endif
 }
 
 //
@@ -254,11 +288,22 @@ pxl_interop_clear(struct pxl_interop* const interop)
 void
 pxl_interop_blit(struct pxl_interop* const interop)
 {
+#ifdef Opengl4_5
   glBlitNamedFramebuffer(interop->fb[interop->index],0,
                          0,0,              interop->width,interop->height,
                          0,interop->height,interop->width,0,
                          GL_COLOR_BUFFER_BIT,
                          GL_NEAREST);
+#else
+    glBindFramebuffer(GL_READ_FRAMEBUFFER,interop->fb[interop->index]);
+    glBlitFramebuffer(
+                         0,0,              interop->width,interop->height,
+                         0,interop->height,interop->width,0,
+                         GL_COLOR_BUFFER_BIT,
+                         GL_NEAREST);
+  glBindFramebuffer(GL_READ_FRAMEBUFFER,0);
+
+#endif
 }
 
 //

diff --git a/main.c b/main.c
@@ -55,7 +55,7 @@ pxl_glfw_fps(GLFWwindow* window)
 
       glfwGetFramebufferSize(window,&width,&height);
 
-      sprintf_s(tmp,64,"(%u x %u) - FPS: %.2f",width,height,fps);
+      sprintf(tmp,"(%u x %u) - FPS: %.2f",width,height,fps);
 
       glfwSetWindowTitle(window,tmp);
 
@@ -100,12 +100,17 @@ pxl_glfw_init(GLFWwindow** window, const int width, const int height)
   glfwWindowHint(GLFW_DEPTH_BITS,            0);
   glfwWindowHint(GLFW_STENCIL_BITS,          0);
 
+#ifdef Opengl_45
   glfwWindowHint(GLFW_SRGB_CAPABLE,          GL_TRUE);
-
-  glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
-  glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5);
-
-  glfwWindowHint(GLFW_OPENGL_PROFILE,        GLFW_OPENGL_CORE_PROFILE);
+  glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
+  glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
+  glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
+#else
+  glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 3);
+  glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 3);
+    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
+    glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GL_TRUE);
+#endif
 
 #ifdef PXL_FULLSCREEN
   GLFWmonitor*       monitor = glfwGetPrimaryMonitor();
@@ -175,16 +180,21 @@ main(int argc, char* argv[])
 
   pxl_glfw_init(&window,1024,1024);
 
+
   //
   // INIT CUDA
   //
   cudaError_t cuda_err;
 
-  int gl_device_id,gl_device_count;
+  int gl_device_id=0,gl_device_count=0,cuda_device_id=0;
+#ifdef Opengl_45
   cuda_err = cuda(GLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll));
-
-  int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
+  cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
   cuda_err = cuda(SetDevice(cuda_device_id));
+#else
+  cudaGLSetGLDevice(0);
+#endif
+
 
   //
   // MULTI-GPU?
@@ -208,14 +218,17 @@ main(int argc, char* argv[])
   cudaStream_t stream;
   cudaEvent_t  event;
 
+  fprintf(stderr,"before create\n");
   cuda_err = cuda(StreamCreateWithFlags(&stream,cudaStreamDefault));   // optionally ignore default stream behavior
   cuda_err = cuda(EventCreateWithFlags(&event,cudaEventBlockingSync)); // | cudaEventDisableTiming);
 
   //
   // CREATE INTEROP
   //
   // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
+  fprintf(stderr,"pxl_interop_create\n");
   struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2);
+  fprintf(stderr,"after pxl_interop_create\n");
 
   //
   // RESIZE INTEROP
@@ -236,6 +249,7 @@ main(int argc, char* argv[])
   glfwSetKeyCallback            (window,pxl_glfw_key_callback);
   glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);
 
+  fprintf(stderr,"loop\n");
   //
   // LOOP UNTIL DONE
   //

diff --git a/assert_cuda.c b/assert_cuda.c
@@ -1,8 +1,3 @@
-/*
- * Copyright 2015 Allan MacKinnon.  All rights reserved.
- *
- */
-
 //
 //
 //

diff --git a/assert_cuda.h b/assert_cuda.h
@@ -1,8 +1,6 @@
-
-/*
- * Copyright 2015 Allan MacKinnon.  All rights reserved.
- *
- */
+//
+//
+//
 
 #pragma once
 

diff --git a/assert_cuda.c b/assert_cuda.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2015 Allan MacKinnon.  All rights reserved.
+ *
+ */
+
+//
+//
+//
+
+#include <stdlib.h>
+#include <stdio.h>
+
+//
+//
+//
+
+#include "assert_cuda.h"
+
+//
+//
+//
+
+cudaError_t
+cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort)
+{
+  if (code != cudaSuccess)
+    {
+      fprintf(stderr,"cuda_assert: %s %s %d\n",cudaGetErrorString(code),file,line);
+
+      if (abort)
+        {
+          cudaDeviceReset();          
+          exit(code);
+        }
+    }
+
+  return code;
+}
+
+//
+//
+//
diff --git a/assert_cuda.h b/assert_cuda.h
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright 2015 Allan MacKinnon.  All rights reserved.
+ *
+ */
+
+#pragma once
+
+//
+//
+//
+
+#include <cuda_runtime.h>
+#include <stdbool.h>
+
+//
+// Beware that NVCC doesn't work with C files and __VA_ARGS__
+//
+
+cudaError_t cuda_assert(const cudaError_t code, const char* const file, const int line, const bool abort);
+
+#define cuda(...)  cuda_assert((cuda##__VA_ARGS__), __FILE__, __LINE__, true);
+
+//
+//
+//
+
diff --git a/interop.c b/interop.c
@@ -12,6 +12,7 @@
 //
 //
 
+#include "assert_cuda.h"
 #include "interop.h"
 
 //
@@ -88,7 +89,7 @@ pxl_interop_destroy(struct pxl_interop* const interop)
   for (int index=0; index<interop->count; index++)
     {
       if (interop->cgr[index] != NULL)
-        cuda_err = cudaGraphicsUnregisterResource(interop->cgr[index]);
+        cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));
     }
 
   // delete rbo's
@@ -125,7 +126,7 @@ pxl_interop_size_set(struct pxl_interop* const interop, const int width, const i
     {
       // unregister resource
       if (interop->cgr[index] != NULL)
-        cuda_err = cudaGraphicsUnregisterResource(interop->cgr[index]);
+        cuda_err = cuda(GraphicsUnregisterResource(interop->cgr[index]));
 
       // resize rbo
       glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);
@@ -134,26 +135,26 @@ pxl_interop_size_set(struct pxl_interop* const interop, const int width, const i
       // glCheckNamedFramebufferStatus(interop->fb[index],0);
 
       // register rbo
-      cuda_err = cudaGraphicsGLRegisterImage(&interop->cgr[index],
-                                             interop->rb[index],
-                                             GL_RENDERBUFFER,
-                                             cudaGraphicsRegisterFlagsSurfaceLoadStore | 
-                                             cudaGraphicsRegisterFlagsWriteDiscard);
+      cuda_err = cuda(GraphicsGLRegisterImage(&interop->cgr[index],
+					      interop->rb[index],
+					      GL_RENDERBUFFER,
+					      cudaGraphicsRegisterFlagsSurfaceLoadStore | 
+					      cudaGraphicsRegisterFlagsWriteDiscard));
     }
 
   // map graphics resources
-  cuda_err = cudaGraphicsMapResources(interop->count,interop->cgr,0);
+  cuda_err = cuda(GraphicsMapResources(interop->count,interop->cgr,0));
 
   // get CUDA Array refernces
   for (int index=0; index<interop->count; index++)
     {
-      cuda_err = cudaGraphicsSubResourceGetMappedArray(&interop->ca[index],
-                                                       interop->cgr[index],
-                                                       0,0);
+      cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[index],
+							interop->cgr[index],
+							0,0));
     }
 
   // unmap graphics resources
-  cuda_err = cudaGraphicsUnmapResources(interop->count,interop->cgr,0);
+  cuda_err = cuda(GraphicsUnmapResources(interop->count,interop->cgr,0));
 
   return cuda_err;
 }
@@ -176,7 +177,7 @@ pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
     return cudaSuccess;
 
   // map graphics resources
-  return cudaGraphicsMapResources(1,&interop->cgr[interop->index],stream);
+  return cuda(GraphicsMapResources(1,&interop->cgr[interop->index],stream));
 }
 
 cudaError_t
@@ -185,7 +186,7 @@ pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
   if (!interop->multi_gpu)
     return cudaSuccess;
 
-  return cudaGraphicsUnmapResources(1,&interop->cgr[interop->index],stream);
+  return cuda(GraphicsUnmapResources(1,&interop->cgr[interop->index],stream));
 }
 
 cudaError_t
@@ -198,9 +199,9 @@ pxl_interop_array_map(struct pxl_interop* const interop)
   cudaError_t cuda_err;
 
   // get a CUDA Array
-  cuda_err = cudaGraphicsSubResourceGetMappedArray(&interop->ca[interop->index],
-                                                   interop->cgr[interop->index],
-                                                   0,0);
+  cuda_err = cuda(GraphicsSubResourceGetMappedArray(&interop->ca[interop->index],
+						    interop->cgr[interop->index],
+						    0,0));
   return cuda_err;
 }
 

diff --git a/kernel.cu b/kernel.cu
@@ -1,4 +1,24 @@
-// -*- compile-command: "nvcc -m 32 -arch sm_30 -Xptxas=-v -cubin kernel.cu"; -*-
+// -*- compile-command: "nvcc arch sm_50 -Xptxas=-v -cubin kernel.cu"; -*-
+
+//
+//
+//
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "assert_cuda.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+//
+//
+//
+
+#define PXL_KERNEL_THREADS_PER_BLOCK  256 // enough for 4Kx2 monitor
 
 //
 //
@@ -69,8 +89,6 @@ pxl_kernel(const int width, const int height)
 //
 //
 
-#define PXL_KERNEL_THREADS_PER_BLOCK  256 // enough for 4Kx2 monitor
-
 extern "C"
 cudaError_t
 pxl_kernel_launcher(cudaArray_const_t array,
@@ -83,7 +101,7 @@ pxl_kernel_launcher(cudaArray_const_t array,
 
   // cuda_err = cudaEventRecord(event,stream);
 
-  cuda_err = cudaBindSurfaceToArray(surf,array);
+  cuda_err = cuda(BindSurfaceToArray(surf,array));
 
   if (cuda_err)
     return cuda_err;

diff --git a/main.c b/main.c
@@ -23,6 +23,7 @@
 //
 //
 
+#include "assert_cuda.h"
 #include "interop.h"
 
 //
@@ -170,39 +171,35 @@ main(int argc, char* argv[])
   //
   // INIT GLFW
   //
-
   GLFWwindow* window;
 
   pxl_glfw_init(&window,1024,1024);
 
   //
   // INIT CUDA
   //
-
   cudaError_t cuda_err;
 
   int gl_device_id,gl_device_count;
-  cuda_err = cudaGLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll);
+  cuda_err = cuda(GLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll));
 
   int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
-  cuda_err = cudaSetDevice(cuda_device_id);
+  cuda_err = cuda(SetDevice(cuda_device_id));
 
   //
   // MULTI-GPU?
   //
-
   const bool multi_gpu = gl_device_id != cuda_device_id;
 
   //
   // INFO
   //
-
   struct cudaDeviceProp props;
 
-  cuda_err = cudaGetDeviceProperties(&props,gl_device_id);
+  cuda_err = cuda(GetDeviceProperties(&props,gl_device_id));
   printf("GL   : %-24s (%2d)\n",props.name,props.multiProcessorCount);
 
-  cuda_err = cudaGetDeviceProperties(&props,cuda_device_id);
+  cuda_err = cuda(GetDeviceProperties(&props,cuda_device_id));
   printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);
 
   //
@@ -211,32 +208,14 @@ main(int argc, char* argv[])
   cudaStream_t stream;
   cudaEvent_t  event;
 
-  cuda_err = cudaStreamCreateWithFlags(&stream,cudaStreamDefault);   // optionally ignore default stream behavior
-  cuda_err = cudaEventCreateWithFlags(&event,cudaEventBlockingSync); // | cudaEventDisableTiming);
-
-  //
-  // UPDATE DEVICE LIMITS
-  //
-
-  // #ifdef _DEBUG
-  // check printf() FIFO limit
-  size_t fifo;
-
-  cuda_err = cudaDeviceGetLimit(&fifo,cudaLimitPrintfFifoSize);
-
-  printf("fifo = %Iu\n",fifo);
-
-  cuda_err = cudaDeviceSetLimit(cudaLimitPrintfFifoSize,fifo*16);
-  cuda_err = cudaDeviceGetLimit(&fifo,cudaLimitPrintfFifoSize);
-
-  printf("fifo = %Iu\n",fifo);
-  // #endif
+  cuda_err = cuda(StreamCreateWithFlags(&stream,cudaStreamDefault));   // optionally ignore default stream behavior
+  cuda_err = cuda(EventCreateWithFlags(&event,cudaEventBlockingSync)); // | cudaEventDisableTiming);
 
   //
   // CREATE INTEROP
   //
-
-  struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2); // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
+  // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
+  struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2);
 
   //
   // RESIZE INTEROP
@@ -253,27 +232,23 @@ main(int argc, char* argv[])
   //
   // SET USER POINTER AND CALLBACKS
   //
-
   glfwSetWindowUserPointer      (window,interop);
   glfwSetKeyCallback            (window,pxl_glfw_key_callback);
   glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);
 
   //
   // LOOP UNTIL DONE
   //
-
   while (!glfwWindowShouldClose(window))
     {
       //
       // MONITOR FPS
       //
-
       pxl_glfw_fps(window);
 
       //
       // EXECUTE CUDA KERNEL ON RENDER BUFFER
       //
-
       int         width,height;
       cudaArray_t cuda_array;
 
@@ -292,38 +267,34 @@ main(int argc, char* argv[])
       //
       // BLIT & SWAP FBO
       // 
-
       pxl_interop_blit(interop);
       // pxl_interop_clear(interop);
       pxl_interop_swap(interop);
 
       //
       // SWAP WINDOW
       //
-
       glfwSwapBuffers(window);
 
       //
       // PUMP/POLL/WAIT
       //
-
       glfwPollEvents(); // glfwWaitEvents();
     }
 
   //
   // CLEANUP
   //
-
   pxl_interop_destroy(interop);
 
   glfwDestroyWindow(window);
 
   glfwTerminate();
 
-  cudaDeviceReset();
+  cuda(DeviceReset());
 
   // missing some clean up here
-  
+
   exit(EXIT_SUCCESS);
 }
 

diff --git a/main.c b/main.c
@@ -182,11 +182,9 @@ main(int argc, char* argv[])
   cudaError_t cuda_err;
 
   int gl_device_id,gl_device_count;
-
   cuda_err = cudaGLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll);
 
   int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
-
   cuda_err = cudaSetDevice(cuda_device_id);
 
   //
@@ -216,6 +214,24 @@ main(int argc, char* argv[])
   cuda_err = cudaStreamCreateWithFlags(&stream,cudaStreamDefault);   // optionally ignore default stream behavior
   cuda_err = cudaEventCreateWithFlags(&event,cudaEventBlockingSync); // | cudaEventDisableTiming);
 
+  //
+  // UPDATE DEVICE LIMITS
+  //
+
+  // #ifdef _DEBUG
+  // check printf() FIFO limit
+  size_t fifo;
+
+  cuda_err = cudaDeviceGetLimit(&fifo,cudaLimitPrintfFifoSize);
+
+  printf("fifo = %Iu\n",fifo);
+
+  cuda_err = cudaDeviceSetLimit(cudaLimitPrintfFifoSize,fifo*16);
+  cuda_err = cudaDeviceGetLimit(&fifo,cudaLimitPrintfFifoSize);
+
+  printf("fifo = %Iu\n",fifo);
+  // #endif
+
   //
   // CREATE INTEROP
   //

diff --git a/main.c b/main.c
@@ -220,7 +220,7 @@ main(int argc, char* argv[])
   // CREATE INTEROP
   //
 
-  struct pxl_interop* const interop = pxl_interop_create(false /*multi_gpu*/,2); // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
+  struct pxl_interop* const interop = pxl_interop_create(true /*multi_gpu*/,2); // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
 
   //
   // RESIZE INTEROP

diff --git a/interop.c b/interop.c
@@ -49,7 +49,7 @@ pxl_interop_create(const bool multi_gpu, const int fbo_count)
 {
   struct pxl_interop* const interop = calloc(1,sizeof(*interop));
 
-  interop->multi_gpu = true; // multi_gpu;
+  interop->multi_gpu = multi_gpu;
   interop->count     = fbo_count;
   interop->index     = 0;
 

diff --git a/kernel.cu b/kernel.cu
@@ -29,7 +29,7 @@ union pxl_rgbx_24
 extern "C"
 __global__
 void
-pxl_kernel(const int width, const int height, const int index)
+pxl_kernel(const int width, const int height)
 {
   // pixel coordinates
   const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
@@ -54,7 +54,7 @@ pxl_kernel(const int width, const int height, const int index)
 #else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT
 
   const bool        border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1);
-  union pxl_rgbx_24 rgbx   = { border ? 0xFF0000FF : index==0 ? 0xFF000000 : 0xFFFFFFFF };
+  union pxl_rgbx_24 rgbx   = { border ? 0xFF0000FF : 0xFF000000 };
   
 #endif
 
@@ -76,19 +76,27 @@ cudaError_t
 pxl_kernel_launcher(cudaArray_const_t array,
                     const int         width,
                     const int         height,
-                    const int         index,
+                    cudaEvent_t       event,
                     cudaStream_t      stream)
 {
-  cudaError_t cuda_err = cudaBindSurfaceToArray(surf,array);
+  cudaError_t cuda_err;
+
+  // cuda_err = cudaEventRecord(event,stream);
+
+  cuda_err = cudaBindSurfaceToArray(surf,array);
 
   if (cuda_err)
     return cuda_err;
 
   const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;
 
+  // cuda_err = cudaEventRecord(event,stream);
+
   if (blocks > 0)
-    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height,index);
+    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height);
 
+  // cuda_err = cudaStreamWaitEvent(stream,event,0);
+
   return cudaSuccess;
 }
 

diff --git a/main.c b/main.c
@@ -157,7 +157,7 @@ cudaError_t
 pxl_kernel_launcher(cudaArray_const_t array, 
                     const int         width, 
                     const int         height,
-                    const int         index,
+                    cudaEvent_t       event,
                     cudaStream_t      stream);
 
 //
@@ -207,11 +207,20 @@ main(int argc, char* argv[])
   cuda_err = cudaGetDeviceProperties(&props,cuda_device_id);
   printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);
 
+  //
+  // CREATE CUDA STREAM & EVENT
+  //
+  cudaStream_t stream;
+  cudaEvent_t  event;
+
+  cuda_err = cudaStreamCreateWithFlags(&stream,cudaStreamDefault);   // optionally ignore default stream behavior
+  cuda_err = cudaEventCreateWithFlags(&event,cudaEventBlockingSync); // | cudaEventDisableTiming);
+
   //
   // CREATE INTEROP
   //
 
-  struct pxl_interop* const interop = pxl_interop_create(multi_gpu,2);
+  struct pxl_interop* const interop = pxl_interop_create(false /*multi_gpu*/,2); // TESTING -- DO NOT SET TO FALSE, ONLY TRUE IS RELIABLE
 
   //
   // RESIZE INTEROP
@@ -254,21 +263,22 @@ main(int argc, char* argv[])
 
       pxl_interop_size_get(interop,&width,&height);
 
-      cuda_err = pxl_interop_map(interop,0);
+      cuda_err = pxl_interop_map(interop,stream);
 
       cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
-                                     width,height,
-                                     pxl_interop_index_get(interop),
-                                     0);
+                                     width,
+                                     height,
+                                     event,
+                                     stream);
 
-      cuda_err = pxl_interop_unmap(interop,0);
+      cuda_err = pxl_interop_unmap(interop,stream);
 
       //
       // BLIT & SWAP FBO
       // 
 
       pxl_interop_blit(interop);
-      pxl_interop_clear(interop);
+      // pxl_interop_clear(interop);
       pxl_interop_swap(interop);
 
       //

diff --git a/interop.c b/interop.c
@@ -49,7 +49,7 @@ pxl_interop_create(const bool multi_gpu, const int fbo_count)
 {
   struct pxl_interop* const interop = calloc(1,sizeof(*interop));
 
-  interop->multi_gpu = multi_gpu;
+  interop->multi_gpu = true; // multi_gpu;
   interop->count     = fbo_count;
   interop->index     = 0;
 
@@ -238,16 +238,12 @@ void
 pxl_interop_clear(struct pxl_interop* const interop)
 {
   /*
-  const GLenum draw_buffer[] = { GL_COLOR_ATTACHMENT0 };
-  const GLuint clear_color[] = { 255, 0, 0, 255 };
-                       
-  glNamedFramebufferDrawBuffers(interop->fb0,1,draw_buffer);
-  glClearNamedFramebufferuiv(interop->fb0,GL_COLOR,0,clear_color);
-  */
-
   static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
-
   glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments);
+  */
+
+  const GLfloat clear_color[] = { 1.0f, 1.0f, 1.0f, 1.0f };
+  glClearNamedFramebufferfv(interop->fb[interop->index],GL_COLOR,0,clear_color);
 }
 
 //

diff --git a/main.c b/main.c
@@ -268,6 +268,7 @@ main(int argc, char* argv[])
       // 
 
       pxl_interop_blit(interop);
+      pxl_interop_clear(interop);
       pxl_interop_swap(interop);
 
       //

diff --git a/interop.c b/interop.c
@@ -47,8 +47,6 @@ struct pxl_interop
 struct pxl_interop*
 pxl_interop_create(const bool multi_gpu, const int fbo_count)
 {
-  cudaError_t cuda_err;
-
   struct pxl_interop* const interop = calloc(1,sizeof(*interop));
 
   interop->multi_gpu = multi_gpu;

diff --git a/interop.c b/interop.c
@@ -193,6 +193,10 @@ pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
 cudaError_t
 pxl_interop_array_map(struct pxl_interop* const interop)
 {
+  //
+  // FIXME -- IS THIS EVEN NEEDED?
+  //
+
   cudaError_t cuda_err;
 
   // get a CUDA Array

diff --git a/main.c b/main.c
@@ -256,8 +256,6 @@ main(int argc, char* argv[])
 
       cuda_err = pxl_interop_map(interop,0);
 
-      // cuda_err = pxl_interop_array_map(interop); // NOT NEEDED ANYMORE?
-
       cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
                                      width,height,
                                      pxl_interop_index_get(interop),

diff --git a/interop.c b/interop.c
@@ -20,6 +20,9 @@
 
 struct pxl_interop
 {
+  // split GPUs?
+  bool                    multi_gpu;
+
   // number of fbo's
   int                     count;
   int                     index;
@@ -42,14 +45,15 @@ struct pxl_interop
 //
 
 struct pxl_interop*
-pxl_interop_create(const int fbo_count)
+pxl_interop_create(const bool multi_gpu, const int fbo_count)
 {
   cudaError_t cuda_err;
 
   struct pxl_interop* const interop = calloc(1,sizeof(*interop));
 
-  interop->count = fbo_count;
-  interop->index = 0;
+  interop->multi_gpu = multi_gpu;
+  interop->count     = fbo_count;
+  interop->index     = 0;
 
   // allocate arrays
   interop->fb  = calloc(fbo_count,sizeof(*(interop->fb )));
@@ -170,22 +174,20 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
 cudaError_t
 pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
 {
-  cudaError_t cuda_err;
-
+  if (!interop->multi_gpu)
+    return cudaSuccess;
+
   // map graphics resources
-  cuda_err = cudaGraphicsMapResources(1,&interop->cgr[interop->index],stream);
-
-  return cuda_err;
+  return cudaGraphicsMapResources(1,&interop->cgr[interop->index],stream);
 }
 
 cudaError_t
 pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
 {
-  cudaError_t cuda_err;
-
-  cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr[interop->index],stream);
-
-  return cuda_err;
+  if (!interop->multi_gpu)
+    return cudaSuccess;
+
+  return cudaGraphicsUnmapResources(1,&interop->cgr[interop->index],stream);
 }
 
 cudaError_t

diff --git a/interop.h b/interop.h
@@ -10,13 +10,14 @@
 //
 
 #include <cuda_runtime.h>
+#include <stdbool.h>
 
 //
 //
 //
 
 struct pxl_interop*
-pxl_interop_create(const int fbo_count);
+pxl_interop_create(const bool multi_gpu, const int fbo_count);
 
 void
 pxl_interop_destroy(struct pxl_interop* const interop);

diff --git a/main.c b/main.c
@@ -211,7 +211,7 @@ main(int argc, char* argv[])
   // CREATE INTEROP
   //
 
-  struct pxl_interop* const interop = pxl_interop_create(2);
+  struct pxl_interop* const interop = pxl_interop_create(multi_gpu,2);
 
   //
   // RESIZE INTEROP
@@ -237,8 +237,6 @@ main(int argc, char* argv[])
   // LOOP UNTIL DONE
   //
 
-  int step = 0;
-
   while (!glfwWindowShouldClose(window))
     {
       //
@@ -256,21 +254,16 @@ main(int argc, char* argv[])
 
       pxl_interop_size_get(interop,&width,&height);
 
-      if (multi_gpu)
-        {
-          cuda_err = pxl_interop_map(interop,0);
-          // cuda_err = pxl_interop_array_map(interop); // NOT NEEDED?
-        }
+      cuda_err = pxl_interop_map(interop,0);
+
+      // cuda_err = pxl_interop_array_map(interop); // NOT NEEDED ANYMORE?
 
       cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
                                      width,height,
                                      pxl_interop_index_get(interop),
                                      0);
 
-      if (multi_gpu)
-        {
-          cuda_err = pxl_interop_unmap(interop,0);
-        }
+      cuda_err = pxl_interop_unmap(interop,0);
 
       //
       // BLIT & SWAP FBO

diff --git a/interop.c b/interop.c
@@ -35,9 +35,6 @@ struct pxl_interop
   // CUDA resources
   cudaGraphicsResource_t* cgr;
   cudaArray_t*            ca;
-
-  // CUDA streams
-  cudaStream_t*           stream;
 };
 
 //
@@ -55,11 +52,10 @@ pxl_interop_create(const int fbo_count)
   interop->index = 0;
 
   // allocate arrays
-  interop->fb     = calloc(fbo_count,sizeof(*(interop->fb )));
-  interop->rb     = calloc(fbo_count,sizeof(*(interop->rb )));
-  interop->cgr    = calloc(fbo_count,sizeof(*(interop->cgr)));
-  interop->ca     = calloc(fbo_count,sizeof(*(interop->ca)));
-  interop->stream = calloc(fbo_count,sizeof(*(interop->stream)));
+  interop->fb  = calloc(fbo_count,sizeof(*(interop->fb )));
+  interop->rb  = calloc(fbo_count,sizeof(*(interop->rb )));
+  interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
+  interop->ca  = calloc(fbo_count,sizeof(*(interop->ca)));
 
   // render buffer object w/a color buffer
   glCreateRenderbuffers(fbo_count,interop->rb);
@@ -74,8 +70,6 @@ pxl_interop_create(const int fbo_count)
                                      GL_COLOR_ATTACHMENT0,
                                      GL_RENDERBUFFER,
                                      interop->rb[index]);
-
-      cuda_err = cudaStreamCreate(&interop->stream[index]);
     }
 
   // return it
@@ -93,8 +87,6 @@ pxl_interop_destroy(struct pxl_interop* const interop)
     {
       if (interop->cgr[index] != NULL)
         cuda_err = cudaGraphicsUnregisterResource(interop->cgr[index]);
-
-      cuda_err = cudaStreamDestroy(interop->stream[index]);
     }
 
   // delete rbo's
@@ -108,7 +100,6 @@ pxl_interop_destroy(struct pxl_interop* const interop)
   free(interop->rb);
   free(interop->cgr);
   free(interop->ca);
-  free(interop->stream);
 
   // free interop
   free(interop);
@@ -177,24 +168,22 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
 //
 
 cudaError_t
-pxl_interop_map(struct pxl_interop* const interop)
+pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
 {
   cudaError_t cuda_err;
 
   // map graphics resources
-  cuda_err = cudaGraphicsMapResources(1,&interop->cgr[interop->index],
-                                      interop->stream[interop->index]);
+  cuda_err = cudaGraphicsMapResources(1,&interop->cgr[interop->index],stream);
 
   return cuda_err;
 }
 
 cudaError_t
-pxl_interop_unmap(struct pxl_interop* const interop)
+pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
 {
   cudaError_t cuda_err;
 
-  cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr[interop->index],
-                                        interop->stream[interop->index]);
+  cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr[interop->index],stream);
 
   return cuda_err;
 }
@@ -221,12 +210,6 @@ pxl_interop_array_get(struct pxl_interop* const interop)
   return interop->ca[interop->index];
 }
 
-cudaStream_t
-pxl_interop_stream_get(struct pxl_interop* const interop)
-{
-  return interop->stream[interop->index];
-}
-
 int
 pxl_interop_index_get(struct pxl_interop* const interop)
 {

diff --git a/interop.h b/interop.h
@@ -36,13 +36,13 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
 //
 
 cudaError_t
-pxl_interop_map(struct pxl_interop* const interop);
+pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream);
 
 cudaError_t
-pxl_interop_array_map(struct pxl_interop* const interop);
+pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);
 
 cudaError_t
-pxl_interop_unmap(struct pxl_interop* const interop);
+pxl_interop_array_map(struct pxl_interop* const interop);
 
 //
 //

diff --git a/main.c b/main.c
@@ -258,27 +258,25 @@ main(int argc, char* argv[])
 
       if (multi_gpu)
         {
-          cuda_err = pxl_interop_map(interop);
+          cuda_err = pxl_interop_map(interop,0);
           // cuda_err = pxl_interop_array_map(interop); // NOT NEEDED?
         }
 
       cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
                                      width,height,
                                      pxl_interop_index_get(interop),
-                                     pxl_interop_stream_get(interop));
+                                     0);
 
       if (multi_gpu)
         {
-          cuda_err = pxl_interop_unmap(interop);
-          // cuda_err = cudaStreamSynchronize(pxl_interop_stream_get(interop)); // NOT NEEDED?
+          cuda_err = pxl_interop_unmap(interop,0);
         }
 
       //
       // BLIT & SWAP FBO
       // 
 
       pxl_interop_blit(interop);
-
       pxl_interop_swap(interop);
 
       //

diff --git a/kernel.cu b/kernel.cu
@@ -37,6 +37,7 @@ pxl_kernel(const int width, const int height, const int index)
   const int y   = idx / width;
 
 #if 1
+
   // pixel color
   const int          t    = (unsigned int)clock() / 1100000; // 1.1 GHz
   const int          xt   = (idx + t) % width;

diff --git a/interop.c b/interop.c
@@ -176,16 +176,6 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
 //
 //
 
-cudaStream_t
-pxl_interop_stream_get(struct pxl_interop* const interop)
-{
-  return interop->stream[interop->index];
-}
-
-//
-//
-//
-
 cudaError_t
 pxl_interop_map(struct pxl_interop* const interop)
 {
@@ -231,6 +221,18 @@ pxl_interop_array_get(struct pxl_interop* const interop)
   return interop->ca[interop->index];
 }
 
+cudaStream_t
+pxl_interop_stream_get(struct pxl_interop* const interop)
+{
+  return interop->stream[interop->index];
+}
+
+int
+pxl_interop_index_get(struct pxl_interop* const interop)
+{
+  return interop->index;
+}
+
 //
 //
 //

diff --git a/interop.h b/interop.h
@@ -35,13 +35,6 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
 //
 //
 
-cudaStream_t
-pxl_interop_stream_get(struct pxl_interop* const interop);
-
-//
-//
-//
-
 cudaError_t
 pxl_interop_map(struct pxl_interop* const interop);
 
@@ -58,6 +51,12 @@ pxl_interop_unmap(struct pxl_interop* const interop);
 cudaArray_const_t
 pxl_interop_array_get(struct pxl_interop* const interop);
 
+cudaStream_t
+pxl_interop_stream_get(struct pxl_interop* const interop);
+
+int
+pxl_interop_index_get(struct pxl_interop* const interop);
+
 //
 //
 //

diff --git a/kernel.cu b/kernel.cu
@@ -29,7 +29,7 @@ union pxl_rgbx_24
 extern "C"
 __global__
 void
-pxl_kernel(const int width, const int height)
+pxl_kernel(const int width, const int height, const int index)
 {
   // pixel coordinates
   const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
@@ -38,7 +38,7 @@ pxl_kernel(const int width, const int height)
 
 #if 1
   // pixel color
-  const int          t    = (unsigned int)clock() / 1250000; // 1.25 GHz
+  const int          t    = (unsigned int)clock() / 1100000; // 1.1 GHz
   const int          xt   = (idx + t) % width;
   const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f);
   const unsigned int bar  = ((y + t) / 32) & 3;
@@ -53,7 +53,7 @@ pxl_kernel(const int width, const int height)
 #else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT
 
   const bool        border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1);
-  union pxl_rgbx_24 rgbx   = { border ? 0xFF0000FF : 0xFF000000 };
+  union pxl_rgbx_24 rgbx   = { border ? 0xFF0000FF : index==0 ? 0xFF000000 : 0xFFFFFFFF };
   
 #endif
 
@@ -75,6 +75,7 @@ cudaError_t
 pxl_kernel_launcher(cudaArray_const_t array,
                     const int         width,
                     const int         height,
+                    const int         index,
                     cudaStream_t      stream)
 {
   cudaError_t cuda_err = cudaBindSurfaceToArray(surf,array);
@@ -85,7 +86,7 @@ pxl_kernel_launcher(cudaArray_const_t array,
   const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;
 
   if (blocks > 0)
-    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height);
+    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height,index);
 
   return cudaSuccess;
 }

diff --git a/main.c b/main.c
@@ -157,6 +157,7 @@ cudaError_t
 pxl_kernel_launcher(cudaArray_const_t array, 
                     const int         width, 
                     const int         height,
+                    const int         index,
                     cudaStream_t      stream);
 
 //
@@ -258,28 +259,30 @@ main(int argc, char* argv[])
       if (multi_gpu)
         {
           cuda_err = pxl_interop_map(interop);
-          // cuda_err = pxl_interop_array_map(interop);
+          // cuda_err = pxl_interop_array_map(interop); // NOT NEEDED?
         }
 
       cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
                                      width,height,
+                                     pxl_interop_index_get(interop),
                                      pxl_interop_stream_get(interop));
 
       if (multi_gpu)
         {
           cuda_err = pxl_interop_unmap(interop);
-          // cuda_err = cudaStreamSynchronize(pxl_interop_stream_get(interop));
+          // cuda_err = cudaStreamSynchronize(pxl_interop_stream_get(interop)); // NOT NEEDED?
         }
 
       //
-      // BLIT
+      // BLIT & SWAP FBO
       // 
 
       pxl_interop_blit(interop);
+
       pxl_interop_swap(interop);
 
       //
-      // SWAP
+      // SWAP WINDOW
       //
 
       glfwSwapBuffers(window);

diff --git a/main.c b/main.c
@@ -268,7 +268,7 @@ main(int argc, char* argv[])
       if (multi_gpu)
         {
           cuda_err = pxl_interop_unmap(interop);
-          cuda_err = cudaStreamSynchronize(pxl_interop_stream_get(interop));
+          // cuda_err = cudaStreamSynchronize(pxl_interop_stream_get(interop));
         }
 
       //

diff --git a/main.c b/main.c
@@ -258,7 +258,7 @@ main(int argc, char* argv[])
       if (multi_gpu)
         {
           cuda_err = pxl_interop_map(interop);
-          cuda_err = pxl_interop_array_map(interop);
+          // cuda_err = pxl_interop_array_map(interop);
         }
 
       cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),

diff --git a/interop.c b/interop.c
@@ -20,6 +20,7 @@
 
 struct pxl_interop
 {
+  // number of fbo's
   int                     count;
   int                     index;
 

diff --git a/interop.c b/interop.c
@@ -35,8 +35,8 @@ struct pxl_interop
   cudaGraphicsResource_t* cgr;
   cudaArray_t*            ca;
 
-  // CUDA stream
-  cudaStream_t            stream;
+  // CUDA streams
+  cudaStream_t*           stream;
 };
 
 //
@@ -46,33 +46,36 @@ struct pxl_interop
 struct pxl_interop*
 pxl_interop_create(const int fbo_count)
 {
+  cudaError_t cuda_err;
+
   struct pxl_interop* const interop = calloc(1,sizeof(*interop));
 
   interop->count = fbo_count;
+  interop->index = 0;
 
   // allocate arrays
-  interop->fb  = calloc(fbo_count,sizeof(*(interop->fb )));
-  interop->rb  = calloc(fbo_count,sizeof(*(interop->rb )));
-  interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
-  interop->ca  = calloc(fbo_count,sizeof(*(interop->ca)));
+  interop->fb     = calloc(fbo_count,sizeof(*(interop->fb )));
+  interop->rb     = calloc(fbo_count,sizeof(*(interop->rb )));
+  interop->cgr    = calloc(fbo_count,sizeof(*(interop->cgr)));
+  interop->ca     = calloc(fbo_count,sizeof(*(interop->ca)));
+  interop->stream = calloc(fbo_count,sizeof(*(interop->stream)));
 
   // render buffer object w/a color buffer
   glCreateRenderbuffers(fbo_count,interop->rb);
 
   // frame buffer object
   glCreateFramebuffers(fbo_count,interop->fb);
 
-  // attach rb0 to the fb0
+  // attach rbo to fbo
   for (int index=0; index<fbo_count; index++)  
     {
       glNamedFramebufferRenderbuffer(interop->fb[index],
                                      GL_COLOR_ATTACHMENT0,
                                      GL_RENDERBUFFER,
                                      interop->rb[index]);
-    }
 
-  // create an interop stream 
-  cudaError_t cuda_err = cudaStreamCreate(&interop->stream);
+      cuda_err = cudaStreamCreate(&interop->stream[index]);
+    }
 
   // return it
   return interop;
@@ -82,23 +85,30 @@ pxl_interop_create(const int fbo_count)
 void
 pxl_interop_destroy(struct pxl_interop* const interop)
 {
-  // resize color buffer
+  cudaError_t cuda_err;
+
+  // unregister CUDA resources
   for (int index=0; index<interop->count; index++)
     {
-      // unregister CUDA resource
       if (interop->cgr[index] != NULL)
-        cudaGraphicsUnregisterResource(interop->cgr[index]);
-    }
+        cuda_err = cudaGraphicsUnregisterResource(interop->cgr[index]);
 
-  // destroy stream
-  cudaStreamDestroy(interop->stream);
+      cuda_err = cudaStreamDestroy(interop->stream[index]);
+    }
 
   // delete rbo's
   glDeleteRenderbuffers(interop->count,interop->rb);
 
   // delete fbo's
   glDeleteFramebuffers(interop->count,interop->fb);
 
+  // free buffers and resources
+  free(interop->fb);
+  free(interop->rb);
+  free(interop->cgr);
+  free(interop->ca);
+  free(interop->stream);
+
   // free interop
   free(interop);
 }
@@ -126,29 +136,30 @@ pxl_interop_size_set(struct pxl_interop* const interop, const int width, const i
       // resize rbo
       glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);
 
-      // cudaGraphicsResource_t cgr;      
+      // probe fbo status
+      // glCheckNamedFramebufferStatus(interop->fb[index],0);
 
       // register rbo
       cuda_err = cudaGraphicsGLRegisterImage(&interop->cgr[index],
                                              interop->rb[index],
                                              GL_RENDERBUFFER,
                                              cudaGraphicsRegisterFlagsSurfaceLoadStore | 
                                              cudaGraphicsRegisterFlagsWriteDiscard);
+    }
 
-      // map graphics resource
-      cuda_err = cudaGraphicsMapResources(1,&interop->cgr[index],interop->stream);
+  // map graphics resources
+  cuda_err = cudaGraphicsMapResources(interop->count,interop->cgr,0);
 
-      // get a CUDA Array
+  // get CUDA Array refernces
+  for (int index=0; index<interop->count; index++)
+    {
       cuda_err = cudaGraphicsSubResourceGetMappedArray(&interop->ca[index],
                                                        interop->cgr[index],
                                                        0,0);
-
-      // unmap graphics resource
-      cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr[index],interop->stream);
-
-      // probe fbo status
-      glCheckNamedFramebufferStatus(interop->fb[index],0);
     }
+
+  // unmap graphics resources
+  cuda_err = cudaGraphicsUnmapResources(interop->count,interop->cgr,0);
 
   return cuda_err;
 }
@@ -164,16 +175,59 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
 //
 //
 
-cudaArray_const_t
-pxl_interop_array_get(struct pxl_interop* const interop)
+cudaStream_t
+pxl_interop_stream_get(struct pxl_interop* const interop)
 {
-  return interop->ca[interop->index];
+  return interop->stream[interop->index];
 }
 
-cudaStream_t
-pxl_interop_stream_get(struct pxl_interop* const interop)
+//
+//
+//
+
+cudaError_t
+pxl_interop_map(struct pxl_interop* const interop)
 {
-  return interop->stream;
+  cudaError_t cuda_err;
+
+  // map graphics resources
+  cuda_err = cudaGraphicsMapResources(1,&interop->cgr[interop->index],
+                                      interop->stream[interop->index]);
+
+  return cuda_err;
+}
+
+cudaError_t
+pxl_interop_unmap(struct pxl_interop* const interop)
+{
+  cudaError_t cuda_err;
+
+  cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr[interop->index],
+                                        interop->stream[interop->index]);
+
+  return cuda_err;
+}
+
+cudaError_t
+pxl_interop_array_map(struct pxl_interop* const interop)
+{
+  cudaError_t cuda_err;
+
+  // get a CUDA Array
+  cuda_err = cudaGraphicsSubResourceGetMappedArray(&interop->ca[interop->index],
+                                                   interop->cgr[interop->index],
+                                                   0,0);
+  return cuda_err;
+}
+
+//
+//
+//
+
+cudaArray_const_t
+pxl_interop_array_get(struct pxl_interop* const interop)
+{
+  return interop->ca[interop->index];
 }
 
 //

diff --git a/interop.h b/interop.h
@@ -38,6 +38,23 @@ pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* c
 cudaStream_t
 pxl_interop_stream_get(struct pxl_interop* const interop);
 
+//
+//
+//
+
+cudaError_t
+pxl_interop_map(struct pxl_interop* const interop);
+
+cudaError_t
+pxl_interop_array_map(struct pxl_interop* const interop);
+
+cudaError_t
+pxl_interop_unmap(struct pxl_interop* const interop);
+
+//
+//
+//
+
 cudaArray_const_t
 pxl_interop_array_get(struct pxl_interop* const interop);
 

diff --git a/main.c b/main.c
@@ -11,6 +11,7 @@
 
 #include <stdlib.h>
 #include <stdio.h>
+#include <stdbool.h>
 
 //
 //
@@ -153,7 +154,10 @@ pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height)
 //
 
 cudaError_t
-pxl_kernel_launcher(cudaArray_const_t array, const int width, const int height, cudaStream_t stream);
+pxl_kernel_launcher(cudaArray_const_t array, 
+                    const int         width, 
+                    const int         height,
+                    cudaStream_t      stream);
 
 //
 //
@@ -173,6 +177,7 @@ main(int argc, char* argv[])
   //
   // INIT CUDA
   //
+
   cudaError_t cuda_err;
 
   int gl_device_id,gl_device_count;
@@ -183,9 +188,16 @@ main(int argc, char* argv[])
 
   cuda_err = cudaSetDevice(cuda_device_id);
 
+  //
+  // MULTI-GPU?
+  //
+
+  const bool multi_gpu = gl_device_id != cuda_device_id;
+
   //
   // INFO
   //
+
   struct cudaDeviceProp props;
 
   cuda_err = cudaGetDeviceProperties(&props,gl_device_id);
@@ -194,14 +206,6 @@ main(int argc, char* argv[])
   cuda_err = cudaGetDeviceProperties(&props,cuda_device_id);
   printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);
 
-  //
-  // CREATE A CUDA STREAM
-  //
-
-  cudaStream_t stream;
-
-  cuda_err = cudaStreamCreate(&stream);
-
   //
   // CREATE INTEROP
   //
@@ -231,6 +235,8 @@ main(int argc, char* argv[])
   //
   // LOOP UNTIL DONE
   //
+
+  int step = 0;
 
   while (!glfwWindowShouldClose(window))
     {
@@ -244,27 +250,37 @@ main(int argc, char* argv[])
       // EXECUTE CUDA KERNEL ON RENDER BUFFER
       //
 
-      int width,height;
+      int         width,height;
+      cudaArray_t cuda_array;
 
       pxl_interop_size_get(interop,&width,&height);
 
+      if (multi_gpu)
+        {
+          cuda_err = pxl_interop_map(interop);
+          cuda_err = pxl_interop_array_map(interop);
+        }
+
       cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
                                      width,height,
                                      pxl_interop_stream_get(interop));
 
-      // cuda_err = cudaStreamSynchronize(stream);
+      if (multi_gpu)
+        {
+          cuda_err = pxl_interop_unmap(interop);
+          cuda_err = cudaStreamSynchronize(pxl_interop_stream_get(interop));
+        }
 
       //
       // BLIT
       // 
 
       pxl_interop_blit(interop);
+      pxl_interop_swap(interop);
 
       //
       // SWAP
       //
-
-      pxl_interop_swap(interop);
 
       glfwSwapBuffers(window);
 

diff --git a/interop.c b/interop.c
@@ -20,38 +20,59 @@
 
 struct pxl_interop
 {
+  int                     count;
+  int                     index;
+
   // w x h
-  int                    width;
-  int                    height;
+  int                     width;
+  int                     height;
 
   // GL buffers
-  GLuint                 fb0;
-  GLuint                 rb0;
+  GLuint*                 fb;
+  GLuint*                 rb;
 
   // CUDA resources
-  cudaGraphicsResource_t cgr0;
+  cudaGraphicsResource_t* cgr;
+  cudaArray_t*            ca;
+
+  // CUDA stream
+  cudaStream_t            stream;
 };
 
 //
 //
 //
 
 struct pxl_interop*
-pxl_interop_create()
+pxl_interop_create(const int fbo_count)
 {
-  struct pxl_interop* const interop = (struct pxl_interop*)malloc(sizeof(struct pxl_interop));
+  struct pxl_interop* const interop = calloc(1,sizeof(*interop));
+
+  interop->count = fbo_count;
 
-  // init cuda graphics resource
-  interop->cgr0 = NULL;
+  // allocate arrays
+  interop->fb  = calloc(fbo_count,sizeof(*(interop->fb )));
+  interop->rb  = calloc(fbo_count,sizeof(*(interop->rb )));
+  interop->cgr = calloc(fbo_count,sizeof(*(interop->cgr)));
+  interop->ca  = calloc(fbo_count,sizeof(*(interop->ca)));
 
   // render buffer object w/a color buffer
-  glCreateRenderbuffers(1,&interop->rb0);
+  glCreateRenderbuffers(fbo_count,interop->rb);
 
   // frame buffer object
-  glCreateFramebuffers(1,&interop->fb0);
+  glCreateFramebuffers(fbo_count,interop->fb);
 
   // attach rb0 to the fb0
-  glNamedFramebufferRenderbuffer(interop->fb0,GL_COLOR_ATTACHMENT0,GL_RENDERBUFFER,interop->rb0);
+  for (int index=0; index<fbo_count; index++)  
+    {
+      glNamedFramebufferRenderbuffer(interop->fb[index],
+                                     GL_COLOR_ATTACHMENT0,
+                                     GL_RENDERBUFFER,
+                                     interop->rb[index]);
+    }
+
+  // create an interop stream 
+  cudaError_t cuda_err = cudaStreamCreate(&interop->stream);
 
   // return it
   return interop;
@@ -61,15 +82,22 @@ pxl_interop_create()
 void
 pxl_interop_destroy(struct pxl_interop* const interop)
 {
-  // unregister CUDA resource
-  if (interop->cgr0 != NULL)
-    cudaGraphicsUnregisterResource(interop->cgr0);
+  // resize color buffer
+  for (int index=0; index<interop->count; index++)
+    {
+      // unregister CUDA resource
+      if (interop->cgr[index] != NULL)
+        cudaGraphicsUnregisterResource(interop->cgr[index]);
+    }
 
-  // render buffer object w/a color buffer
-  glDeleteRenderbuffers(1,&interop->rb0);
+  // destroy stream
+  cudaStreamDestroy(interop->stream);
 
-  // frame buffer object
-  glDeleteFramebuffers(1,&interop->fb0);
+  // delete rbo's
+  glDeleteRenderbuffers(interop->count,interop->rb);
+
+  // delete fbo's
+  glDeleteFramebuffers(interop->count,interop->fb);
 
   // free interop
   free(interop);
@@ -80,39 +108,53 @@ pxl_interop_destroy(struct pxl_interop* const interop)
 //
 
 cudaError_t
-pxl_interop_resize(struct pxl_interop* const interop, const int width, const int height)
+pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height)
 {
+  cudaError_t cuda_err = cudaSuccess;
+
   // save new size
   interop->width  = width;
   interop->height = height;
 
   // resize color buffer
-  glNamedRenderbufferStorage(interop->rb0,GL_RGBA8,width,height);
-
-  //
-  // REGISTER RBO WITH CUDA
-  //
+  for (int index=0; index<interop->count; index++)
+    {
+      // unregister resource
+      if (interop->cgr[index] != NULL)
+        cuda_err = cudaGraphicsUnregisterResource(interop->cgr[index]);
+
+      // resize rbo
+      glNamedRenderbufferStorage(interop->rb[index],GL_RGBA8,width,height);
+
+      // cudaGraphicsResource_t cgr;      
+
+      // register rbo
+      cuda_err = cudaGraphicsGLRegisterImage(&interop->cgr[index],
+                                             interop->rb[index],
+                                             GL_RENDERBUFFER,
+                                             cudaGraphicsRegisterFlagsSurfaceLoadStore | 
+                                             cudaGraphicsRegisterFlagsWriteDiscard);
+
+      // map graphics resource
+      cuda_err = cudaGraphicsMapResources(1,&interop->cgr[index],interop->stream);
+
+      // get a CUDA Array
+      cuda_err = cudaGraphicsSubResourceGetMappedArray(&interop->ca[index],
+                                                       interop->cgr[index],
+                                                       0,0);
+
+      // unmap graphics resource
+      cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr[index],interop->stream);
+
+      // probe fbo status
+      glCheckNamedFramebufferStatus(interop->fb[index],0);
+    }
 
-  cudaError_t cuda_err;
-
-  // unregister
-  if (interop->cgr0 != NULL)
-    cuda_err = cudaGraphicsUnregisterResource(interop->cgr0);
-
-  // register image
-  cuda_err = cudaGraphicsGLRegisterImage(&interop->cgr0,
-                                         interop->rb0,
-                                         GL_RENDERBUFFER,
-                                         cudaGraphicsRegisterFlagsSurfaceLoadStore | cudaGraphicsRegisterFlagsWriteDiscard);
-
-  // diddle some flags
-  cuda_err = cudaGraphicsResourceSetMapFlags(interop->cgr0,cudaGraphicsMapFlagsWriteDiscard);
-
   return cuda_err;
 }
 
 void
-pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* const height)
+pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height)
 {
   *width  = interop->width;
   *height = interop->height;
@@ -122,38 +164,28 @@ pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* c
 //
 //
 
-cudaError_t
-pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
+cudaArray_const_t
+pxl_interop_array_get(struct pxl_interop* const interop)
 {
-  cudaError_t cuda_err;
-
-  // map graphics resources
-  cuda_err = cudaGraphicsMapResources(1,&interop->cgr0,stream);
-
-  return cuda_err;
+  return interop->ca[interop->index];
 }
 
-cudaError_t
-pxl_interop_get(struct pxl_interop* const interop, cudaArray_t* cuda_array)
+cudaStream_t
+pxl_interop_stream_get(struct pxl_interop* const interop)
 {
-  cudaError_t cuda_err;
-
-  // get a CUDA Array
-  cuda_err = cudaGraphicsSubResourceGetMappedArray(cuda_array,interop->cgr0,0,0);
-
-  return cuda_err;
+  return interop->stream;
 }
 
-cudaError_t
-pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
-{
-  cudaError_t cuda_err;
-
-  cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr0,stream);
+//
+//
+//
 
-  return cuda_err;
+void
+pxl_interop_swap(struct pxl_interop* const interop)
+{
+  interop->index = (interop->index + 1) % interop->count;
 }
-  
+
 //
 //
 //
@@ -169,8 +201,9 @@ pxl_interop_clear(struct pxl_interop* const interop)
   glClearNamedFramebufferuiv(interop->fb0,GL_COLOR,0,clear_color);
   */
 
-  const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
-  glInvalidateNamedFramebufferData(interop->fb0,1,attachments);
+  static const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
+
+  glInvalidateNamedFramebufferData(interop->fb[interop->index],1,attachments);
 }
 
 //
@@ -180,7 +213,7 @@ pxl_interop_clear(struct pxl_interop* const interop)
 void
 pxl_interop_blit(struct pxl_interop* const interop)
 {
-  glBlitNamedFramebuffer(interop->fb0,0,
+  glBlitNamedFramebuffer(interop->fb[interop->index],0,
                          0,0,              interop->width,interop->height,
                          0,interop->height,interop->width,0,
                          GL_COLOR_BUFFER_BIT,

diff --git a/interop.h b/interop.h
@@ -16,7 +16,7 @@
 //
 
 struct pxl_interop*
-pxl_interop_create();
+pxl_interop_create(const int fbo_count);
 
 void
 pxl_interop_destroy(struct pxl_interop* const interop);
@@ -26,28 +26,28 @@ pxl_interop_destroy(struct pxl_interop* const interop);
 //
 
 cudaError_t
-pxl_interop_resize(struct pxl_interop* const interop, const int width, const int height);
+pxl_interop_size_set(struct pxl_interop* const interop, const int width, const int height);
 
 void
-pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* const height);
+pxl_interop_size_get(struct pxl_interop* const interop, int* const width, int* const height);
 
 //
 //
 //
 
-cudaError_t
-pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream);
-
-cudaError_t
-pxl_interop_get(struct pxl_interop* const interop, cudaArray_t* cuda_array);
+cudaStream_t
+pxl_interop_stream_get(struct pxl_interop* const interop);
 
-cudaError_t
-pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);
+cudaArray_const_t
+pxl_interop_array_get(struct pxl_interop* const interop);
 
 //
 //
 //
 
+void
+pxl_interop_swap(struct pxl_interop* const interop);
+
 void
 pxl_interop_clear(struct pxl_interop* const interop);
 

diff --git a/main.c b/main.c
@@ -145,7 +145,7 @@ pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height)
   // get context
   struct pxl_interop* const interop = glfwGetWindowUserPointer(window);
 
-  pxl_interop_resize(interop,width,height);
+  pxl_interop_size_set(interop,width,height);
 }
 
 //
@@ -206,7 +206,7 @@ main(int argc, char* argv[])
   // CREATE INTEROP
   //
 
-  struct pxl_interop* const interop = pxl_interop_create();
+  struct pxl_interop* const interop = pxl_interop_create(2);
 
   //
   // RESIZE INTEROP
@@ -218,7 +218,7 @@ main(int argc, char* argv[])
   glfwGetFramebufferSize(window,&width,&height);
 
   // resize with initial window dimensions
-  cuda_err = pxl_interop_resize(interop,width,height);
+  cuda_err = pxl_interop_size_set(interop,width,height);
 
   //
   // SET USER POINTER AND CALLBACKS
@@ -244,20 +244,15 @@ main(int argc, char* argv[])
       // EXECUTE CUDA KERNEL ON RENDER BUFFER
       //
 
-      int         width,height;
-      cudaArray_t cuda_array;
+      int width,height;
 
-      pxl_interop_get_size(interop,&width,&height);
+      pxl_interop_size_get(interop,&width,&height);
 
-      cuda_err = pxl_interop_map(interop,stream);
+      cuda_err = pxl_kernel_launcher(pxl_interop_array_get(interop),
+                                     width,height,
+                                     pxl_interop_stream_get(interop));
 
-      cuda_err = pxl_interop_get(interop,&cuda_array);
-
-      cuda_err = pxl_kernel_launcher(cuda_array,width,height,stream);
-
-      cuda_err = pxl_interop_unmap(interop,stream);
-
-      cuda_err = cudaStreamSynchronize(stream);
+      // cuda_err = cudaStreamSynchronize(stream);
 
       //
       // BLIT
@@ -269,6 +264,8 @@ main(int argc, char* argv[])
       // SWAP
       //
 
+      pxl_interop_swap(interop);
+
       glfwSwapBuffers(window);
 
       //

diff --git a/main.c b/main.c
@@ -95,8 +95,6 @@ pxl_glfw_init(GLFWwindow** window, const int width, const int height)
   if (!glfwInit())
     exit(EXIT_FAILURE);
 
-  const GLFWvidmode* mode = glfwGetVideoMode(glfwGetPrimaryMonitor());
-
   glfwWindowHint(GLFW_DEPTH_BITS,            0);
   glfwWindowHint(GLFW_STENCIL_BITS,          0);
 
@@ -107,7 +105,13 @@ pxl_glfw_init(GLFWwindow** window, const int width, const int height)
 
   glfwWindowHint(GLFW_OPENGL_PROFILE,        GLFW_OPENGL_CORE_PROFILE);
 
+#ifdef PXL_FULLSCREEN
+  GLFWmonitor*       monitor = glfwGetPrimaryMonitor();
+  const GLFWvidmode* mode    = glfwGetVideoMode(monitor);
+  *window                    = glfwCreateWindow(mode->width,mode->height,"GLFW / CUDA Interop",monitor,NULL);
+#else
   *window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL);
+#endif
 
   if (*window == NULL)
     {

diff --git a/kernel.cu b/kernel.cu
@@ -29,13 +29,14 @@ union pxl_rgbx_24
 extern "C"
 __global__
 void
-pxl_kernel(const int width)
+pxl_kernel(const int width, const int height)
 {
   // pixel coordinates
   const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
   const int x   = idx % width;
   const int y   = idx / width;
 
+#if 1
   // pixel color
   const int          t    = (unsigned int)clock() / 1250000; // 1.25 GHz
   const int          xt   = (idx + t) % width;
@@ -49,12 +50,18 @@ pxl_kernel(const int width)
   rgbx.b  = (bar == 0) || (bar == 3) ? ramp : 0;
   rgbx.na = 255;
 
-  // cudaBoundaryModeZero squelches out-of-bound writes
+#else // DRAW A RED BORDER TO VALIDATE FLIPPED BLIT
+
+  const bool        border = (x == 0) || (x == width-1) || (y == 0) || (y == height-1);
+  union pxl_rgbx_24 rgbx   = { border ? 0xFF0000FF : 0xFF000000 };
+  
+#endif
+
   surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock()
-              surf,
-              x*sizeof(rgbx),
-              y,
-              cudaBoundaryModeZero);
+    surf,
+    x*sizeof(rgbx),
+    y,
+    cudaBoundaryModeZero); // squelches out-of-bound writes
 }
 
 //
@@ -78,7 +85,7 @@ pxl_kernel_launcher(cudaArray_const_t array,
   const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;
 
   if (blocks > 0)
-    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width);
+    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width,height);
 
   return cudaSuccess;
 }

diff --git a/interop.c b/interop.c
@@ -6,6 +6,7 @@
 #include <glad/glad.h>
 #include <GLFW/glfw3.h>
 #include <cuda_gl_interop.h>
+#include <stdlib.h>
 
 //
 //

diff --git a/interop.c b/interop.c
@@ -3,13 +3,15 @@
 //
 //
 
-#include "interop.h"
+#include <glad/glad.h>
+#include <GLFW/glfw3.h>
+#include <cuda_gl_interop.h>
 
 //
 //
 //
 
-#include <cuda_gl_interop.h>
+#include "interop.h"
 
 //
 //

diff --git a/interop.h b/interop.h
@@ -9,8 +9,6 @@
 //
 //
 
-#include <glad/glad.h>
-#include <GLFW/glfw3.h>
 #include <cuda_runtime.h>
 
 //

diff --git a/interop.c b/interop.c
@@ -3,13 +3,13 @@
 //
 //
 
-#include <stdlib.h>
+#include "interop.h"
 
 //
 //
 //
 
-#include "interop.h"
+#include <cuda_gl_interop.h>
 
 //
 //

diff --git a/interop.h b/interop.h
@@ -11,13 +11,7 @@
 
 #include <glad/glad.h>
 #include <GLFW/glfw3.h>
-
-//
-//
-//
-
 #include <cuda_runtime.h>
-#include <cuda_gl_interop.h>
 
 //
 //

diff --git a/main.c b/main.c
@@ -16,7 +16,6 @@
 //
 //
 
-#include <cuda_runtime.h>
 #include <cuda_gl_interop.h>
 
 //

diff --git a/interop.c b/interop.c
@@ -34,18 +34,21 @@ struct pxl_interop
 //
 
 struct pxl_interop*
-pxl_interop_create(GLFWwindow* window)
+pxl_interop_create()
 {
   struct pxl_interop* const interop = (struct pxl_interop*)malloc(sizeof(struct pxl_interop));
 
   // init cuda graphics resource
   interop->cgr0 = NULL;
 
   // render buffer object w/a color buffer
-  glGenRenderbuffers(1,&interop->rb0);
+  glCreateRenderbuffers(1,&interop->rb0);
 
   // frame buffer object
-  glGenFramebuffers(1,&interop->fb0);
+  glCreateFramebuffers(1,&interop->fb0);
+
+  // attach rb0 to the fb0
+  glNamedFramebufferRenderbuffer(interop->fb0,GL_COLOR_ATTACHMENT0,GL_RENDERBUFFER,interop->rb0);
 
   // return it
   return interop;
@@ -55,6 +58,17 @@ pxl_interop_create(GLFWwindow* window)
 void
 pxl_interop_destroy(struct pxl_interop* const interop)
 {
+  // unregister CUDA resource
+  if (interop->cgr0 != NULL)
+    cudaGraphicsUnregisterResource(interop->cgr0);
+
+  // render buffer object w/a color buffer
+  glDeleteRenderbuffers(1,&interop->rb0);
+
+  // frame buffer object
+  glDeleteFramebuffers(1,&interop->fb0);
+
+  // free interop
   free(interop);
 }
 
@@ -69,26 +83,9 @@ pxl_interop_resize(struct pxl_interop* const interop, const int width, const int
   interop->width  = width;
   interop->height = height;
 
-  //
-  // RESIZE FBO'S COLOR BUFFER
-  //
-
-  // bind rbo
-  glBindRenderbuffer   (GL_RENDERBUFFER,interop->rb0);
   // resize color buffer
-  glRenderbufferStorage(GL_RENDERBUFFER,GL_RGBA8,width,height);
-
-  // bind fbo to read
-  glBindFramebuffer        (GL_FRAMEBUFFER,interop->fb0);
-  // attach rb0 to the fb0
-  glFramebufferRenderbuffer(GL_FRAMEBUFFER,GL_COLOR_ATTACHMENT0,GL_RENDERBUFFER,interop->rb0);
-
-  // unbind rbo
-  glBindRenderbuffer(GL_RENDERBUFFER,0);
+  glNamedRenderbufferStorage(interop->rb0,GL_RGBA8,width,height);
 
-  // bind to default fb
-  glBindFramebuffer(GL_FRAMEBUFFER,0);
-
   //
   // REGISTER RBO WITH CUDA
   //
@@ -180,13 +177,6 @@ pxl_interop_clear(struct pxl_interop* const interop)
 void
 pxl_interop_blit(struct pxl_interop* const interop)
 {
-  /*
-  glBlitFramebuffer(0,0,              interop->width,interop->height,
-                    0,interop->height,interop->width,0,
-                    GL_COLOR_BUFFER_BIT,
-                    GL_NEAREST);
-  */
-
   glBlitNamedFramebuffer(interop->fb0,0,
                          0,0,              interop->width,interop->height,
                          0,interop->height,interop->width,0,

diff --git a/main.c b/main.c
@@ -124,11 +124,11 @@ pxl_glfw_init(GLFWwindow** window, const int width, const int height)
   // ignore vsync for now
   glfwSwapInterval(0);
 
+  // only copy r/g/b
+  glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
+
   // enable SRGB 
   // glEnable(GL_FRAMEBUFFER_SRGB);
-
-  // only copy r/g/b
-  // glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
 }
 
 //
@@ -200,22 +200,13 @@ main(int argc, char* argv[])
   cuda_err = cudaStreamCreate(&stream);
 
   //
-  // CREATE AND SAVE INTEROP INSTANCE
+  // CREATE INTEROP
   //
 
-  struct pxl_interop* const interop = pxl_interop_create(window);
-
-  glfwSetWindowUserPointer(window,interop);
+  struct pxl_interop* const interop = pxl_interop_create();
 
   //
-  // SET CALLBACKS
-  //
-
-  glfwSetKeyCallback            (window,pxl_glfw_key_callback);
-  glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);
-
-  //
-  // GET ACTUAL WINDOW SIZE
+  // RESIZE INTEROP
   //
 
   int width, height;
@@ -226,6 +217,14 @@ main(int argc, char* argv[])
   // resize with initial window dimensions
   cuda_err = pxl_interop_resize(interop,width,height);
 
+  //
+  // SET USER POINTER AND CALLBACKS
+  //
+
+  glfwSetWindowUserPointer      (window,interop);
+  glfwSetKeyCallback            (window,pxl_glfw_key_callback);
+  glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);
+
   //
   // LOOP UNTIL DONE
   //

diff --git a/interop.h b/interop.h
@@ -1,8 +1,7 @@
 
-/*
- * Copyright 2015 Allan MacKinnon.  All rights reserved.
- *
- */
+//
+//
+//
 
 #pragma once
 

diff --git a/interop.c b/interop.c
@@ -123,13 +123,21 @@ pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* c
 //
 
 cudaError_t
-pxl_interop_map(struct pxl_interop* const interop, cudaArray_t* cuda_array, cudaStream_t stream)
+pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream)
 {
   cudaError_t cuda_err;
 
   // map graphics resources
   cuda_err = cudaGraphicsMapResources(1,&interop->cgr0,stream);
 
+  return cuda_err;
+}
+
+cudaError_t
+pxl_interop_get(struct pxl_interop* const interop, cudaArray_t* cuda_array)
+{
+  cudaError_t cuda_err;
+
   // get a CUDA Array
   cuda_err = cudaGraphicsSubResourceGetMappedArray(cuda_array,interop->cgr0,0,0);
 
@@ -188,4 +196,4 @@ pxl_interop_blit(struct pxl_interop* const interop)
 
 //
 //
-//
+//
diff --git a/interop.h b/interop.h
@@ -45,7 +45,10 @@ pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* c
 //
 
 cudaError_t
-pxl_interop_map(struct pxl_interop* const interop, cudaArray_t* cuda_array, cudaStream_t stream);
+pxl_interop_map(struct pxl_interop* const interop, cudaStream_t stream);
+
+cudaError_t
+pxl_interop_get(struct pxl_interop* const interop, cudaArray_t* cuda_array);
 
 cudaError_t
 pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);
@@ -62,4 +65,4 @@ pxl_interop_blit(struct pxl_interop* const interop);
 
 //
 //
-//
+//
diff --git a/main.c b/main.c
@@ -247,7 +247,9 @@ main(int argc, char* argv[])
 
       pxl_interop_get_size(interop,&width,&height);
 
-      cuda_err = pxl_interop_map(interop,&cuda_array,stream);
+      cuda_err = pxl_interop_map(interop,stream);
+
+      cuda_err = pxl_interop_get(interop,&cuda_array);
 
       cuda_err = pxl_kernel_launcher(cuda_array,width,height,stream);
 

diff --git a/kernel.cu b/kernel.cu
@@ -61,7 +61,7 @@ pxl_kernel(const int width)
 //
 //
 
-#define PXL_KERNEL_THREADS_PER_BLOCK  64
+#define PXL_KERNEL_THREADS_PER_BLOCK  256 // enough for 4Kx2 monitor
 
 extern "C"
 cudaError_t
@@ -77,7 +77,8 @@ pxl_kernel_launcher(cudaArray_const_t array,
 
   const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;
 
-  pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width);
+  if (blocks > 0)
+    pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width);
 
   return cudaSuccess;
 }

diff --git a/main.c b/main.c
@@ -167,13 +167,6 @@ main(int argc, char* argv[])
 
   pxl_glfw_init(&window,1024,1024);
 
-  //
-  // SET CALLBACKS
-  //
-
-  glfwSetKeyCallback            (window,pxl_glfw_key_callback);
-  glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);
-
   //
   // INIT CUDA
   //
@@ -214,6 +207,13 @@ main(int argc, char* argv[])
 
   glfwSetWindowUserPointer(window,interop);
 
+  //
+  // SET CALLBACKS
+  //
+
+  glfwSetKeyCallback            (window,pxl_glfw_key_callback);
+  glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);
+
   //
   // GET ACTUAL WINDOW SIZE
   //

diff --git a/kernel.cu b/kernel.cu
@@ -26,6 +26,7 @@ union pxl_rgbx_24
 //
 //
 
+extern "C"
 __global__
 void
 pxl_kernel(const int width)
@@ -36,8 +37,10 @@ pxl_kernel(const int width)
   const int y   = idx / width;
 
   // pixel color
-  const unsigned int ramp = (unsigned int)(((float)x / (float)(width-1)) * 255.0f + 0.5f);
-  const unsigned int bar  = (y / 32) & 3;
+  const int          t    = (unsigned int)clock() / 1250000; // 1.25 GHz
+  const int          xt   = (idx + t) % width;
+  const unsigned int ramp = (unsigned int)(((float)xt / (float)(width-1)) * 255.0f + 0.5f);
+  const unsigned int bar  = ((y + t) / 32) & 3;
 
   union pxl_rgbx_24  rgbx;
 
@@ -81,4 +84,4 @@ pxl_kernel_launcher(cudaArray_const_t array,
 
 //
 //
-//
+//
diff --git a/main.c b/main.c
@@ -125,10 +125,10 @@ pxl_glfw_init(GLFWwindow** window, const int width, const int height)
   glfwSwapInterval(0);
 
   // enable SRGB 
-  glEnable(GL_FRAMEBUFFER_SRGB);
+  // glEnable(GL_FRAMEBUFFER_SRGB);
 
   // only copy r/g/b
-  glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
+  // glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
 }
 
 //
@@ -293,4 +293,4 @@ main(int argc, char* argv[])
 
 //
 //
-//
+//
diff --git a/interop.c b/interop.c
@@ -0,0 +1,191 @@
+
+//
+//
+//
+
+#include <stdlib.h>
+
+//
+//
+//
+
+#include "interop.h"
+
+//
+//
+//
+
+struct pxl_interop
+{
+  // w x h
+  int                    width;
+  int                    height;
+
+  // GL buffers
+  GLuint                 fb0;
+  GLuint                 rb0;
+
+  // CUDA resources
+  cudaGraphicsResource_t cgr0;
+};
+
+//
+//
+//
+
+struct pxl_interop*
+pxl_interop_create(GLFWwindow* window)
+{
+  struct pxl_interop* const interop = (struct pxl_interop*)malloc(sizeof(struct pxl_interop));
+
+  // init cuda graphics resource
+  interop->cgr0 = NULL;
+
+  // render buffer object w/a color buffer
+  glGenRenderbuffers(1,&interop->rb0);
+
+  // frame buffer object
+  glGenFramebuffers(1,&interop->fb0);
+
+  // return it
+  return interop;
+}
+
+
+void
+pxl_interop_destroy(struct pxl_interop* const interop)
+{
+  free(interop);
+}
+
+//
+//
+//
+
+cudaError_t
+pxl_interop_resize(struct pxl_interop* const interop, const int width, const int height)
+{
+  // save new size
+  interop->width  = width;
+  interop->height = height;
+
+  //
+  // RESIZE FBO'S COLOR BUFFER
+  //
+
+  // bind rbo
+  glBindRenderbuffer   (GL_RENDERBUFFER,interop->rb0);
+  // resize color buffer
+  glRenderbufferStorage(GL_RENDERBUFFER,GL_RGBA8,width,height);
+
+  // bind fbo to read
+  glBindFramebuffer        (GL_FRAMEBUFFER,interop->fb0);
+  // attach rb0 to the fb0
+  glFramebufferRenderbuffer(GL_FRAMEBUFFER,GL_COLOR_ATTACHMENT0,GL_RENDERBUFFER,interop->rb0);
+
+  // unbind rbo
+  glBindRenderbuffer(GL_RENDERBUFFER,0);
+
+  // bind to default fb
+  glBindFramebuffer(GL_FRAMEBUFFER,0);
+
+  //
+  // REGISTER RBO WITH CUDA
+  //
+
+  cudaError_t cuda_err;
+
+  // unregister
+  if (interop->cgr0 != NULL)
+    cuda_err = cudaGraphicsUnregisterResource(interop->cgr0);
+
+  // register image
+  cuda_err = cudaGraphicsGLRegisterImage(&interop->cgr0,
+                                         interop->rb0,
+                                         GL_RENDERBUFFER,
+                                         cudaGraphicsRegisterFlagsSurfaceLoadStore | cudaGraphicsRegisterFlagsWriteDiscard);
+
+  // diddle some flags
+  cuda_err = cudaGraphicsResourceSetMapFlags(interop->cgr0,cudaGraphicsMapFlagsWriteDiscard);
+
+  return cuda_err;
+}
+
+void
+pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* const height)
+{
+  *width  = interop->width;
+  *height = interop->height;
+}
+
+//
+//
+//
+
+cudaError_t
+pxl_interop_map(struct pxl_interop* const interop, cudaArray_t* cuda_array, cudaStream_t stream)
+{
+  cudaError_t cuda_err;
+
+  // map graphics resources
+  cuda_err = cudaGraphicsMapResources(1,&interop->cgr0,stream);
+
+  // get a CUDA Array
+  cuda_err = cudaGraphicsSubResourceGetMappedArray(cuda_array,interop->cgr0,0,0);
+
+  return cuda_err;
+}
+
+cudaError_t
+pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream)
+{
+  cudaError_t cuda_err;
+
+  cuda_err = cudaGraphicsUnmapResources(1,&interop->cgr0,stream);
+
+  return cuda_err;
+}
+
+//
+//
+//
+
+void
+pxl_interop_clear(struct pxl_interop* const interop)
+{
+  /*
+  const GLenum draw_buffer[] = { GL_COLOR_ATTACHMENT0 };
+  const GLuint clear_color[] = { 255, 0, 0, 255 };
+                       
+  glNamedFramebufferDrawBuffers(interop->fb0,1,draw_buffer);
+  glClearNamedFramebufferuiv(interop->fb0,GL_COLOR,0,clear_color);
+  */
+
+  const GLenum attachments[] = { GL_COLOR_ATTACHMENT0 };
+  glInvalidateNamedFramebufferData(interop->fb0,1,attachments);
+}
+
+//
+//
+//
+
+void
+pxl_interop_blit(struct pxl_interop* const interop)
+{
+  /*
+  glBlitFramebuffer(0,0,              interop->width,interop->height,
+                    0,interop->height,interop->width,0,
+                    GL_COLOR_BUFFER_BIT,
+                    GL_NEAREST);
+  */
+
+  glBlitNamedFramebuffer(interop->fb0,0,
+                         0,0,              interop->width,interop->height,
+                         0,interop->height,interop->width,0,
+                         GL_COLOR_BUFFER_BIT,
+                         GL_NEAREST);
+}
+
+//
+//
+//
diff --git a/interop.h b/interop.h
@@ -0,0 +1,65 @@
+
+/*
+ * Copyright 2015 Allan MacKinnon.  All rights reserved.
+ *
+ */
+
+#pragma once
+
+//
+//
+//
+
+#include <glad/glad.h>
+#include <GLFW/glfw3.h>
+
+//
+//
+//
+
+#include <cuda_runtime.h>
+#include <cuda_gl_interop.h>
+
+//
+//
+//
+
+struct pxl_interop*
+pxl_interop_create();
+
+void
+pxl_interop_destroy(struct pxl_interop* const interop);
+
+//
+//
+//
+
+cudaError_t
+pxl_interop_resize(struct pxl_interop* const interop, const int width, const int height);
+
+void
+pxl_interop_get_size(struct pxl_interop* const interop, int* const width, int* const height);
+
+//
+//
+//
+
+cudaError_t
+pxl_interop_map(struct pxl_interop* const interop, cudaArray_t* cuda_array, cudaStream_t stream);
+
+cudaError_t
+pxl_interop_unmap(struct pxl_interop* const interop, cudaStream_t stream);
+
+//
+//
+//
+
+void
+pxl_interop_clear(struct pxl_interop* const interop);
+
+void
+pxl_interop_blit(struct pxl_interop* const interop);
+
+//
+//
+//
diff --git a/kernel.cu b/kernel.cu
@@ -0,0 +1,84 @@
+// -*- compile-command: "nvcc -m 32 -arch sm_30 -Xptxas=-v -cubin kernel.cu"; -*-
+
+//
+//
+//
+
+surface<void,cudaSurfaceType2D> surf;
+
+//
+//
+//
+
+union pxl_rgbx_24
+{
+  uint1       b32;
+
+  struct {
+    unsigned  r  : 8;
+    unsigned  g  : 8;
+    unsigned  b  : 8;
+    unsigned  na : 8;
+  };
+};
+
+//
+//
+//
+
+__global__
+void
+pxl_kernel(const int width)
+{
+  // pixel coordinates
+  const int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
+  const int x   = idx % width;
+  const int y   = idx / width;
+
+  // pixel color
+  const unsigned int ramp = (unsigned int)(((float)x / (float)(width-1)) * 255.0f + 0.5f);
+  const unsigned int bar  = (y / 32) & 3;
+
+  union pxl_rgbx_24  rgbx;
+
+  rgbx.r  = (bar == 0) || (bar == 1) ? ramp : 0;
+  rgbx.g  = (bar == 0) || (bar == 2) ? ramp : 0;
+  rgbx.b  = (bar == 0) || (bar == 3) ? ramp : 0;
+  rgbx.na = 255;
+
+  // cudaBoundaryModeZero squelches out-of-bound writes
+  surf2Dwrite(rgbx.b32, // even simpler: (unsigned int)clock()
+              surf,
+              x*sizeof(rgbx),
+              y,
+              cudaBoundaryModeZero);
+}
+
+//
+//
+//
+
+#define PXL_KERNEL_THREADS_PER_BLOCK  64
+
+extern "C"
+cudaError_t
+pxl_kernel_launcher(cudaArray_const_t array,
+                    const int         width,
+                    const int         height,
+                    cudaStream_t      stream)
+{
+  cudaError_t cuda_err = cudaBindSurfaceToArray(surf,array);
+
+  if (cuda_err)
+    return cuda_err;
+
+  const int blocks = (width * height + PXL_KERNEL_THREADS_PER_BLOCK - 1) / PXL_KERNEL_THREADS_PER_BLOCK;
+
+  pxl_kernel<<<blocks,PXL_KERNEL_THREADS_PER_BLOCK,0,stream>>>(width);
+
+  return cudaSuccess;
+}
+
+//
+//
+//
diff --git a/main.c b/main.c
@@ -0,0 +1,296 @@
+//
+//
+//
+
+#include <glad/glad.h>
+#include <GLFW/glfw3.h>
+
+//
+//
+//
+
+#include <stdlib.h>
+#include <stdio.h>
+
+//
+//
+//
+
+#include <cuda_runtime.h>
+#include <cuda_gl_interop.h>
+
+//
+//
+//
+
+#include "interop.h"
+
+//
+// FPS COUNTER FROM HERE:
+//
+// http://antongerdelan.net/opengl/glcontext2.html
+//
+
+static
+void
+pxl_glfw_fps(GLFWwindow* window)
+{
+  // static fps counters
+  static double stamp_prev  = 0.0;
+  static int    frame_count = 0;
+
+  // locals
+  const double stamp_curr = glfwGetTime();
+  const double elapsed    = stamp_curr - stamp_prev;
+
+  if (elapsed > 0.5)
+    {
+      stamp_prev = stamp_curr;
+
+      const double fps = (double)frame_count / elapsed;
+
+      int  width, height;
+      char tmp[64];
+
+      glfwGetFramebufferSize(window,&width,&height);
+
+      sprintf_s(tmp,64,"(%u x %u) - FPS: %.2f",width,height,fps);
+
+      glfwSetWindowTitle(window,tmp);
+
+      frame_count = 0;
+    }
+
+  frame_count++;
+}
+
+//
+//
+//
+
+static
+void
+pxl_glfw_error_callback(int error, const char* description)
+{
+  fputs(description,stderr);
+}
+
+static
+void
+pxl_glfw_key_callback(GLFWwindow* window, int key, int scancode, int action, int mods)
+{
+  if (key == GLFW_KEY_ESCAPE && action == GLFW_PRESS)
+    glfwSetWindowShouldClose(window, GL_TRUE);
+}
+
+static
+void
+pxl_glfw_init(GLFWwindow** window, const int width, const int height)
+{
+  //
+  // INITIALIZE GLFW/GLAD
+  //
+
+  glfwSetErrorCallback(pxl_glfw_error_callback);
+
+  if (!glfwInit())
+    exit(EXIT_FAILURE);
+
+  const GLFWvidmode* mode = glfwGetVideoMode(glfwGetPrimaryMonitor());
+
+  glfwWindowHint(GLFW_DEPTH_BITS,            0);
+  glfwWindowHint(GLFW_STENCIL_BITS,          0);
+
+  glfwWindowHint(GLFW_SRGB_CAPABLE,          GL_TRUE);
+
+  glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
+  glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 5);
+
+  glfwWindowHint(GLFW_OPENGL_PROFILE,        GLFW_OPENGL_CORE_PROFILE);
+
+  *window = glfwCreateWindow(width,height,"GLFW / CUDA Interop",NULL,NULL);
+
+  if (*window == NULL)
+    {
+      glfwTerminate();
+      exit(EXIT_FAILURE);
+    }
+
+  glfwMakeContextCurrent(*window);
+
+  // set up GLAD
+  gladLoadGLLoader((GLADloadproc)glfwGetProcAddress);
+
+  // ignore vsync for now
+  glfwSwapInterval(0);
+
+  // enable SRGB 
+  glEnable(GL_FRAMEBUFFER_SRGB);
+
+  // only copy r/g/b
+  glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_FALSE);
+}
+
+//
+//
+//
+
+static
+void
+pxl_glfw_window_size_callback(GLFWwindow* window, int width, int height)
+{
+  // get context
+  struct pxl_interop* const interop = glfwGetWindowUserPointer(window);
+
+  pxl_interop_resize(interop,width,height);
+}
+
+//
+//
+//
+
+cudaError_t
+pxl_kernel_launcher(cudaArray_const_t array, const int width, const int height, cudaStream_t stream);
+
+//
+//
+//
+
+int
+main(int argc, char* argv[])
+{
+  //
+  // INIT GLFW
+  //
+
+  GLFWwindow* window;
+
+  pxl_glfw_init(&window,1024,1024);
+
+  //
+  // SET CALLBACKS
+  //
+
+  glfwSetKeyCallback            (window,pxl_glfw_key_callback);
+  glfwSetFramebufferSizeCallback(window,pxl_glfw_window_size_callback);
+
+  //
+  // INIT CUDA
+  //
+  cudaError_t cuda_err;
+
+  int gl_device_id,gl_device_count;
+
+  cuda_err = cudaGLGetDevices(&gl_device_count,&gl_device_id,1,cudaGLDeviceListAll);
+
+  int cuda_device_id = (argc > 1) ? atoi(argv[1]) : gl_device_id;
+
+  cuda_err = cudaSetDevice(cuda_device_id);
+
+  //
+  // INFO
+  //
+  struct cudaDeviceProp props;
+
+  cuda_err = cudaGetDeviceProperties(&props,gl_device_id);
+  printf("GL   : %-24s (%2d)\n",props.name,props.multiProcessorCount);
+
+  cuda_err = cudaGetDeviceProperties(&props,cuda_device_id);
+  printf("CUDA : %-24s (%2d)\n",props.name,props.multiProcessorCount);
+
+  //
+  // CREATE A CUDA STREAM
+  //
+
+  cudaStream_t stream;
+
+  cuda_err = cudaStreamCreate(&stream);
+
+  //
+  // CREATE AND SAVE INTEROP INSTANCE
+  //
+
+  struct pxl_interop* const interop = pxl_interop_create(window);
+
+  glfwSetWindowUserPointer(window,interop);
+
+  //
+  // GET ACTUAL WINDOW SIZE
+  //
+
+  int width, height;
+
+  // get initial width/height
+  glfwGetFramebufferSize(window,&width,&height);
+
+  // resize with initial window dimensions
+  cuda_err = pxl_interop_resize(interop,width,height);
+
+  //
+  // LOOP UNTIL DONE
+  //
+
+  while (!glfwWindowShouldClose(window))
+    {
+      //
+      // MONITOR FPS
+      //
+
+      pxl_glfw_fps(window);
+
+      //
+      // EXECUTE CUDA KERNEL ON RENDER BUFFER
+      //
+
+      int         width,height;
+      cudaArray_t cuda_array;
+
+      pxl_interop_get_size(interop,&width,&height);
+
+      cuda_err = pxl_interop_map(interop,&cuda_array,stream);
+
+      cuda_err = pxl_kernel_launcher(cuda_array,width,height,stream);
+
+      cuda_err = pxl_interop_unmap(interop,stream);
+
+      cuda_err = cudaStreamSynchronize(stream);
+
+      //
+      // BLIT
+      // 
+
+      pxl_interop_blit(interop);
+
+      //
+      // SWAP
+      //
+
+      glfwSwapBuffers(window);
+
+      //
+      // PUMP/POLL/WAIT
+      //
+
+      glfwPollEvents(); // glfwWaitEvents();
+    }
+
+  //
+  // CLEANUP
+  //
+
+  pxl_interop_destroy(interop);
+
+  glfwDestroyWindow(window);
+
+  glfwTerminate();
+
+  cudaDeviceReset();
+
+  // missing some clean up here
+
+  exit(EXIT_SUCCESS);
+}
+
+//
+//
+//
No results found