mfwhitman · August 29, 2015 13:57
diff --git a/UdaToneMap.cu b/UdaToneMap.cu
 /* Udacity Homework 3
   HDR Tone-mapping

  Background HDR
  ==============

  A High Definition Range (HDR) image contains a wider variation of intensity
  and color than is allowed by the RGB format with 1 byte per channel that we
  have used in the previous assignment.  

  To store this extra information we use single precision floating point for
  each channel.  This allows for an extremely wide range of intensity values.

  In the image for this assignment, the inside of church with light coming in
  through stained glass windows, the raw input floating point values for the
  channels range from 0 to 275.  But the mean is .41 and 98% of the values are
  less than 3!  This means that certain areas (the windows) are extremely bright
  compared to everywhere else.  If we linearly map this [0-275] range into the
  [0-255] range that we have been using then most values will be mapped to zero!
  The only thing we will be able to see are the very brightest areas - the
  windows - everything else will appear pitch black.

  The problem is that although we have cameras capable of recording the wide
  range of intensity that exists in the real world our monitors are not capable
  of displaying them.  Our eyes are also quite capable of observing a much wider
  range of intensities than our image formats / monitors are capable of
  displaying.

  Tone-mapping is a process that transforms the intensities in the image so that
  the brightest values aren't nearly so far away from the mean.  That way when
  we transform the values into [0-255] we can actually see the entire image.
  There are many ways to perform this process and it is as much an art as a
  science - there is no single "right" answer.  In this homework we will
  implement one possible technique.

  Background Chrominance-Luminance
  ================================

  The RGB space that we have been using to represent images can be thought of as
  one possible set of axes spanning a three dimensional space of color.  We
  sometimes choose other axes to represent this space because they make certain
  operations more convenient.

  Another possible way of representing a color image is to separate the color
  information (chromaticity) from the brightness information.  There are
  multiple different methods for doing this - a common one during the analog
  television days was known as Chrominance-Luminance or YUV.

  We choose to represent the image in this way so that we can remap only the
  intensity channel and then recombine the new intensity values with the color
  information to form the final image.

  Old TV signals used to be transmitted in this way so that black & white
  televisions could display the luminance channel while color televisions would
  display all three of the channels.
  

  Tone-mapping
  ============

  In this assignment we are going to transform the luminance channel (actually
  the log of the luminance, but this is unimportant for the parts of the
  algorithm that you will be implementing) by compressing its range to [0, 1].
  To do this we need the cumulative distribution of the luminance values.

  Example
  -------

  input : [2 4 3 3 1 7 4 5 7 0 9 4 3 2]
  min / max / range: 0 / 9 / 9

  histo with 3 bins: [4 7 3]

  cdf : [4 11 14]


  Your task is to calculate this cumulative distribution by following these
  steps.

 */

 #include "utils.h"
 #include <stdio.h>

 unsigned int nextPow2(unsigned int x)
 {
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
 }

 __global__ void minMaxOperation(float *d_min, float *d_max, const float *d_in, float *d_buffer, unsigned int len, unsigned int *d_retireCount)
 {
 	__shared__ bool isLast;
 	extern __shared__ float temp[];
 	float *t_min = (float*) temp;
 	float *t_max = (float*) &t_min[blockDim.x];
 	int gid = threadIdx.x + blockDim.x * blockIdx.x;
 	int tid = threadIdx.x;

 	//Populate shared memory
 	if (gid < len)
 	{
 		t_min[tid] = d_in[gid];
 		t_max[tid] = d_in[gid];
 	}
 	else
 	{
 		t_min[tid] = d_in[tid];
 		t_max[tid] = d_in[tid];
 	}

 	__threadfence_block();

 	//First phase of reduction, simultaneous min and max
 	for (unsigned int s = blockDim.x >> 1; s > 0; s >>= 1)
 	{
 		if (tid < s)
 		{
 			t_min[tid] = fminf(t_min[tid], t_min[tid+s]);
 			t_max[tid] = fmaxf(t_max[tid], t_max[tid+s]);
 		}
 		__threadfence_block();
 	}

 	//Each block places one element into global buffer
 	if (threadIdx.x == 0)
 	{
 		d_buffer[blockIdx.x] = t_min[0];
 		d_buffer[blockIdx.x + gridDim.x] = t_max[0];

 		//
 		unsigned int ticket = atomicInc(d_retireCount, gridDim.x);
 		isLast = (ticket == gridDim.x-1);
 		if (gridDim.x == 1) isLast = true;
 	}

 	__syncthreads();

 	//Last block gathers elements and does second phase reduction
 	if (isLast)
 	{
 		t_min[tid] = d_buffer[tid];

 		__threadfence_block();



 		for (unsigned int s = gridDim.x >> 1; s > 0; s >>= 1)
 		{
 			if (tid < s)
 				{
 				t_min[tid] = fminf(t_min[tid], t_min[tid+s]);
 				__threadfence_block();
 				}
 		}

 		if (threadIdx.x == 0)
 		{
 			d_min[0] = t_min[0];
 		}

 		t_max[tid] = d_buffer[tid + gridDim.x];

 		__threadfence_block();

 		for (unsigned int s = gridDim.x >> 1; s > 0; s >>= 1)
 		{
 			if (tid < s)
 				{
 				t_max[tid] = fmaxf(t_max[tid], t_max[tid+s]);
 				__threadfence_block();
 				}
 		}

 		if (threadIdx.x == 0)
 		{
 			d_max[0] = t_max[0];
 		}
 	}
 }


 __global__ void binPicker(unsigned int *d_out, const float *d_in, int len, float lumMin, float lumRange, int numBins)
 {
 	int globalId = threadIdx.x + blockDim.x * blockIdx.x;

 	if (globalId < len)
 	{
 		if (d_in[globalId] == lumMin)
 		{
 			d_out[globalId] = 0;
 		}
 		else
 		{
 		unsigned int temp = ((d_in[globalId] - lumMin) / lumRange) * numBins;
 		//if (temp >= 1024)
 		//	printf("Invalid value: %4.2f pos: %i bin: %u ", globalId, d_in[globalId], temp);
 		d_out[globalId] = --temp;
 		}
 	}
 }

 __global__ void histo_kernel_optimized5( unsigned int *buffer, int size, unsigned int *histo )
 {
   __shared__ unsigned int temp[1024];

    temp[threadIdx.x + 0] = 0;
    temp[threadIdx.x + 256] = 0;
    temp[threadIdx.x + 512] = 0;
    temp[threadIdx.x + 768] = 0;
    __syncthreads();

    int i = threadIdx.x + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    while (i < size)
    {
    	if (i < size)
    		{
    		//unsigned int bt = ;
    		atomicAdd( &(temp[buffer[i]]), 1);
    		}
        i += offset;
    }
    __syncthreads();

   atomicAdd( &(histo[threadIdx.x + 0]), temp[threadIdx.x + 0] );
   atomicAdd( &(histo[threadIdx.x + 256]), temp[threadIdx.x + 256] );
   atomicAdd( &(histo[threadIdx.x + 512]), temp[threadIdx.x + 512] );
   atomicAdd( &(histo[threadIdx.x + 768]), temp[threadIdx.x + 768] );

 }

 /* Blelloch Scan
 * Currently runs as a single-block, monolithic.
 * Need to find out-of-bounds memory access.
 */
 __global__ void BlellochScan(unsigned int *d_out, unsigned int *d_in, size_t len)
 {
 	extern __shared__ unsigned int btemp[];
 	int tid = threadIdx.x;
 	int offset = 1;

 	if (tid < len)
 	{
 		btemp[2*tid] = d_in[2*tid];
 		btemp[2*tid+1] = d_in[2*tid+1];
 	}

 	for (int d = len >> 1; d > 0; d >>= 1)
 	{
 		__syncthreads();
 		if (tid < d)
 		{
 			int ai = offset * (2 * tid + 1) - 1;
 			int bi = offset * (2 * tid + 2) - 1;
 			btemp[bi] += btemp[ai];
 		}
 		offset *= 2;
 	}

 	__syncthreads();

 	if (tid == 0)
 	{
 		btemp[len-1] = 0;
 	}

 	for (int d = 1; d < len; d *= 2)
 	{
 		offset >>= 1;
 		__syncthreads();
 		if (tid < d)
 		{
 			int ai = offset * (2 * tid + 1) - 1;
 			int bi = offset * (2 * tid + 2) - 1;
 			int t = btemp[ai];
 			btemp[ai] = btemp[bi];
 			btemp[bi] += t;
 		}
 	}

 	__syncthreads();

 	if (tid < len)
 	{
 		d_out[2*tid] = btemp[2*tid];
 		d_out[2*tid+1] = btemp[2*tid+1];
 	}
 }

 void your_histogram_and_prefixsum(const float* const d_logLuminance,
                                  unsigned int* const d_cdf,
                                  float &min_logLum,
                                  float &max_logLum,
                                  const size_t numRows,
                                  const size_t numCols,
                                  const size_t numBins)
 {
  //TODO
  /*Here are the steps you need to implement
    1) find the minimum and maximum value in the input logLuminance channel
       store in min_logLum and max_logLum
    2) subtract them to find the range
    3) generate a histogram of all the values in the logLuminance channel using
       the formula: bin = (lum[i] - lumMin) / lumRange * numBins
    4) Perform an exclusive scan (prefix sum) on the histogram to get
       the cumulative distribution of luminance values (this should go in the
       incoming d_cdf pointer which already has been allocated for you)       */

 	//Declare pointers for device storage
 	float *d_bothPhase,
 		  *d_minimum,
 		  *d_maximum;

 	unsigned int *d_hBins,
 				 *d_retireCount,
 				 *d_buffer;

 	float range_logLum;

 	unsigned int numVals = numRows * numCols;
 	unsigned int threadsPerBlock = 1024;
 	unsigned int numBlocks = nextPow2(numVals / threadsPerBlock);

 	size_t binSize = (2 * numBins) * sizeof(unsigned int);

 	checkCudaErrors(cudaMalloc((void**)&d_bothPhase, sizeof(float) * 8 * threadsPerBlock));
 	checkCudaErrors(cudaMalloc((void**)&d_retireCount, sizeof(int)));
 	checkCudaErrors(cudaMemset(d_retireCount, 0, sizeof(int)));
 	//checkCudaErrors(cudaMemset(d_bothPhase, 0, numBlocks));

 	checkCudaErrors(cudaMalloc((void**)&d_minimum, sizeof(float)));
 	checkCudaErrors(cudaMemset(d_minimum, 0, sizeof(float)));
 	checkCudaErrors(cudaMalloc((void**)&d_maximum, sizeof(float)));
 	checkCudaErrors(cudaMemset(d_maximum, 0, sizeof(float)));

 	checkCudaErrors(cudaMalloc((void**)&d_hBins, sizeof(unsigned int) * numBins));
 	checkCudaErrors(cudaMemset((void*)d_hBins, 0, sizeof(unsigned int) * numBins));
 	checkCudaErrors(cudaMalloc((void**)&d_buffer, sizeof(unsigned int) * numVals));
 	checkCudaErrors(cudaMemset((void*)d_buffer, 0, sizeof(unsigned int) * numVals));

 	cudaDeviceSynchronize();

 	//-------------------------------------------------------------------------
 	cudaError_t error = cudaGetLastError();

 	if(error != cudaSuccess)
 	{
 		// print the CUDA error message and exit
 		printf("Mallocs and Memsets completed\n");
 		printf("CUDA error: %s\n", cudaGetErrorString(error));
 		exit(-1);
 	}
 	//-------------------------------------------------------------------------

 	//std::cout << std::endl;
 	//std::cout << "Threads per Block: " << threadsPerBlock << std::endl;
 	//std::cout << "Blocks: " << numBlocks << std::endl;
 	//std::cout << "numBins: " << numBins << std::endl;

 	minMaxOperation<<<numBlocks,threadsPerBlock, sizeof(float) * threadsPerBlock * 2>>>(d_minimum, d_maximum, d_logLuminance, d_bothPhase, numVals, d_retireCount);

 	cudaDeviceSynchronize();

 	//-------------------------------------------------------------------------
 	error = cudaGetLastError();

 	if(error != cudaSuccess)
 	{
 		// print the CUDA error message and exit
 		printf("Reduction stage completed\n");
 		printf("CUDA error: %s\n", cudaGetErrorString(error));
 		exit(-1);
 	}
 	//-------------------------------------------------------------------------

 	checkCudaErrors(cudaMemcpy((void*)&min_logLum, d_minimum, sizeof(float), cudaMemcpyDeviceToHost));
 	checkCudaErrors(cudaMemcpy((void*)&max_logLum, d_maximum, sizeof(float), cudaMemcpyDeviceToHost));

 	cudaDeviceSynchronize();

 	range_logLum = max_logLum - min_logLum;

 	//printf ("Min: %4.2f Max: %4.2f Range: %4.2f \n", min_logLum, max_logLum, range_logLum);

 	//std::cout << "Histogram: " << numVals << " Elements per " << (threadsPerBlock*numBlocks) << " threads" << std::endl;

 	binPicker<<<numBlocks, threadsPerBlock>>>(d_buffer, d_logLuminance, numVals, min_logLum, range_logLum, numBins);

 	cudaDeviceSynchronize();

 	//-------------------------------------------------------------------------
 	error = cudaGetLastError();

 	if(error != cudaSuccess)
 	{
 		// print the CUDA error message and exit
 		printf("Bin Selection stage completed\n");
 		printf("CUDA error: %s\n", cudaGetErrorString(error));
 		exit(-1);
 	}
 	//-------------------------------------------------------------------------

 	histo_kernel_optimized5<<<1,256>>>(d_buffer, numVals, d_hBins);

 	cudaDeviceSynchronize();

 	//-------------------------------------------------------------------------
 	error = cudaGetLastError();

 	if(error != cudaSuccess)
 	{
 		// print the CUDA error message and exit
 		printf("Histogram stage completed\n");
 		printf("CUDA error: %s\n", cudaGetErrorString(error));
 		exit(-1);
 	}
 	//-------------------------------------------------------------------------

 	cudaDeviceSynchronize();

 	BlellochScan<<<1, numBins/2, binSize>>>(d_cdf, d_hBins, numBins);

 	//cudaFree after the reductions.
 	checkCudaErrors(cudaFree(d_hBins));
 	checkCudaErrors(cudaFree(d_buffer));
 	checkCudaErrors(cudaFree(d_bothPhase));
 	checkCudaErrors(cudaFree(d_minimum));
 	checkCudaErrors(cudaFree(d_maximum));

 }
	/* Udacity Homework 3
	HDR Tone-mapping

	Background HDR
	==============

	A High Definition Range (HDR) image contains a wider variation of intensity
	and color than is allowed by the RGB format with 1 byte per channel that we
	have used in the previous assignment.

	To store this extra information we use single precision floating point for
	each channel. This allows for an extremely wide range of intensity values.

	In the image for this assignment, the inside of church with light coming in
	through stained glass windows, the raw input floating point values for the
	channels range from 0 to 275. But the mean is .41 and 98% of the values are
	less than 3! This means that certain areas (the windows) are extremely bright
	compared to everywhere else. If we linearly map this [0-275] range into the
	[0-255] range that we have been using then most values will be mapped to zero!
	The only thing we will be able to see are the very brightest areas - the
	windows - everything else will appear pitch black.

	The problem is that although we have cameras capable of recording the wide
	range of intensity that exists in the real world our monitors are not capable
	of displaying them. Our eyes are also quite capable of observing a much wider
	range of intensities than our image formats / monitors are capable of
	displaying.

	Tone-mapping is a process that transforms the intensities in the image so that
	the brightest values aren't nearly so far away from the mean. That way when
	we transform the values into [0-255] we can actually see the entire image.
	There are many ways to perform this process and it is as much an art as a
	science - there is no single "right" answer. In this homework we will
	implement one possible technique.

	Background Chrominance-Luminance
	================================

	The RGB space that we have been using to represent images can be thought of as
	one possible set of axes spanning a three dimensional space of color. We
	sometimes choose other axes to represent this space because they make certain
	operations more convenient.

	Another possible way of representing a color image is to separate the color
	information (chromaticity) from the brightness information. There are
	multiple different methods for doing this - a common one during the analog
	television days was known as Chrominance-Luminance or YUV.

	We choose to represent the image in this way so that we can remap only the
	intensity channel and then recombine the new intensity values with the color
	information to form the final image.

	Old TV signals used to be transmitted in this way so that black & white
	televisions could display the luminance channel while color televisions would
	display all three of the channels.


	Tone-mapping
	============

	In this assignment we are going to transform the luminance channel (actually
	the log of the luminance, but this is unimportant for the parts of the
	algorithm that you will be implementing) by compressing its range to [0, 1].
	To do this we need the cumulative distribution of the luminance values.

	Example
	-------

	input : [2 4 3 3 1 7 4 5 7 0 9 4 3 2]
	min / max / range: 0 / 9 / 9

	histo with 3 bins: [4 7 3]

	cdf : [4 11 14]


	Your task is to calculate this cumulative distribution by following these
	steps.

	*/

	#include "utils.h"
	#include <stdio.h>

	unsigned int nextPow2(unsigned int x)
	{
	--x;
	x \|= x >> 1;
	x \|= x >> 2;
	x \|= x >> 4;
	x \|= x >> 8;
	x \|= x >> 16;
	return ++x;
	}

	__global__ void minMaxOperation(float d_min, float d_max, const float d_in, float d_buffer, unsigned int len, unsigned int *d_retireCount)
	{
	__shared__ bool isLast;
	extern __shared__ float temp[];
	float t_min = (float) temp;
	float t_max = (float) &t_min[blockDim.x];
	int gid = threadIdx.x + blockDim.x * blockIdx.x;
	int tid = threadIdx.x;

	//Populate shared memory
	if (gid < len)
	{
	t_min[tid] = d_in[gid];
	t_max[tid] = d_in[gid];
	}
	else
	{
	t_min[tid] = d_in[tid];
	t_max[tid] = d_in[tid];
	}

	__threadfence_block();

	//First phase of reduction, simultaneous min and max
	for (unsigned int s = blockDim.x >> 1; s > 0; s >>= 1)
	{
	if (tid < s)
	{
	t_min[tid] = fminf(t_min[tid], t_min[tid+s]);
	t_max[tid] = fmaxf(t_max[tid], t_max[tid+s]);
	}
	__threadfence_block();
	}

	//Each block places one element into global buffer
	if (threadIdx.x == 0)
	{
	d_buffer[blockIdx.x] = t_min[0];
	d_buffer[blockIdx.x + gridDim.x] = t_max[0];

	//
	unsigned int ticket = atomicInc(d_retireCount, gridDim.x);
	isLast = (ticket == gridDim.x-1);
	if (gridDim.x == 1) isLast = true;
	}

	__syncthreads();

	//Last block gathers elements and does second phase reduction
	if (isLast)
	{
	t_min[tid] = d_buffer[tid];

	__threadfence_block();



	for (unsigned int s = gridDim.x >> 1; s > 0; s >>= 1)
	{
	if (tid < s)
	{
	t_min[tid] = fminf(t_min[tid], t_min[tid+s]);
	__threadfence_block();
	}
	}

	if (threadIdx.x == 0)
	{
	d_min[0] = t_min[0];
	}

	t_max[tid] = d_buffer[tid + gridDim.x];

	__threadfence_block();

	for (unsigned int s = gridDim.x >> 1; s > 0; s >>= 1)
	{
	if (tid < s)
	{
	t_max[tid] = fmaxf(t_max[tid], t_max[tid+s]);
	__threadfence_block();
	}
	}

	if (threadIdx.x == 0)
	{
	d_max[0] = t_max[0];
	}
	}
	}


	__global__ void binPicker(unsigned int d_out, const float d_in, int len, float lumMin, float lumRange, int numBins)
	{
	int globalId = threadIdx.x + blockDim.x * blockIdx.x;

	if (globalId < len)
	{
	if (d_in[globalId] == lumMin)
	{
	d_out[globalId] = 0;
	}
	else
	{
	unsigned int temp = ((d_in[globalId] - lumMin) / lumRange) * numBins;
	//if (temp >= 1024)
	// printf("Invalid value: %4.2f pos: %i bin: %u ", globalId, d_in[globalId], temp);
	d_out[globalId] = --temp;
	}
	}
	}

	__global__ void histo_kernel_optimized5( unsigned int buffer, int size, unsigned int histo )
	{
	__shared__ unsigned int temp[1024];

	temp[threadIdx.x + 0] = 0;
	temp[threadIdx.x + 256] = 0;
	temp[threadIdx.x + 512] = 0;
	temp[threadIdx.x + 768] = 0;
	__syncthreads();

	int i = threadIdx.x + blockIdx.x * blockDim.x;
	int offset = blockDim.x * gridDim.x;
	while (i < size)
	{
	if (i < size)
	{
	//unsigned int bt = ;
	atomicAdd( &(temp[buffer[i]]), 1);
	}
	i += offset;
	}
	__syncthreads();

	atomicAdd( &(histo[threadIdx.x + 0]), temp[threadIdx.x + 0] );
	atomicAdd( &(histo[threadIdx.x + 256]), temp[threadIdx.x + 256] );
	atomicAdd( &(histo[threadIdx.x + 512]), temp[threadIdx.x + 512] );
	atomicAdd( &(histo[threadIdx.x + 768]), temp[threadIdx.x + 768] );

	}

	/* Blelloch Scan
	* Currently runs as a single-block, monolithic.
	* Need to find out-of-bounds memory access.
	*/
	__global__ void BlellochScan(unsigned int d_out, unsigned int d_in, size_t len)
	{
	extern __shared__ unsigned int btemp[];
	int tid = threadIdx.x;
	int offset = 1;

	if (tid < len)
	{
	btemp[2tid] = d_in[2tid];
	btemp[2tid+1] = d_in[2tid+1];
	}

	for (int d = len >> 1; d > 0; d >>= 1)
	{
	__syncthreads();
	if (tid < d)
	{
	int ai = offset * (2 * tid + 1) - 1;
	int bi = offset * (2 * tid + 2) - 1;
	btemp[bi] += btemp[ai];
	}
	offset *= 2;
	}

	__syncthreads();

	if (tid == 0)
	{
	btemp[len-1] = 0;
	}

	for (int d = 1; d < len; d *= 2)
	{
	offset >>= 1;
	__syncthreads();
	if (tid < d)
	{
	int ai = offset * (2 * tid + 1) - 1;
	int bi = offset * (2 * tid + 2) - 1;
	int t = btemp[ai];
	btemp[ai] = btemp[bi];
	btemp[bi] += t;
	}
	}

	__syncthreads();

	if (tid < len)
	{
	d_out[2tid] = btemp[2tid];
	d_out[2tid+1] = btemp[2tid+1];
	}
	}

	void your_histogram_and_prefixsum(const float* const d_logLuminance,
	unsigned int* const d_cdf,
	float &min_logLum,
	float &max_logLum,
	const size_t numRows,
	const size_t numCols,
	const size_t numBins)
	{
	//TODO
	/*Here are the steps you need to implement
	1) find the minimum and maximum value in the input logLuminance channel
	store in min_logLum and max_logLum
	2) subtract them to find the range
	3) generate a histogram of all the values in the logLuminance channel using
	the formula: bin = (lum[i] - lumMin) / lumRange * numBins
	4) Perform an exclusive scan (prefix sum) on the histogram to get
	the cumulative distribution of luminance values (this should go in the
	incoming d_cdf pointer which already has been allocated for you) */

	//Declare pointers for device storage
	float *d_bothPhase,
	*d_minimum,
	*d_maximum;

	unsigned int *d_hBins,
	*d_retireCount,
	*d_buffer;

	float range_logLum;

	unsigned int numVals = numRows * numCols;
	unsigned int threadsPerBlock = 1024;
	unsigned int numBlocks = nextPow2(numVals / threadsPerBlock);

	size_t binSize = (2 * numBins) * sizeof(unsigned int);

	checkCudaErrors(cudaMalloc((void*)&d_bothPhase, sizeof(float) 8 * threadsPerBlock));
	checkCudaErrors(cudaMalloc((void**)&d_retireCount, sizeof(int)));
	checkCudaErrors(cudaMemset(d_retireCount, 0, sizeof(int)));
	//checkCudaErrors(cudaMemset(d_bothPhase, 0, numBlocks));

	checkCudaErrors(cudaMalloc((void**)&d_minimum, sizeof(float)));
	checkCudaErrors(cudaMemset(d_minimum, 0, sizeof(float)));
	checkCudaErrors(cudaMalloc((void**)&d_maximum, sizeof(float)));
	checkCudaErrors(cudaMemset(d_maximum, 0, sizeof(float)));

	checkCudaErrors(cudaMalloc((void*)&d_hBins, sizeof(unsigned int) numBins));
	checkCudaErrors(cudaMemset((void)d_hBins, 0, sizeof(unsigned int) numBins));
	checkCudaErrors(cudaMalloc((void*)&d_buffer, sizeof(unsigned int) numVals));
	checkCudaErrors(cudaMemset((void)d_buffer, 0, sizeof(unsigned int) numVals));

	cudaDeviceSynchronize();

	//-------------------------------------------------------------------------
	cudaError_t error = cudaGetLastError();

	if(error != cudaSuccess)
	{
	// print the CUDA error message and exit
	printf("Mallocs and Memsets completed\n");
	printf("CUDA error: %s\n", cudaGetErrorString(error));
	exit(-1);
	}
	//-------------------------------------------------------------------------

	//std::cout << std::endl;
	//std::cout << "Threads per Block: " << threadsPerBlock << std::endl;
	//std::cout << "Blocks: " << numBlocks << std::endl;
	//std::cout << "numBins: " << numBins << std::endl;

	minMaxOperation<<<numBlocks,threadsPerBlock, sizeof(float) * threadsPerBlock * 2>>>(d_minimum, d_maximum, d_logLuminance, d_bothPhase, numVals, d_retireCount);

	cudaDeviceSynchronize();

	//-------------------------------------------------------------------------
	error = cudaGetLastError();

	if(error != cudaSuccess)
	{
	// print the CUDA error message and exit
	printf("Reduction stage completed\n");
	printf("CUDA error: %s\n", cudaGetErrorString(error));
	exit(-1);
	}
	//-------------------------------------------------------------------------

	checkCudaErrors(cudaMemcpy((void*)&min_logLum, d_minimum, sizeof(float), cudaMemcpyDeviceToHost));
	checkCudaErrors(cudaMemcpy((void*)&max_logLum, d_maximum, sizeof(float), cudaMemcpyDeviceToHost));

	cudaDeviceSynchronize();

	range_logLum = max_logLum - min_logLum;

	//printf ("Min: %4.2f Max: %4.2f Range: %4.2f \n", min_logLum, max_logLum, range_logLum);

	//std::cout << "Histogram: " << numVals << " Elements per " << (threadsPerBlock*numBlocks) << " threads" << std::endl;

	binPicker<<<numBlocks, threadsPerBlock>>>(d_buffer, d_logLuminance, numVals, min_logLum, range_logLum, numBins);

	cudaDeviceSynchronize();

	//-------------------------------------------------------------------------
	error = cudaGetLastError();

	if(error != cudaSuccess)
	{
	// print the CUDA error message and exit
	printf("Bin Selection stage completed\n");
	printf("CUDA error: %s\n", cudaGetErrorString(error));
	exit(-1);
	}
	//-------------------------------------------------------------------------

	histo_kernel_optimized5<<<1,256>>>(d_buffer, numVals, d_hBins);

	cudaDeviceSynchronize();

	//-------------------------------------------------------------------------
	error = cudaGetLastError();

	if(error != cudaSuccess)
	{
	// print the CUDA error message and exit
	printf("Histogram stage completed\n");
	printf("CUDA error: %s\n", cudaGetErrorString(error));
	exit(-1);
	}
	//-------------------------------------------------------------------------

	cudaDeviceSynchronize();

	BlellochScan<<<1, numBins/2, binSize>>>(d_cdf, d_hBins, numBins);

	//cudaFree after the reductions.
	checkCudaErrors(cudaFree(d_hBins));
	checkCudaErrors(cudaFree(d_buffer));
	checkCudaErrors(cudaFree(d_bothPhase));
	checkCudaErrors(cudaFree(d_minimum));
	checkCudaErrors(cudaFree(d_maximum));

	}
No results found