// histogram.cl

/*
 * Byte histogram with a fixed 256-bin, statically sized local buffer.
 *
 * Every int of in[0..N-1] contributes four counts, one per byte
 * (bits 0-7, 8-15, 16-23 and 24-31). Each work-group accumulates into its
 * own __local histogram and then adds that histogram into h.
 *
 * in : input words, at least N ints
 * N  : number of ints to read from in
 * h  : 256 global counters; the kernel only ever adds to them, it never
 *      clears them, so they must hold the desired starting values
 *      (presumably zero -- confirm against the host code).
 */
__kernel
void GPU_histogram_static (__global int *in, int N, __global int *h)
{
	const int BINS = 256;
	int gloID = get_global_id(0);
	int locID = get_local_id(0);
	int GRIDSIZE = get_global_size(0);
	int localSize = get_local_size(0);
	__local int localH[256]; // literal size: cuda-based OpenCL can't handle the const BINS here

	// clear the work-group's local bins; work-items share the job by
	// stepping through the bins localSize entries at a time
	for (int i = locID; i < BINS; i += localSize)
		localH[i] = 0;

	// every bin must be zeroed before any work-item starts counting
	barrier(CLK_LOCAL_MEM_FENCE);

	// count the four bytes of each input word into the local bins;
	// the masks keep every index inside 0..255
	for (int i = gloID; i < N; i += GRIDSIZE)
	{
		int temp = in[i];
		atomic_add (localH + (temp & 0xFF), 1);
		atomic_add (localH + ((temp >> 8) & 0xFF), 1);
		atomic_add (localH + ((temp >> 16) & 0xFF), 1);
		atomic_add (localH + ((temp >> 24) & 0xFF), 1);
	}

	// The loop below reads local bins that other work-items updated, so the
	// fence has to cover __local memory. A global-memory fence does not
	// guarantee that those local updates are visible here.
	barrier(CLK_LOCAL_MEM_FENCE);

	// fold the work-group's counts into the global bins; atomic because
	// other work-groups add to the same counters
	for (int i = locID; i < BINS; i += localSize)
		atomic_add (h + i, localH[i]);

}

//*****************************************************************
/*
 * Byte histogram using a caller-provided local buffer and bin count.
 *
 * Every int of in[0..N-1] contributes up to four counts, one per byte
 * (bits 0-7, 8-15, 16-23 and 24-31). Each work-group accumulates into
 * localH and then adds the first `bins` entries into h.
 *
 * in     : input words, at least N ints
 * N      : number of ints to read from in
 * h      : global counters, at least `bins` ints; the kernel only adds to
 *          them and never clears them
 * localH : per-work-group scratch buffer, at least `bins` ints
 * bins   : number of bins to clear, count into and flush. Byte values
 *          are 0..255, so bytes >= bins are ignored instead of being
 *          written outside the cleared part of localH.
 */
__kernel
void GPU_histogram_dynamic (__global int *in, int N, __global int *h, __local int* localH, int bins)
{
	int gloID = get_global_id(0);
	int locID = get_local_id(0);
	int GRIDSIZE = get_global_size(0);
	int localSize = get_local_size(0);

	// clear the local bins; work-items share the job by stepping through
	// the bins localSize entries at a time
	for (int i = locID; i < bins; i += localSize)
		localH[i] = 0;

	// every bin must be zeroed before any work-item starts counting
	barrier(CLK_LOCAL_MEM_FENCE);

	// count the four bytes of each input word, lowest byte first
	for (int i = gloID; i < N; i += GRIDSIZE)
	{
		int temp = in[i];
		for (int shift = 0; shift < 32; shift += 8)
		{
			int bin = (temp >> shift) & 0xFF;
			// only entries below `bins` are cleared above and flushed below;
			// skipping the rest avoids writing past that range when bins < 256
			if (bin < bins)
				atomic_add (localH + bin, 1);
		}
	}

	// The loop below reads local bins that other work-items updated, so the
	// fence has to cover __local memory. A global-memory fence does not
	// guarantee that those local updates are visible here.
	barrier(CLK_LOCAL_MEM_FENCE);

	// fold the work-group's counts into the global bins; atomic because
	// other work-groups add to the same counters
	for (int i = locID; i < bins; i += localSize)
		atomic_add (h + i, localH[i]);
}