// histogram.cl

/*
 * Byte-value histogram with a fixed, statically sized local scratch array.
 *
 * Every 32-bit element of `in` is split into its four bytes and each byte
 * value (0..255) increments the matching bin. Counts are first accumulated
 * per work-group in __local memory and then added to the global histogram.
 *
 *   in : input data, N ints (each int contributes four counts)
 *   N  : number of ints in `in`
 *   h  : global histogram with at least 256 bins; counts are ADDED to the
 *        existing contents (presumably zeroed by the host - confirm at the
 *        call site)
 */
__kernel void GPU_histogram_static (__global int *in, int N, __global int *h)
{
    const int BINS = 256;
    int gloID = get_global_id(0);
    int locID = get_local_id(0);
    int GRIDSIZE = get_global_size(0);
    int localSize = get_local_size(0);

    // Literal size must be kept in sync with BINS: CUDA-based OpenCL
    // compilers reject the const variable as an array bound.
    __local int localH[256];

    // Zero the local bins; the work-items of the group share the job.
    for (int i = locID; i < BINS; i += localSize) {
        localH[i] = 0;
    }

    // All bins must be zeroed before any work-item starts counting.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Process the input, striding by the total number of work-items.
    for (int i = gloID; i < N; i += GRIDSIZE) {
        int temp = in[i];
        atomic_add(localH + (temp & 0xFF), 1);
        atomic_add(localH + ((temp >> 8) & 0xFF), 1);
        atomic_add(localH + ((temp >> 16) & 0xFF), 1);
        atomic_add(localH + ((temp >> 24) & 0xFF), 1);
    }

    // Wait for the whole work-group to finish its local counting. The data
    // read below lives in __local memory, so the barrier has to order
    // local-memory accesses (CLK_LOCAL_MEM_FENCE); a global fence alone does
    // not guarantee that other work-items' localH updates are visible.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Merge this work-group's counts into the global bins atomically, since
    // other work-groups update the same locations.
    for (int i = locID; i < BINS; i += localSize) {
        atomic_add(h + i, localH[i]);
    }
}

//*****************************************************************

/*
 * Same histogram as GPU_histogram_static, but the local scratch array and
 * the bin count are supplied by the host.
 *
 *   in     : input data, N ints (each int contributes four counts)
 *   N      : number of ints in `in`
 *   h      : global histogram with at least `bins` entries; counts are ADDED
 *            to the existing contents
 *   localH : per-work-group scratch buffer in __local memory (replaces the
 *            static localH[256])
 *   bins   : number of bins to zero and to merge (replaces the constant 256)
 *
 * NOTE: bin indices are byte values, so localH is written at offsets 0..255
 * regardless of `bins`. localH must therefore hold at least 256 ints, and
 * with bins < 256 the upper bins are neither zeroed nor merged into `h`.
 */
__kernel void GPU_histogram_dynamic (__global int *in, int N, __global int *h, __local int* localH, int bins)
{
    int gloID = get_global_id(0);
    int locID = get_local_id(0);
    int GRIDSIZE = get_global_size(0);
    int localSize = get_local_size(0);

    // Zero the local bins; the work-items of the group share the job.
    for (int i = locID; i < bins; i += localSize) {
        localH[i] = 0;
    }

    // All bins must be zeroed before any work-item starts counting.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Process the input, striding by the total number of work-items.
    for (int i = gloID; i < N; i += GRIDSIZE) {
        int temp = in[i];
        atomic_add(localH + (temp & 0xFF), 1);
        atomic_add(localH + ((temp >> 8) & 0xFF), 1);
        atomic_add(localH + ((temp >> 16) & 0xFF), 1);
        atomic_add(localH + ((temp >> 24) & 0xFF), 1);
    }

    // Wait for the whole work-group to finish its local counting. The data
    // read below lives in __local memory, so the barrier has to order
    // local-memory accesses (CLK_LOCAL_MEM_FENCE); a global fence alone does
    // not guarantee that other work-items' localH updates are visible.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Merge this work-group's counts into the global bins atomically, since
    // other work-groups update the same locations.
    for (int i = locID; i < bins; i += localSize) {
        atomic_add(h + i, localH[i]);
    }
}