CUDA |
// DAXPY in CUDA (from file: daxpy.cu)
// Host-side driver for the CUDA DAXPY kernel (excerpt; the "…" lines stand for
// code that is not shown). It launches daxpy on the GPU buffers xGPU, yGPU and
// zGPU so that the kernel computes z[i] = a*x[i] + y[i] for i in [0, n).
//   a      - scalar multiplier, passed to the kernel by value
//   xHost  - host input array x (presumably the source for xGPU — confirm in the elided code)
//   yHost  - host input array y (presumably the source for yGPU — confirm in the elided code)
//   n      - element count; forwarded to the kernel, which uses it as its bounds check
//   zHost  - host output array (presumably filled from zGPU in the elided trailing code — confirm)
void doDAXPY(double a, const double* xHost, const double* yHost,
int n, double* zHost)
{
… (including a series of calls to create and initialize GPU buffers)
// The kernel handles one element per thread, so nblocks*threadsPerBlock must be
// at least n; surplus threads are masked off by the kernel's "i < n" test.
// nblocks and threadsPerBlock are defined in the elided code above
// (presumably nblocks = ceil(n / threadsPerBlock) — TODO confirm).
// NOTE(review): a kernel launch yields no return status; if the elided code
// does not already do so, check cudaGetLastError() right after the launch.
daxpy<<<nblocks, threadsPerBlock>>>(a, xGPU, yGPU, n, zGPU);
…
} | // DAXPY in CUDA (from file: daxpy.cu)
// DAXPY kernel: z[i] = a*x[i] + y[i] for every i in [0, n).
// Launch layout: 1-D grid of 1-D blocks (only the .x components are read),
// one thread per element. Threads whose global index is >= n do nothing, so
// the grid may overshoot n by any amount. No shared memory is used.
//   a        - scalar multiplier
//   x, y     - input arrays, dereferenced on the device; at least n elements each
//   n        - number of elements to process; n <= 0 makes the kernel a no-op
//   z        - output array, written on the device; at least n elements
__global__
void daxpy(double a, const double* x, const double* y,
int n, double* z)
{
    // Form the global index in 64 bits. blockIdx.x*blockDim.x is evaluated in
    // 32-bit unsigned arithmetic, and narrowing the result to int turns any
    // index >= 2^31 into a negative number that would pass "i < n" and write
    // before the start of z. Widening first keeps such threads out of range.
    const long long i = static_cast<long long>(blockIdx.x)*blockDim.x + threadIdx.x;
    if (i < n)
        z[i] = a*x[i] + y[i];
} |
OpenCL |
// DAXPY in OpenCL (from file: daxpy.c++)
// Host-side driver for the OpenCL DAXPY kernel (excerpt; the "…" lines stand
// for code that is not shown). It binds the kernel arguments and enqueues a
// 1-D NDRange so that the kernel computes z[i] = a*x[i] + y[i] for i in [0, n).
//   a      - scalar multiplier, bound by value as kernel argument 0
//   xHost  - host input array x (presumably the source for bufferX — confirm in the elided code)
//   yHost  - host input array y (presumably copied to a device buffer in the elided code — confirm)
//   n      - element count (bound as a kernel argument in the elided clSetKernelArg calls)
//   zHost  - host output array (presumably read back in the elided trailing code — confirm)
// NOTE(review): the status codes returned by clSetKernelArg and
// clEnqueueNDRangeKernel are not checked in this excerpt.
void doDAXPY(double a, const double* xHost, const double* yHost,
int n, double* zHost)
{
… (including a series of calls to create and initialize GPU buffers)
… (including a series of "clSetKernelArg" calls to bind parameters), e.g.:
// Argument indices follow the kernel's parameter order: 0 is "a", 1 is "x".
// Scalars are bound by value (size of the scalar); buffers are bound by
// passing the address of their cl_mem handle.
clSetKernelArg(kernel, 0, sizeof(double), &a);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferX);
// ... remainder of the clSetKernelArg calls ...
clEnqueueNDRangeKernel(cmdQueue,
kernel, // object created by clCreateKernel
workDimension, // here it is 1
globalWorkOffset, // array; nullptr ==> no offset
globalWorkSize, // array; here { n rounded up to a multiple of the local size L }, i.e. ((n + L - 1)/L)*L in integer arithmetic; the kernel's "i < n" test discards the padding work-items
localWorkSize, // array; here {threadsPerBlock}; can be nullptr (the implementation then picks the work-group size)
numEvents, // here 0 ==> none
eventWaitList, // nullptr ==> none
event); // nullptr ==> do not generate one
…
}
| // DAXPY in OpenCL (from file: daxpy.cl)
// DAXPY kernel: z[i] = a*x[i] + y[i] for every i in [0, n).
// Expects a 1-D NDRange (only dimension 0 is read), one work-item per element.
// Work-items whose global id is >= n do nothing, so the global size may be
// padded past n.
//   a        - scalar multiplier
//   x, y     - input buffers in global memory; at least n elements each
//   n        - number of elements to process; n <= 0 makes the kernel a no-op
//   z        - output buffer in global memory; at least n elements
// Uses double, so the device must support double precision.
// NOTE(review): on OpenCL 1.0/1.1 this additionally needs
// "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" earlier in daxpy.cl —
// not visible in this excerpt; confirm.
__kernel
void daxpy(double a, __global const double* x, __global const double* y,
int n, __global double* z)
{
    // get_global_id returns size_t. Narrowing it to int would turn ids >= 2^31
    // into negative indices that pass "i < n" and write before the start of z,
    // so keep the id unsigned and make the n > 0 requirement explicit.
    const size_t i = get_global_id(0);
    if (n > 0 && i < (size_t)n)
        z[i] = a*x[i] + y[i];
} |