Partial Comparison:
DAXPY in CUDA versus OpenCL

z[ ] = a * x[ ] + y[ ];

  (For each API, the host-side code that launches DAXPY from the CPU is shown first, followed by the DAXPY kernel that runs on the GPU.)
CUDA
// DAXPY in CUDA (from file: daxpy.cu)
void doDAXPY(double a, const double* xHost, const double* yHost,
             int n, double* zHost)
{
    … (including a series of calls to create and initialize GPU buffers)
    daxpy<<<nblocks, threadsPerBlock>>>(a, xGPU, yGPU, n, zGPU);
    …
}
// DAXPY in CUDA (from file: daxpy.cu)
__global__
void daxpy(double a, const double* x, const double* y,
           int n, double* z)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
        z[i] = a*x[i] + y[i];
}
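The kernel's index computation can be illustrated with a plain-C emulation of the launch (a hypothetical sketch, not part of the course code; `daxpy_emulated` and its loop variables are made-up names that mirror CUDA's built-ins):

```c
#include <assert.h>

/* Hypothetical CPU emulation of the CUDA launch above.  The two nested
 * loops play the roles of blockIdx.x and threadIdx.x; every
 * (block, thread) pair computes the same global index i that the real
 * kernel would.  This also shows why the "if (i < n)" guard is needed
 * whenever n is not a multiple of threadsPerBlock. */
void daxpy_emulated(double a, const double* x, const double* y,
                    int n, double* z, int threadsPerBlock)
{
    int nblocks = (n + threadsPerBlock - 1) / threadsPerBlock; /* ceil(n/tpb) */
    for (int block = 0; block < nblocks; block++)
        for (int thread = 0; thread < threadsPerBlock; thread++)
        {
            int i = block*threadsPerBlock + thread; /* global index */
            if (i < n)       /* last block may be only partially full */
                z[i] = a*x[i] + y[i];
        }
}
```

For example, with n = 5 and threadsPerBlock = 4, the second block covers i = 4..7, and the guard discards i = 5, 6, 7.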
OpenCL
// DAXPY in OpenCL (from file: daxpy.c++)
void doDAXPY(double a, const double* xHost, const double* yHost,
             int n, double* zHost)
{
    … (including a series of calls to create and initialize GPU buffers)
    … (including a series of "clSetKernelArg" calls to bind parameters), e.g.:
    clSetKernelArg(kernel, 0, sizeof(double), &a);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferX);
    // ... remainder of the clSetKernelArg calls ...
    clEnqueueNDRangeKernel(cmdQueue,
        kernel, // object created by clCreateKernel
        workDimension, // here it is 1
        globalWorkOffset, // []; nullptr ==> none
        globalWorkSize, // []; { ceil(n/localWorkSize)*localWorkSize }
        localWorkSize, // []; here {threadsPerBlock}; can be nullptr
        numEvents, // here 0 ==> none
        eventWaitList, // nullptr ==> none
        event); // nullptr ==> do not generate one
    …
}
// DAXPY in OpenCL (from file: daxpy.cl)
__kernel
void daxpy(double a, __global const double* x, __global const double* y,
           int n, __global double* z)
{
    int i = get_global_id(0);
    if (i < n)
        z[i] = a*x[i] + y[i];
}

General Comments

  1. Instead of one process doing:
    for (int i=0 ; i<n ; i++)
        z[i] = a * x[i] + y[i];
    the kernel launches shown above start at least n threads (CUDA) or work-items (OpenCL) on the GPU, each of which uses its global index to set one position of the z array, as shown in the GPU kernels above.
  2. Comparing CUDA's launch configuration (nBlocks, threadsPerBlock) with OpenCL's (globalWorkSize, localWorkSize), two examples:
    1. n = 16384 ⇒ globalWorkSize = 16384
      We will choose threadsPerBlock = 128 ⇒ nBlocks = 128.
    2. n = 17000 ⇒ globalWorkSize = 17024
      (assuming we keep threadsPerBlock = 128) ⇒ nBlocks = 133.
      Because globalWorkSize must be a multiple of localWorkSize, 17000 is rounded up to 133 * 128 = 17024; the kernel's "if (i < n)" guard makes the extra 24 work-items do nothing.
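The arithmetic behind both examples can be written out explicitly (a sketch with made-up helper names, not library code): CUDA asks for a block count, while OpenCL asks for a total work-item count that must be a multiple of localWorkSize, so the two quantities are related by rounding n up.

```c
#include <assert.h>

/* Number of blocks (CUDA) needed to cover n elements:
 * integer arithmetic for ceil(n / threadsPerBlock). */
int nBlocks(int n, int threadsPerBlock)
{
    return (n + threadsPerBlock - 1) / threadsPerBlock;
}

/* Total work-item count (OpenCL globalWorkSize): the block count
 * scaled back up, i.e. n rounded up to a multiple of localWorkSize. */
int globalWorkSize(int n, int threadsPerBlock)
{
    return nBlocks(n, threadsPerBlock) * threadsPerBlock;
}
```

With threadsPerBlock = 128, these reproduce the two examples above: n = 16384 gives 128 blocks and globalWorkSize 16384, while n = 17000 gives 133 blocks and globalWorkSize 17024.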