В CUDA 4.0 на устройствах с вычислительной способностью (compute capability) 2.0 и выше можно использовать трёхмерные сетки блоков и, соответственно, трёхмерные индексы. Пример кода:
// Launch configuration for a 3D grid of 8x8x8-thread blocks.
// Each block count is a ceil-division, so an extent that is not a multiple
// of 8 still gets a final, partially used block along that axis.
// A grid with a z dimension needs a device that supports 3D grids
// (compute capability 2.0+).
int blocksInX = (nx + 8 - 1) / 8;
int blocksInY = (ny + 8 - 1) / 8;
int blocksInZ = (nz + 8 - 1) / 8;
dim3 Dg(blocksInX, blocksInY, blocksInZ);  // grid size, in blocks
dim3 Db(8, 8, 8);                          // block size, in threads
// The execution configuration is written with triple angle brackets.
foo_kernel<<<Dg, Db>>>(R, nx, ny, nz);
...
// Kernel over a 3D index space: each thread derives its global (x, y, z)
// coordinate from blockIdx / blockDim / threadIdx and, when that coordinate
// lies inside nx * ny * nz, writes one element of R.
//
// R          - output array, indexed as x + nx*y + nx*ny*z
//              (presumably device memory holding at least nx*ny*nz floats;
//              confirm at the call site)
// nx, ny, nz - extents used for the bounds check and to linearize the index
//
// The z coordinate comes from blockIdx.z, so covering more than one block
// along z requires launching with a 3D grid.
__global__ void foo_kernel( float* R, const int nx, const int ny, const int nz )
{
// Global coordinate of this thread along each axis.
unsigned int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
unsigned int zIndex = blockDim.z * blockIdx.z + threadIdx.z;
// Guard: threads whose coordinate falls outside the extents do nothing.
if ( (xIndex < nx) && (yIndex < ny) && (zIndex < nz) )
{
// Linear offset with x varying fastest, then y, then z.
unsigned int index_out = xIndex + nx*yIndex + nx*ny*zIndex;
...
R[index_out] = ...;
}
}
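Если ваше устройство не поддерживает compute capability 2.0, можно прибегнуть к следующему приёму: упаковать измерения Y и Z во второе измерение двумерной сетки, а в ядре восстановить из него оба индекса: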
// Threads per block along each axis (16 * 4 * 4 = 256 threads per block).
int threadsInX = 16, threadsInY = 4, threadsInZ = 4;

// Blocks needed along each axis, rounded up so a partial tile is still covered.
int blocksInX = (nx + (threadsInX - 1)) / threadsInX;
int blocksInY = (ny + (threadsInY - 1)) / threadsInY;
int blocksInZ = (nz + (threadsInZ - 1)) / threadsInZ;

// 2D grid: the Y and Z block counts are folded together into the grid's
// second dimension; the block itself stays three-dimensional.
dim3 Dg(blocksInX, blocksInY * blocksInZ);
dim3 Db(threadsInX, threadsInY, threadsInZ);

// The kernel receives blocksInY and its reciprocal in addition to the extents.
const float invBlocksInY = 1.0f / static_cast<float>(blocksInY);
foo_kernel<<<Dg, Db>>>(R, nx, ny, nz, blocksInY, invBlocksInY);
// Variant of foo_kernel for a 2D grid whose y dimension carries both the
// logical Y and Z block indices (grid y = blocksInY * blocksInZ).
// Each thread reconstructs its global (x, y, z) coordinate and, when that
// coordinate lies inside nx * ny * nz, writes one element of R.
//
// R            - output array, indexed as x + nx*y + nx*ny*z
//                (presumably device memory holding at least nx*ny*nz floats;
//                confirm at the call site)
// nx, ny, nz   - extents used for the bounds check and to linearize the index
// blocksInY    - number of logical blocks along Y that were packed into grid y
// invBlocksInY - 1.0f / blocksInY, supplied by the host to avoid a division here
__global__ void foo_kernel(float *R, const int nx, const int ny, const int nz,
                           unsigned int blocksInY, float invBlocksInY)
{
    // Split the packed grid-y index: quotient by blocksInY is the logical z
    // block, the remainder is the logical y block.
    // NOTE(review): the quotient is computed as a float multiply by the
    // reciprocal and rounded down; for some blocksInY values this may land one
    // below the true quotient when blockIdx.y is an exact multiple. Consider
    // integer division if that matters; confirm on the target sizes.
    unsigned int blockIdxz = __float2uint_rd(blockIdx.y * invBlocksInY);
    unsigned int blockIdxy = blockIdx.y - __umul24(blockIdxz, blocksInY);

    // Global coordinate of this thread along each axis (24-bit multiplies).
    unsigned int xIndex = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    unsigned int yIndex = __umul24(blockIdxy, blockDim.y) + threadIdx.y;
    unsigned int zIndex = __umul24(blockIdxz, blockDim.z) + threadIdx.z;

    // Guard: every axis is checked against its own extent (y against ny).
    if ( (xIndex < nx) && (yIndex < ny) && (zIndex < nz) )
    {
        // Linear offset with x varying fastest, then y, then z.
        unsigned int index = xIndex + nx*yIndex + nx*ny*zIndex;
        ...
        R[index] = ...;
    }
}