I'm trying to sum the result of an array reduction from inside the kernel, without having to send the data back to the host CPU, but I'm not getting the correct results. Here is the sum kernel I'm using (slightly modified from the one NVIDIA provides):
template <class T, unsigned int blockSize, bool nIsPow2>
__device__ void
reduce(T *g_idata, T *g_odata, unsigned int n)
{
    __shared__ T sdata[blockSize];

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*blockSize*2 + threadIdx.x;
    unsigned int gridSize = blockSize*2*gridDim.x;
    T mySum = 0;

    // we reduce multiple elements per thread. The number is determined by the
    // number of active thread blocks (via gridDim). More blocks will result
    // in a larger gridSize and therefore fewer elements per thread
    while (i < n)
    {
        mySum += g_idata[i];
        // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
        if (nIsPow2 || i + blockSize < n)
            mySum += g_idata[i+blockSize];
        i += gridSize;
    }

    // each thread puts its local sum into shared memory
    sdata[tid] = mySum;
    __syncthreads();

    // do reduction in shared mem
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] = mySum = mySum + sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] = mySum = mySum + sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid <  64) { sdata[tid] = mySum = mySum + sdata[tid +  64]; } __syncthreads(); }

#ifndef __DEVICE_EMULATION__
    if (tid < 32)
#endif
    {
        // now that we are using warp-synchronous programming (below)
        // we need to declare our shared memory volatile so that the compiler
        // doesn't reorder stores to it and induce incorrect behavior.
        volatile T* smem = sdata;
        if (blockSize >= 64) { smem[tid] = mySum = mySum + smem[tid + 32]; EMUSYNC; }
        if (blockSize >= 32) { smem[tid] = mySum = mySum + smem[tid + 16]; EMUSYNC; }
        if (blockSize >= 16) { smem[tid] = mySum = mySum + smem[tid +  8]; EMUSYNC; }
        if (blockSize >=  8) { smem[tid] = mySum = mySum + smem[tid +  4]; EMUSYNC; }
        if (blockSize >=  4) { smem[tid] = mySum = mySum + smem[tid +  2]; EMUSYNC; }
        if (blockSize >=  2) { smem[tid] = mySum = mySum + smem[tid +  1]; EMUSYNC; }
    }

    // write result for this block to global mem
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}
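(For context: because this version of reduce is a __device__ function rather than a __global__ kernel, it can only run as part of another kernel. A minimal sketch of a standalone wrapper for testing it on its own would look like the following; the name reduce_kernel is mine, not from the SDK:)

// Hypothetical thin wrapper (name is illustrative): lets the __device__
// reduce above be launched on its own, producing one partial sum per block.
template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduce_kernel(T *g_idata, T *g_odata, unsigned int n)
{
    reduce<T, blockSize, nIsPow2>(g_idata, g_odata, n);
}

And here is the kernel where I call reduce: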
template <unsigned int blockSize>
__global__ void compute( int *values, int *temp, int *temp2, int* results, unsigned int N, unsigned int M )
{
    int tdx = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tdx;
    int val = 0;
    int cpt = 0;

    if( idx < N )
    {
        // iterate over all (i, j) pairs with j > i
        for( int i = 0; i < M; ++i )
        {
            for( int j = i+1; j < M; ++j )
            {
                val = values[i*N+idx];
                __syncthreads();

                // per-block reduction of temp into temp2 (one partial sum per block)
                reduce<int, blockSize, false>( temp, temp2, N );
                __syncthreads();

                // thread 0 of each block sums the per-block partial results
                if( tdx == 0 )
                {
                    val = 0;
                    for( int k=0; k < gridDim.x; ++k )
                    {
                        val += temp2[k];
                        temp2[k] = 0;
                    }
                    results[cpt] = val;
                }
                __syncthreads();
                ++cpt;
            }
        }
    }
}
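This is roughly how the kernel gets launched (a minimal sketch; the buffer names, block size, and helper function are illustrative, not my exact host code):

// Illustrative launch sketch: d_values, d_temp, d_temp2, d_results are
// assumed to be device buffers allocated and filled elsewhere.
void launch_compute(int *d_values, int *d_temp, int *d_temp2, int *d_results,
                    unsigned int N, unsigned int M)
{
    const unsigned int blockSize = 256;                 // must equal blockDim.x
    unsigned int numBlocks = (N + blockSize - 1) / blockSize;
    compute<blockSize><<<numBlocks, blockSize>>>(d_values, d_temp, d_temp2,
                                                 d_results, N, M);
    cudaDeviceSynchronize();                            // wait for the kernel to finish
}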
Am I missing something? Thanks!