Jesse Hall has already answered your question, so I will limit myself to complementing his answer.
Occupancy is not the only figure of merit to care about when maximizing an algorithm's performance, which most often means minimizing the execution time. I suggest taking a look at Vasily Volkov's instructive GTC2010 presentation:
Better Performance at Lower Occupancy
Below, I provide a simple example, inspired by Part II of the above presentation.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define BLOCKSIZE 512
//#define DEBUG
/*******************/
/* iDivUp FUNCTION */
/*******************/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); }
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
/***********************************************/
/* MEMCPY1 - EACH THREAD COPIES ONE FLOAT ONLY */
/***********************************************/
__global__ void memcpy1(float *src, float *dst, unsigned int N)
{
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {
        float a0 = src[tid];
        dst[tid] = a0;
    }
}
/*******************************************/
/* MEMCPY2 - EACH THREAD COPIES TWO FLOATS */
/*******************************************/
__global__ void memcpy2(float *src, float *dst, unsigned int N)
{
    const int tid = threadIdx.x + blockIdx.x * (2 * blockDim.x);

    if (tid < N) {
        // Two independent loads per thread: the second load does not depend on
        // the first, so their latencies overlap (instruction-level parallelism)
        float a0 = src[tid];
        float a1 = src[tid + blockDim.x];

        dst[tid] = a0;
        dst[tid + blockDim.x] = a1;
    }
}
/********************************************/
/* MEMCPY4 - EACH THREAD COPIES FOUR FLOATS */
/********************************************/
__global__ void memcpy4(float *src, float *dst, unsigned int N)
{
    const int tid = threadIdx.x + blockIdx.x * (4 * blockDim.x);

    if (tid < N) {
        // Four independent loads in flight per thread
        float a0 = src[tid];
        float a1 = src[tid + blockDim.x];
        float a2 = src[tid + 2 * blockDim.x];
        float a3 = src[tid + 3 * blockDim.x];

        dst[tid] = a0;
        dst[tid + blockDim.x] = a1;
        dst[tid + 2 * blockDim.x] = a2;
        dst[tid + 3 * blockDim.x] = a3;
    }
}
/***********************************************/
/* MEMCPY4_2 - EACH THREAD COPIES FOUR FLOAT2s */
/***********************************************/
__global__ void memcpy4_2(float2 *src, float2 *dst, unsigned int N)
{
    const int tid = threadIdx.x + blockIdx.x * (4 * blockDim.x);

    // N counts floats, so there are N/2 float2 elements
    if (tid < N/2) {
        float2 a0 = src[tid];
        float2 a1 = src[tid + blockDim.x];
        float2 a2 = src[tid + 2 * blockDim.x];
        float2 a3 = src[tid + 3 * blockDim.x];

        dst[tid] = a0;
        dst[tid + blockDim.x] = a1;
        dst[tid + 2 * blockDim.x] = a2;
        dst[tid + 3 * blockDim.x] = a3;
    }
}
/********/
/* MAIN */
/********/
int main()
{
    const int N = 131072;
    const int N_iter = 20;

    // --- Setting host data and memory space for result
    float* h_vect   = (float*)malloc(N*sizeof(float));
    float* h_result = (float*)malloc(N*sizeof(float));
    for (int i=0; i<N; i++) h_vect[i] = i;

    // --- Setting device data and memory space for result
    float* d_src;     gpuErrchk(cudaMalloc((void**)&d_src,     N*sizeof(float)));
    float* d_dest1;   gpuErrchk(cudaMalloc((void**)&d_dest1,   N*sizeof(float)));
    float* d_dest2;   gpuErrchk(cudaMalloc((void**)&d_dest2,   N*sizeof(float)));
    float* d_dest4;   gpuErrchk(cudaMalloc((void**)&d_dest4,   N*sizeof(float)));
    float* d_dest4_2; gpuErrchk(cudaMalloc((void**)&d_dest4_2, N*sizeof(float)));
    gpuErrchk(cudaMemcpy(d_src, h_vect, N*sizeof(float), cudaMemcpyHostToDevice));

    // --- Warmup
    for (int i=0; i<N_iter; i++) memcpy1<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_src, d_dest1, N);

    // --- Creating events for timing
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    /***********/
    /* MEMCPY1 */
    /***********/
    cudaEventRecord(start, 0);
    for (int i=0; i<N_iter; i++) {
        memcpy1<<<iDivUp(N,BLOCKSIZE), BLOCKSIZE>>>(d_src, d_dest1, N);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GB/s = %f\n", (1.e-6)*(float)(N*N_iter*sizeof(float))/time);
    gpuErrchk(cudaMemcpy(h_result, d_dest1, N*sizeof(float), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) if (h_result[i] != h_vect[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_vect[i], h_result[i]); return 1; }
    /***********/
    /* MEMCPY2 */
    /***********/
    cudaEventRecord(start, 0);
    for (int i=0; i<N_iter; i++) {
        memcpy2<<<iDivUp(N/2,BLOCKSIZE), BLOCKSIZE>>>(d_src, d_dest2, N);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GB/s = %f\n", (1.e-6)*(float)(N*N_iter*sizeof(float))/time);
    gpuErrchk(cudaMemcpy(h_result, d_dest2, N*sizeof(float), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) if (h_result[i] != h_vect[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_vect[i], h_result[i]); return 1; }
    /***********/
    /* MEMCPY4 */
    /***********/
    cudaEventRecord(start, 0);
    for (int i=0; i<N_iter; i++) {
        memcpy4<<<iDivUp(N/4,BLOCKSIZE), BLOCKSIZE>>>(d_src, d_dest4, N);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GB/s = %f\n", (1.e-6)*(float)(N*N_iter*sizeof(float))/time);
    gpuErrchk(cudaMemcpy(h_result, d_dest4, N*sizeof(float), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) if (h_result[i] != h_vect[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_vect[i], h_result[i]); return 1; }
    /*************/
    /* MEMCPY4_2 */
    /*************/
    cudaEventRecord(start, 0);
    for (int i=0; i<N_iter; i++) {
        memcpy4_2<<<iDivUp(N/8,BLOCKSIZE), BLOCKSIZE>>>((float2*)d_src, (float2*)d_dest4_2, N);
#ifdef DEBUG
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
#endif
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("GB/s = %f\n", (1.e-6)*(float)(N*N_iter*sizeof(float))/time);
    gpuErrchk(cudaMemcpy(h_result, d_dest4_2, N*sizeof(float), cudaMemcpyDeviceToHost));
    for (int i=0; i<N; i++) if (h_result[i] != h_vect[i]) { printf("Error at i=%i! Host = %f; Device = %f\n", i, h_vect[i], h_result[i]); return 1; }

    cudaDeviceReset();
    return 0;
}
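As a side note, on CUDA 6.5 or later the theoretical occupancy at each BLOCKSIZE can also be queried programmatically through the occupancy API; the printOccupancy helper below is a minimal sketch of my own, not part of the code above.
template<class T>
void printOccupancy(T kernel, int blockSize)
{
    // Maximum number of resident blocks per multiprocessor for this kernel
    int numBlocks;
    gpuErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, blockSize, 0));

    cudaDeviceProp prop;
    gpuErrchk(cudaGetDeviceProperties(&prop, 0));

    // Occupancy = resident threads / maximum resident threads per multiprocessor
    printf("blockSize = %4d: theoretical occupancy = %.0f%%\n", blockSize,
           100.f * (numBlocks * blockSize) / prop.maxThreadsPerMultiProcessor);
}

// Usage, e.g.: printOccupancy(memcpy1, BLOCKSIZE);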
Below is the performance of the above code when run on a GeForce GT540M, a Kepler K20c, and a Tesla C2050; the percentage next to each bandwidth figure is the occupancy.
BLOCKSIZE 32

            GT540M          K20c            Tesla C2050
memcpy1     2.3GB/s  13%    28.1GB/s  18%   14.9GB/s  12%
memcpy2     4.4GB/s  13%    41.1GB/s  18%   24.8GB/s  13%
memcpy4     7.5GB/s  13%    54.8GB/s  18%   34.6GB/s  13%
memcpy4_2  11.2GB/s  14%    68.8GB/s  18%   44.0GB/s  14%

BLOCKSIZE 64

            GT540M          K20c            Tesla C2050
memcpy1     4.6GB/s  27%    44.1GB/s  36%   26.1GB/s  26%
memcpy2     8.1GB/s  27%    57.1GB/s  36%   35.7GB/s  26%
memcpy4    11.4GB/s  27%    63.2GB/s  36%   43.5GB/s  26%
memcpy4_2  12.6GB/s  27%    72.8GB/s  36%   49.7GB/s  27%

BLOCKSIZE 128

            GT540M          K20c            Tesla C2050
memcpy1     8.0GB/s  52%    60.6GB/s  78%   36.1GB/s  52%
memcpy2    11.6GB/s  52%    61.6GB/s  78%   44.8GB/s  52%
memcpy4    12.4GB/s  52%    62.2GB/s  78%   48.3GB/s  52%
memcpy4_2  12.5GB/s  52%    61.9GB/s  78%   49.5GB/s  52%

BLOCKSIZE 256

            GT540M          K20c            Tesla C2050
memcpy1    10.6GB/s  80%    61.2GB/s  74%   42.0GB/s  77%
memcpy2    12.3GB/s  80%    66.2GB/s  74%   48.2GB/s  77%
memcpy4    12.4GB/s  80%    66.4GB/s  74%   45.5GB/s  77%
memcpy4_2  12.6GB/s  70%    72.6GB/s  74%   50.8GB/s  77%

BLOCKSIZE 512

            GT540M          K20c            Tesla C2050
memcpy1    10.3GB/s  80%    54.5GB/s  75%   41.6GB/s  75%
memcpy2    12.2GB/s  80%    67.1GB/s  75%   47.7GB/s  75%
memcpy4    12.4GB/s  80%    67.9GB/s  75%   46.9GB/s  75%
memcpy4_2  12.5GB/s  55%    70.1GB/s  75%   48.3GB/s  75%
The above results show that you can achieve better performance, e.g., 12GB/s in the GT540M case, at lower occupancy, e.g., 27%, if you properly exploit Instruction Level Parallelism (ILP) by giving each thread more work to do, so as to hide latency.
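The same trick applies to arithmetic latency, as discussed in Part I of Volkov's presentation: a thread that carries several independent dependency chains can keep the arithmetic pipelines busy at lower occupancy. Below is a minimal sketch of the idea; the ilp1/ilp2 kernels are illustrative examples of my own, not taken from the slides.
__global__ void ilp1(float *out, float a, float b, int M)
{
    float x = (float)threadIdx.x;
    // One dependent FMA chain: each iteration must wait for the previous one
    for (int i = 0; i < M; i++) x = x * a + b;
    out[threadIdx.x + blockIdx.x * blockDim.x] = x;
}

__global__ void ilp2(float *out, float a, float b, int M)
{
    float x = (float)threadIdx.x;
    float y = (float)threadIdx.x + 1.f;
    for (int i = 0; i < M; i++) {
        // Two independent chains: the second FMA does not depend on the first,
        // so their latencies overlap within a single thread
        x = x * a + b;
        y = y * a + b;
    }
    out[threadIdx.x + blockIdx.x * blockDim.x] = x + y;
}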