В настоящее время я пытаюсь запустить простую многопроцессорную программу с CUDA.В основном он копирует большой массив с некоторыми фиктивными данными в виде блоков в графические процессоры, которые выполняют некоторые математические операции, а затем копируют полученный массив обратно.
Я не получаю никаких ошибок в выводе VS2017,но некоторые сообщения об ошибках, которые я настроил, показывают, что при попытке скопировать H2D или D2H.Это говорит мне, что происходит cudaErrorInvalidValue.Также при использовании cudaFree ();функция, я получаю ошибку cudaErrorInvalidDevicePointer.
Вывод программы, результат, совершенно неверно.В целях тестирования ядро только устанавливает для каждого значения выходного массива значение 50. Результатом является относительно большое отрицательное число, всегда одинаковое независимо от того, что делает ядро.
У меня уже естьпопытался использовать указатель, который не является частью структуры, но определен непосредственно перед cudaMalloc, где он используется первым.Это ничего не изменило.
Это функция, которая запускает ядро:
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan)
{
cudaSetDevice(device);
cudaStreamCreate(&gpuplan.stream);
cudaMemcpyAsync(gpuplan.d_data_ptr, h_data, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyHostToDevice, gpuplan.stream); //asynchronous memory copy of the data array h2d
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy H2D on GPU %i: Error %i\n", device, x);
}
dummyKernel << <BLOCK_N, THREAD_N, 0, gpuplan.stream >> > (gpuplan.d_data_ptr, gpuplan.d_out_ptr, kernelPlan.ComputationsPerThread, kernelPlan.AdditionalComputationThreadCount); //run kernel
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("no successfull kernel launch\n Kernel Launch Error %i \n", x);
}
else {
printf("kernel ran.\n");
}
cudaMemcpyAsync(h_out, gpuplan.d_out_ptr, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyDeviceToHost, gpuplan.stream); //asynchronous memory copy of the output array d2h
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy D2H on GPU %i: Error %i\n", device, x);
}
cudaStreamDestroy(gpuplan.stream);
}
Далее, как структура определяется в "kernel.h":
#ifndef KERNEL_H
#define KERNEL_H
#include "cuda_runtime.h"
//GPU plan
typedef struct
{
unsigned int Computations; //computations on this GPU
unsigned int Repetitions; // amount of kernel repetitions
unsigned int ComputationsPerRepetition; // amount of computations in every kernel execution
unsigned int AdditionalComputationRepetitionsCount; // amount of repetitions that need to do one additional computation
unsigned int DataStartingPoint; // tells the kernel launch at which point in the DATA array this GPU has to start working
float* d_data_ptr;
float* d_out_ptr;
cudaStream_t stream;
} GPUplan;
typedef struct
{
unsigned int Computations;
unsigned int ComputationsPerThread; // number of computations every thread of this repetition on this GPU has to do
unsigned int AdditionalComputationThreadCount; // number of threads in this repetition on this GPU that have to
unsigned int DataStartingPoint; // tells the kernel launch at which point in the DATA array this repetition has to start working
} KernelPlan;
GPUplan planGPUComputation(int DATA_N, int GPU_N, int device, long long MemoryPerComputation, int dataCounter);
KernelPlan planKernelComputation(int GPUDataStartingPoint, int GPUComputationsPerRepetition, int GPUAdditionalComputationRepetitionsCount, int Repetition, int dataCounter, int THREAD_N, int BLOCK_N);
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan);
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan);
void memFree(int device, GPUplan gpuPlan);
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount);
#endif
здесьчасть кода, которая вызывает runKernel:
int GPU_N;
cudaGetDeviceCount(&GPU_N);
const int BLOCK_N = 32;
const int THREAD_N = 1024;
const int DATA_N = 144000;
const int MemoryPerComputation = sizeof(float);
float *h_data;
float *h_out;
h_data = (float *)malloc(MemoryPerComputation * DATA_N);
h_out = (float *)malloc(MemoryPerComputation * DATA_N);
float* sourcePointer;
float* destPointer;
for (int i = 0; i < maxRepetitionCount; i++) // repeat this enough times so that the GPU with the most repetitions will get through all of them
{
//malloc
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memAllocation(j, MemoryPerComputation, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
//kernel launch/memcpy
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
sourcePointer = h_data + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
destPointer = h_out + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
runKernel(j, i, sourcePointer, destPointer, MemoryPerComputation, BLOCK_N, THREAD_N, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memFree(j, plan[j]);
}
}
}
Я не думаю, что само ядро будет здесь иметь какое-либо значение, так как ошибка memcpy уже появляется еще до того, как она будет выполнена.
Ожидаемыйвывод состоит в том, что каждый элемент выходного массива равен 50. Вместо этого каждый элемент имеет значение -431602080.0
Массив является массивом с плавающей запятой.
РЕДАКТИРОВАТЬ: вот полный код, используемый для воспроизведенияпроблема (в дополнение к kernel.h сверху):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include "kernel.h"
#define MAX_GPU_COUNT 32
#define MAX_REP_COUNT 64
__global__ void dummyKernel(float *d_data, float *d_out, int d_ComputationsPerThread, int d_AdditionalComputationThreadCount) {
int computations = d_ComputationsPerThread; //computations to be performed in this repetition on this GPU
const int threadID = blockDim.x * blockIdx.x + threadIdx.x; //thread id within GPU Repetition
if (threadID > d_AdditionalComputationThreadCount) {
computations++; //check if thread has to do an additional computation
}
for (int i = 0; i < computations; i++) {
d_out[i * blockDim.x * gridDim.x + threadID] = 50;
}
}
GPUplan planGPUComputation(int DATA_N, int GPU_N, int device, long long MemoryPerComputation, int dataCounter)
{
GPUplan plan;
size_t free, total;
//computations on GPU #device
plan.Computations = DATA_N / GPU_N;
//take into account odd data size for this GPU
if (DATA_N % GPU_N > device) {
plan.Computations++;
}
plan.DataStartingPoint = dataCounter;
//get memory information
cudaSetDevice(device);
cudaMemGetInfo(&free, &total);
//calculate Repetitions on this GPU #device
plan.Repetitions = ((plan.Computations * MemoryPerComputation / free) + 1);
printf("Repetitions: %i\n", plan.Repetitions);
if (plan.Repetitions > MAX_REP_COUNT) {
printf("Repetition count larger than MAX_REP_COUNT %i\n\n", MAX_REP_COUNT);
}
//calculate Computations per Repetition
plan.ComputationsPerRepetition = plan.Computations / plan.Repetitions;
//calculate how many Repetitions have to do an additional Computation
plan.AdditionalComputationRepetitionsCount = plan.Computations % plan.Repetitions;
return plan;
}
KernelPlan planKernelComputation(int GPUDataStartingPoint, int GPUComputationsPerRepetition, int GPUAdditionalComputationRepetitionsCount, int Repetition, int dataCounter, int THREAD_N, int BLOCK_N)
{
KernelPlan plan;
//calculate total Calculations in this Repetition
plan.Computations = GPUComputationsPerRepetition;
if (GPUAdditionalComputationRepetitionsCount > Repetition) {
plan.Computations++;
}
plan.ComputationsPerThread = plan.Computations / (THREAD_N * BLOCK_N); // Computations every thread has to do (+- 1)
plan.AdditionalComputationThreadCount = plan.Computations % (THREAD_N * BLOCK_N); // how many threads have to do +1 calculation
plan.DataStartingPoint = GPUDataStartingPoint + dataCounter;
return plan;
}
void memAllocation(int device, int MemoryPerComputation, GPUplan gpuPlan, KernelPlan kernelPlan)
{
cudaSetDevice(device); //select device to allocate memory on
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Error Selecting device %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_data_ptr), MemoryPerComputation * kernelPlan.Computations); // device data array memory allocation
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Malloc 1 on GPU %i: Error %i\n", device, x);
}
cudaMalloc((void**)&(gpuPlan.d_out_ptr), MemoryPerComputation * kernelPlan.Computations); // device output array memory allocation
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Malloc 2 on GPU %i: Error %i\n", device, x);
}
}
void runKernel(int device, int Repetition, float* h_data, float* h_out, int MemoryPerComputation, int BLOCK_N, int THREAD_N, GPUplan gpuplan, KernelPlan kernelPlan)
{
cudaSetDevice(device);
cudaStreamCreate(&gpuplan.stream);
cudaMemcpyAsync(gpuplan.d_data_ptr, h_data, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyHostToDevice, gpuplan.stream); //asynchronous memory copy of the data array h2d
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy H2D on GPU %i: Error %i\n", device, x);
}
dummyKernel << <BLOCK_N, THREAD_N, 0, gpuplan.stream >> > (gpuplan.d_data_ptr, gpuplan.d_out_ptr, kernelPlan.ComputationsPerThread, kernelPlan.AdditionalComputationThreadCount); //run kernel
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("no successfull kernel launch\n Kernel Launch Error %i \n", x);
}
else {
printf("kernel ran.\n");
}
cudaMemcpyAsync(h_out, gpuplan.d_out_ptr, kernelPlan.Computations * MemoryPerComputation, cudaMemcpyDeviceToHost, gpuplan.stream); //asynchronous memory copy of the output array d2h
x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memcpy D2H on GPU %i: Error %i\n", device, x);
}
cudaStreamDestroy(gpuplan.stream);
}
void memFree(int device, GPUplan gpuPlan)
{
cudaSetDevice(device); //select device to allocate memory on
cudaFree(gpuPlan.d_data_ptr);
cudaFree(gpuPlan.d_out_ptr);
cudaError_t x = cudaGetLastError();
if (x != cudaSuccess) {
printf("Memfree on GPU %i: Error %i\n", device, x);
}
else {
printf("memory freed.\n");
}
//17 = cudaErrorInvalidDevicePointer
}
int main()
{
//get device count
int GPU_N;
cudaGetDeviceCount(&GPU_N);
//adjust for device count larger than MAX_GPU_COUNT
if (GPU_N > MAX_GPU_COUNT)
{
GPU_N = MAX_GPU_COUNT;
}
printf("GPU count: %i\n", GPU_N);
//definitions for running the program
const int BLOCK_N = 32;
const int THREAD_N = 1024;
const int DATA_N = 144000;
const int MemoryPerComputation = sizeof(float);
///////////////////////////////////////////////////////////
//Subdividing input data across GPUs
//////////////////////////////////////////////
//GPUplan
GPUplan plan[MAX_GPU_COUNT];
int dataCounter = 0;
for (int i = 0; i < GPU_N; i++)
{
plan[i] = planGPUComputation(DATA_N, GPU_N, i, MemoryPerComputation, dataCounter);
dataCounter += plan[i].Computations;
}
//KernelPlan
KernelPlan kernelPlan[MAX_GPU_COUNT*MAX_REP_COUNT];
for (int i = 0; i < GPU_N; i++)
{
int GPURepetitions = plan[i].Repetitions;
dataCounter = plan[i].DataStartingPoint;
for (int j = 0; j < GPURepetitions; j++)
{
kernelPlan[i*MAX_REP_COUNT + j] = planKernelComputation(plan[i].DataStartingPoint, plan[i].ComputationsPerRepetition, plan[i].AdditionalComputationRepetitionsCount, j, dataCounter, THREAD_N, BLOCK_N);
dataCounter += kernelPlan[i*MAX_REP_COUNT + j].Computations;
}
}
float *h_data;
float *h_out;
h_data = (float *)malloc(MemoryPerComputation * DATA_N);
h_out = (float *)malloc(MemoryPerComputation * DATA_N);
//generate some input data
for (int i = 0; i < DATA_N; i++) {
h_data[i] = 2 * i;
}
//get highest repetition count
int maxRepetitionCount = 0;
for (int i = 0; i < GPU_N; i++) {
if (plan[i].Repetitions > maxRepetitionCount) {
maxRepetitionCount = plan[i].Repetitions;
}
}
printf("maxRepetitionCount: %i\n\n", maxRepetitionCount);
float* sourcePointer;
float* destPointer;
for (int i = 0; i < maxRepetitionCount; i++) // repeat this enough times so that the GPU with the most repetitions will get through all of them
{
//malloc
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memAllocation(j, MemoryPerComputation, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
//kernel launch/memcpy
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
sourcePointer = h_data + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
destPointer = h_out + kernelPlan[j*MAX_REP_COUNT + i].DataStartingPoint;
runKernel(j, i, sourcePointer, destPointer, MemoryPerComputation, BLOCK_N, THREAD_N, plan[j], kernelPlan[j*MAX_REP_COUNT + i]);
}
}
for (int j = 0; j < GPU_N; j++)
{
if (plan[j].Repetitions >= i) // when this GPU has to do at least i repetitions
{
memFree(j, plan[j]);
}
}
}
//printing expected results and results
for (int i = 0; i < 50; i++)
{
printf("%f\t", h_data[i]);
printf("%f\n", h_out[i]);
}
free(h_data);
free(h_out);
getchar();
return 0;
}