Я конвертирую свой исходный код C в Cuda для повышения производительности, но значения как C, так и кода Cuda не совпадают. Cuda дает неверный результат. Нет ошибок нигде в отношении проверки ошибок Cuda. Когда я запускаю ядро, значения не совпадают с исходным кодом C. Мой код: -
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <iostream>
#define max 673
using real_sim = float;
// Error checking macro
void allocate_array_2d(real_sim **&pDouble, const int dim1, const int dim2) {
// Contiguous allocation of 2D arrays
// Referenced from:
// http://www.trevorsimonton.com/blog/2016/11/16/transfer-2d-array-memory-to-cuda.html
// and
// https://dev.to/drakargx/c-contiguous-allocation-of-2-d-arrays-446m
// with error correction i=0 to i=1
pDouble = new real_sim * [dim1];
pDouble[0] = new real_sim[dim1 * dim2];
for (int i = 1; i < dim1; i++) pDouble[i] = pDouble[i-1] + dim2;
for (int i = 0; i < dim1; i++) {
for (int j = 0; j < dim2;j++) {
pDouble[i][j] = 0;
}
}
}
#define cudaCheckError(code) \
{ \
if ((code) != cudaSuccess) { \
fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
cudaGetErrorString(code)); \
} \
}
__global__ void kernel_1d(float*da,float*db,float*dc)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int of = x * max + y;
int z = 0;
if (x < max && x>0 && y < max && y>0) {
int a = 0;
int b = 0;
// dc[of] = 2*db[x * max + y ] * da[(x ) * max + y];
if(x<10) a *= db[x * max + y] * da[(x)*max + y] + da[of];
a*= db[x * max + y] * da[(x)*max + y] + da[of];
b+= db[x * max + y] * da[(x)*max + y] - da[of];
db[(x ) * max + y] = db[x * max + y ]* da[(x)*max + y] + a;
dc[(x)*max + y] += 2*db[x * max + y] /b;
// printf("block %d, thread %d, index (%d, %d)\n", blockIdx.x, threadIdx.x, x,
}
__syncthreads();
}
int main()
{
real_sim** a;
real_sim** b;
real_sim** c;
real_sim** cpu;//host array check device ouput..
allocate_array_2d(a,max,max);//input array
allocate_array_2d(b,max,max);//input array
allocate_array_2d(c,max,max);//output array for device code
allocate_array_2d(cpu, max, max);//output array for C code
for (int i = 0; i < max; i++) {
for (int j = 0; j < max; j++) {
a[i][j] = j+1;
b[i][j] = j+1;
c[i][j] = 0;
cpu[i][j] = 0;
}
}
float* da;
float* db;
float* dc;
int pixel_count = max * max;
cudaCheckError(cudaMalloc(&da, pixel_count * sizeof(float)));
cudaCheckError(cudaMalloc(&db, pixel_count * sizeof(float)));
cudaCheckError(cudaMalloc(&dc, pixel_count * sizeof(float)));
std::cout.precision(17);
for (int k = 1; k < max; k++) {
std::cout << " " << k << "\n";
cudaCheckError(cudaMemcpy(da, a[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
cudaCheckError(cudaMemcpy(db, b[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
cudaCheckError(cudaMemcpy(dc, c[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
// calculation for C code.........................
dim3 thread(32, 32);
dim3 block((max+31)/32, (max + 31) / 32);
kernel_1d << <block, thread >> > (da, db, dc);
cudaCheckError(cudaDeviceSynchronize());
// copy value to host for result comparison...........
cudaCheckError(cudaMemcpy(c[0], dc, pixel_count * sizeof(float), cudaMemcpyDeviceToHost));
// calculation for C code.........................
for (int i = 1; i < max; i++) {
for (int j = 1; j < max; j++) {
int a1 = 0, b1 = 0;
if (i < 10) a1 *= b[i][j] * a[i][j] + a[i][j];
a1 *= b[i][j] * a[i][j] + a[i][j];
b1 += b[i][j] * a[i][j] - a[i][j];
b[i][j] = b[i][j] * a[i][j] + a1;
cpu[i][j] += 2 * b[i][j] / b1;
}
}
///Compairing CPU and GPU code results..........................
for (int i = 0; i < max; i++) {
for (int j = 0; j < max; j++) {
if (c[i][j] != cpu[i][j]){
std::cout << "BREAK DUE TO MIS MATCH OF GPU AND CPU RESULTS" << "\n";
std::cout <<"i= "<< i << " j= " << j << "\n";//index
std::cout <<"c[i][j]="<< c[i][j] << " cpu[i][j]= " << cpu[i][j] << "\n";//values
exit(0);//exiting when output do not match..............
}
}
}
std::cout << "\nSUCESS" << "\n";
}
}