Программа Cuda показывает неправильные результаты, но не дает ошибок - PullRequest
0 голосов
/ 04 мая 2020

Я конвертирую свой исходный код C в Cuda для повышения производительности, но значения как C, так и кода Cuda не совпадают. Cuda дает неверный результат. Нет ошибок нигде в отношении проверки ошибок Cuda. Когда я запускаю ядро, значения не совпадают с исходным кодом C. Мой код: -

#include <cuda_runtime_api.h>
#include <stdio.h>
#include <iostream>
#define   max  673
using real_sim = float;
// Error checking macro
void allocate_array_2d(real_sim **&pDouble, const int dim1, const int dim2) {
  // Contiguous allocation of 2D arrays
  // Referenced from:
  // http://www.trevorsimonton.com/blog/2016/11/16/transfer-2d-array-memory-to-cuda.html
  // and
  // https://dev.to/drakargx/c-contiguous-allocation-of-2-d-arrays-446m
  // with error correction i=0 to i=1


  pDouble = new real_sim * [dim1];
  pDouble[0] = new real_sim[dim1 * dim2];
  for (int i = 1; i < dim1; i++) pDouble[i] = pDouble[i-1] + dim2;

  for (int i = 0; i < dim1; i++) {
    for (int j = 0; j < dim2;j++) {
      pDouble[i][j] = 0;
    }
  }
}
#define cudaCheckError(code)                                             \
  {                                                                      \
    if ((code) != cudaSuccess) {                                         \
      fprintf(stderr, "Cuda failure %s:%d: '%s' \n", __FILE__, __LINE__, \
              cudaGetErrorString(code));                                 \
    }                                                                    \
  }

__global__ void kernel_1d(float*da,float*db,float*dc)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
   int y = blockIdx.y * blockDim.y + threadIdx.y;
   int of = x * max + y;
   int z = 0;
   if (x < max && x>0 && y < max && y>0) {
       int a = 0;
       int b = 0;
      // dc[of] = 2*db[x * max + y ] * da[(x ) * max + y];
      if(x<10) a *= db[x * max + y] * da[(x)*max + y] + da[of];
      a*= db[x * max + y] * da[(x)*max + y] + da[of];
      b+= db[x * max + y] * da[(x)*max + y] - da[of];

       db[(x ) * max + y] = db[x * max + y ]* da[(x)*max + y] + a;
       dc[(x)*max + y] += 2*db[x * max + y] /b;

       // printf("block %d, thread %d, index (%d, %d)\n", blockIdx.x, threadIdx.x, x,
   }
   __syncthreads();
}

int main()

{
   real_sim** a;
   real_sim** b;
   real_sim** c;
   real_sim** cpu;//host array check device ouput..

   allocate_array_2d(a,max,max);//input array
   allocate_array_2d(b,max,max);//input array
   allocate_array_2d(c,max,max);//output array for device code
   allocate_array_2d(cpu, max, max);//output array for C code

    for (int i = 0; i < max; i++) {
        for (int j = 0; j < max; j++) {
            a[i][j] = j+1;
            b[i][j] = j+1;
            c[i][j] = 0;
            cpu[i][j] = 0;
        }
    }


    float* da;
    float* db;
    float* dc;
    int pixel_count = max * max;

    cudaCheckError(cudaMalloc(&da, pixel_count * sizeof(float)));
    cudaCheckError(cudaMalloc(&db, pixel_count * sizeof(float)));
    cudaCheckError(cudaMalloc(&dc, pixel_count * sizeof(float)));

    std::cout.precision(17);

    for (int k = 1; k < max; k++) {
        std::cout << "  " << k  << "\n";

        cudaCheckError(cudaMemcpy(da, a[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
        cudaCheckError(cudaMemcpy(db, b[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
        cudaCheckError(cudaMemcpy(dc, c[0], pixel_count * sizeof(float), cudaMemcpyHostToDevice));
        //  calculation for C code.........................
        dim3 thread(32, 32);
        dim3 block((max+31)/32, (max + 31) / 32);
        kernel_1d << <block, thread >> > (da, db, dc);
        cudaCheckError(cudaDeviceSynchronize());
        // copy value to host for result comparison...........
        cudaCheckError(cudaMemcpy(c[0], dc, pixel_count * sizeof(float), cudaMemcpyDeviceToHost));


        //  calculation for C code.........................

        for (int i = 1; i < max; i++) {
            for (int j = 1; j < max; j++) {
                int a1 = 0, b1 = 0;
                if (i < 10) a1 *= b[i][j] * a[i][j] + a[i][j];
                a1 *= b[i][j] * a[i][j] + a[i][j];
                b1 += b[i][j] * a[i][j] - a[i][j];

                b[i][j] = b[i][j] * a[i][j] + a1;
                cpu[i][j] += 2 * b[i][j] / b1;
            }
        }

        ///Compairing CPU and GPU code results..........................

        for (int i = 0; i < max; i++) {
            for (int j = 0; j < max; j++) {


                if (c[i][j] != cpu[i][j]){
                    std::cout << "BREAK DUE TO MIS MATCH OF GPU AND CPU RESULTS" << "\n";
                    std::cout <<"i= "<< i << " j=  " << j << "\n";//index
                    std::cout <<"c[i][j]="<< c[i][j] << "  cpu[i][j]= " << cpu[i][j] << "\n";//values

                    exit(0);//exiting when output do not match.............. 
                }
            }
        }
        std::cout << "\nSUCESS" << "\n";
    }


}
...