Невозможно найти среднее и переменную в одном коде ядра - PullRequest
0 голосов
/ 08 ноября 2010

Уважаемые ученые,
Я не могу реализовать среднее и переменную в одном вызове ядра.

Цель: мне нужно найти среднее и переменную подматриц в матрице.поэтому я написал следующие функции ядра

:

<code>
    <strong>global</strong> void kernelMean(d_a, d_mean, ...);
    <strong>global</strong> void kernelVar(d_a, d_var, ...);
    <strong>global</strong> void kernelMeanVar(d_a, d_mean, d_var,...);

Проблема: а) если я по отдельности вычисляю kernelMean и kernelVar, то все работает нормально; б) если я хочу вычислить kernelMeanVar в одномЯдро не работает.

Ниже приведен мой код, пожалуйста, сообщите мне о возможных ошибках.

Заранее благодарю.

С уважением,
Нагараджу


<strong>global</strong> void kernelMean(float* device_array, float* device_mean, int globalRows, int globalCols,int localRows,int localCols, int numRowElts, int total_num_threads)
{
      int block_id = blockIdx.x + gridDim.x * blockIdx.y;
      int thread_id = (blockDim.x*blockDim.y)* block_id + threadIdx.x;
      int my_rowIdx = thread_id/globalCols ;
      int my_colIdx = thread_id%globalCols ;
      int i,j;
      float temp;
      float sum = 0;
      float sumsq = 0.0;
      float mean;
      float ltotal_elts = (float) (localRows*localCols);</p>

<pre><code>  device_mean[thread_id] = 0;
  if(thread_id <total_num_threads)
  {
    for(i=0; i < localRows ; i++)
    {
      for(j=0 ; j < localCols; j++)
      {
        temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
        sumsq = sumsq + (temp*temp);
        sum = sum + temp;
      }
    }
    mean = sum/ltotal_elts;
    device_mean[thread_id] = mean;

}} global void kernelVar (float * device_array, float * device_var, int globalRows, int globalCols, int localRows, int localCols, int numRowElts, int total_num_threads) {int block_id = blockIdx.x + gridDim.x * blockIdx.y;int thread_id = (blockDim.x * blockDim.y) * block_id + threadIdx.x;int my_rowIdx = thread_id / globalCols;int my_colIdx = thread_id% globalCols;int i, j;плавать темп;сумма с плавающей запятой = 0;float sumsq = 0;среднее значение с плавающей запятой = 0;float var = 0;float ltotal_elts = (float) localRows * localCols;

  device_var[thread_id] = 0;
  if(thread_id < total_num_threads)
  {
    for(i=0; i < localRows ; i++)
    {
      for(j=0 ; j < localCols; j++)
      {
        temp = device_array[(i+ my_rowIdx)*numRowElts + (j+ my_colIdx)];
        sum = sum + temp;
        sumsq = sumsq + (temp*temp);
      }
    }
    mean = sum/ltotal_elts;
    device_var[thread_id]  = (sumsq/ltotal_elts) - (mean*mean);
  }
}

global void kernelMeanVar (float * device_array, float * device_mean, float * device_var, int globalRows, int globalCols, int localRows,int localCols, int numRowElts, int total_num_threads) {int block_id = blockIdx.x + gridDim.x * blockIdx.y;int thread_id = (blockDim.x * blockDim.y) * block_id + threadIdx.x;int my_rowIdx = thread_id / globalCols;int my_colIdx = thread_id% globalCols;int i, j;плавать темп;сумма с плавающей запятой = 0;число с плавающей запятой = 0,0;среднее значение с плавающей точкой;float ltotal_elts = (float) (localRows * localCols);device_mean [thread_id] = 0;device_var [thread_id] = 0;if (thread_id

Функции вызова ядра

<code>
void convertToFloat(float** float_ary, double* double_ary, int num_elts)
{
  for(int i = 0; i < num_elts ; i++)
  {
    (*float_ary)[i] = (float) double_ary[i];
     //printf("float_ary[%d] : %f \n", i, (*float_ary)[i]);
  }
  return;
}
void convertToDouble(double** double_ary, float* float_ary, int num_elts)
{
  for(int i = 0; i < num_elts ; i++)
  {
    (*double_ary)[i] = (double) float_ary[i];
  }
  return;
}
void computeMeanAndVarArray(double* host_array, int num_elts, int globalRows, int globalCols, int localRows, int localCols, int numRowElts, double** mean_ary, double** var_ary)
{
  float* host_array_float;
  float* device_array;
  float* host_mean;
  float* host_var;
  float* device_mean;
  float* device_var;
  double total_bytes =0;
  host_array_float = (float*) malloc (num_elts*sizeof(float));
  convertToFloat(&host_array_float, host_array, num_elts);
  //printf("num_elts %d \n", num_elts);
  cudaMalloc((void**) &device_array, sizeof(float)* num_elts);
  cudaMemset(device_array, 0, sizeof(float)* num_elts);
  cudaMemcpy(device_array, host_array_float,sizeof(float)* num_elts, cudaMemcpyHostToDevice);
  int numBlockThreads = MAX_THREADS_PER_BLOCK;
  int num_blocks = 0;
  int remain_elts = 0;
  int total_num_threads = globalRows * globalCols;
  cudaMalloc((void**) &device_mean, sizeof(float)* total_num_threads);
  cudaMemset(device_mean, 0, sizeof(float)* total_num_threads);
  cudaMalloc((void**) &device_var, sizeof(float)* total_num_threads);
  cudaMemset(device_var, 0, sizeof(float)* total_num_threads);
  num_blocks  =  total_num_threads/numBlockThreads;
  remain_elts =  total_num_threads%numBlockThreads;
  if(remain_elts > 0)
  {
    num_blocks++;
  }
  dim3 gridDim(num_blocks,1);
  dim3 blockDim(numBlockThreads,1);
  //kernelMean<<< gridDim,blockDim >>>(device_array, device_mean,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
  //kernelVar<<< gridDim,blockDim >>>(device_array, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
  kernelMeanVar<<< gridDim,blockDim >>>(device_array, device_mean, device_var,globalRows, globalCols, localRows,localCols, numRowElts, total_num_threads);
  host_mean = (float*) malloc( sizeof(float) * total_num_threads);
  memset(host_mean, 0, sizeof(float) * total_num_threads);
  host_var = (float*) malloc( sizeof(float) * total_num_threads);
  memset(host_var, 0, sizeof(float) * total_num_threads);
  cudaThreadSynchronize();
  cudaError_t error = cudaGetLastError();
   //if(error!=cudaSuccess) {
      printf("ERROR: %s\n", cudaGetErrorString(error) );
   //}
  cudaMemcpy(host_mean, device_mean, sizeof(float)*total_num_threads, cudaMemcpyDeviceToHost);
  convertToDouble(mean_ary, host_mean, total_num_threads);
  cudaMemcpy(host_var, device_var, sizeof(float)*total_num_threads,     cudaMemcpyDeviceToHost);
  for(int i = 0 ; i < 300 ; i++)
    printf("host_var[%d] %f \n",i, host_var[i]);
  convertToDouble(var_ary, host_var, total_num_threads);
  cudaFree(device_array);
  cudaFree(device_mean);
  cudaFree(device_var);
  free(host_mean);
  free(host_var);
  free(host_array_float);
}

Результаты с включением

<code>
 <strong>global</strong> void kernelMean(d_a, d_mean, ...);
 <strong>global</strong> void kernelVar(d_a, d_var, ...);</p>

<p>ERROR: no error
host_var[0] 4.497070
host_var[1] 5.061768
host_var[2] 5.687500
host_var[3] 6.347534
host_var[4] 6.829102
host_var[5] 12.940308
host_var[6] 14.309937
host_var[7] 15.141113
host_var[8] 18.741577
host_var[9] 21.323608
host_var[10] 21.727417
host_var[11] 192.348389
host_var[12] 579.911621
host_var[13] 800.821045
host_var[14] 1071.960938
host_var[15] 1077.261719
host_var[16] 993.262207
host_var[17] 924.379883
host_var[18] 839.437012
host_var[19] 810.847656
host_var[20] 835.007813
host_var[21] 1124.365723
host_var[22] 1241.685547
host_var[23] 1376.504150
host_var[24] 1196.745850
host_var[25] 1097.473877
host_var[26] 1008.840088
host_var[27] 867.585083
host_var[28] 794.241699
host_var[29] 1322.409790
host_var[30] 1556.029785
host_var[31] 1564.997803
host_var[32] 1870.985840
host_var[33] 1929.829590
host_var[34] 1822.189453
host_var[35] 1662.321777
host_var[36] 1372.886719
host_var[37] 1074.727539
host_var[38] 833.003906
host_var[39] 632.514648
host_var[40] 380.227539
host_var[41] 87.345703
host_var[42] 82.544922
host_var[43] 78.756836
host_var[44] 68.541016
host_var[45] 61.981445
host_var[46] 60.413086
host_var[47] 60.128906
host_var[48] 59.767578
host_var[49] 59.223633
host_var[50] 56.569336
host_var[51] 53.866211
host_var[52] 51.186523
host_var[53] 55.270508
host_var[54] 59.956055
host_var[55] 66.516602
host_var[56] 70.348633
host_var[57] 71.706055
host_var[58] 70.494141
host_var[59] 69.897461
host_var[60] 66.286133
host_var[61] 67.926758
host_var[62] 160.753906
host_var[63] 447.221191
host_var[64] 831.740723
host_var[65] 1076.513672
host_var[66] 1193.666992
host_var[67] 1208.239746
host_var[68] 1126.845947
host_var[69] 948.397461
host_var[70] 669.399414
host_var[71] 340.465576
host_var[72] 67.161865
host_var[73] 7.421082
host_var[74] 5.485626
host_var[75] 5.135620
host_var[76] 3.460419
host_var[77] 3.853577
host_var[78] 5.221100
host_var[79] 5.890381
host_var[80] 7.139618
host_var[81] 7.517609
host_var[82] 6.865875
host_var[83] 5.053909
host_var[84] 2.781616
host_var[85] 2.021912
host_var[86] 2.130417
host_var[87] 3.113586
host_var[88] 4.024399
host_var[89] 4.582413
host_var[90] 4.077118
host_var[91] 3.024384
host_var[92] 2.287506
host_var[93] 1.793579
host_var[94] 1.567474
host_var[95] 1.829895
host_var[96] 2.325928
host_var[97] 3.429993
host_var[98] 3.885559
host_var[99] 3.835602
host_var[100] 5.566406
host_var[101] 8.065582
host_var[102] 18.767456
host_var[103] 35.395599
host_var[104] 64.148407
host_var[105] 125.937866
host_var[106] 176.445618
host_var[107] 216.073059
host_var[108] 272.109985
host_var[109] 307.972412
host_var[110] 289.652344
host_var[111] 238.253662
host_var[112] 178.304932
host_var[113] 116.925049
host_var[114] 74.773926
host_var[115] 61.227295
host_var[116] 55.238525
host_var[117] 55.387451
host_var[118] 49.241699
host_var[119] 38.396240
host_var[120] 28.304932
host_var[121] 20.225342
host_var[122] 18.043457
host_var[123] 21.418457
host_var[124] 26.120117
host_var[125] 25.899414
host_var[126] 26.641602
host_var[127] 23.747437
host_var[128] 18.927368
host_var[129] 21.664307
host_var[130] 142.432373
host_var[131] 1575.141602
host_var[132] 2901.855957
host_var[133] 4195.149902
host_var[134] 5047.758789
host_var[135] 5450.164063
host_var[136] 5249.767578
host_var[137] 4577.365234
host_var[138] 3352.496094
host_var[139] 1641.593750
host_var[140] 352.242188
host_var[141] 224.824219
host_var[142] 194.578125
host_var[143] 178.875000
host_var[144] 175.148438
host_var[145] 174.117188
host_var[146] 172.707031
host_var[147] 169.578125
host_var[148] 176.308594
host_var[149] 181.968750
host_var[150] 191.507813
host_var[151] 198.500000
host_var[152] 206.824219
host_var[153] 213.273438
host_var[154] 220.312500
host_var[155] 218.859375
host_var[156] 213.941406
host_var[157] 205.474609
host_var[158] 190.722656
host_var[159] 178.414063
host_var[160] 169.302734
host_var[161] 3.750366
host_var[162] 4.333252
host_var[163] 4.901855
host_var[164] 5.527466
host_var[165] 6.201782
host_var[166] 11.921631
host_var[167] 14.135376
host_var[168] 14.885864
host_var[169] 19.083618
host_var[170] 21.290283
host_var[171] 21.415649
host_var[172] 209.747559
host_var[173] 580.304932
host_var[174] 800.949951
host_var[175] 1119.857422
host_var[176] 1129.382324
host_var[177] 1032.616211
host_var[178] 972.797363
host_var[179] 915.440918
host_var[180] 905.890137
host_var[181] 943.649902
host_var[182] 1207.445801
host_var[183] 1345.912109
host_var[184] 1478.704590
host_var[185] 1224.895508
host_var[186] 1105.403564
host_var[187] 1031.981201
host_var[188] 914.456421
host_var[189] 835.127441
host_var[190] 1320.454102
host_var[191] 1561.439941
host_var[192] 1599.149902
host_var[193] 1912.232910
host_var[194] 1993.473145
host_var[195] 1913.377441
host_var[196] 1784.035645
host_var[197] 1554.712891
host_var[198] 1244.698242
host_var[199] 926.668945</p>

<p>

Повторно с включением

<code>
global void kernelMeanVar(d_a, d_mean, d_var,...);</p>

<p>ERROR: no error
host_var[0] 0.000000
host_var[1] 0.000000
host_var[2] 0.000000
host_var[3] 0.000000
host_var[4] 0.000000
host_var[5] 0.000000
host_var[6] 0.000000
host_var[7] 0.000000
host_var[8] 0.000000
host_var[9] 0.000000
host_var[10] 0.000000
host_var[11] 0.000000
host_var[12] 0.000000
host_var[13] 0.000000
host_var[14] 0.000000
host_var[15] 0.000000
host_var[16] 0.000000
host_var[17] 0.000000
host_var[18] 0.000000
host_var[19] 0.000000
host_var[20] 0.000000
host_var[21] 0.000000
host_var[22] 0.000000
host_var[23] 0.000000
host_var[24] 0.000000
host_var[25] 0.000000
host_var[26] 0.000000
host_var[27] 0.000000
host_var[28] 0.000000
host_var[29] 0.000000
host_var[30] 0.000000
host_var[31] 0.000000
host_var[32] 0.000000
host_var[33] 0.000000
host_var[34] 0.000000
host_var[35] 0.000000
host_var[36] 0.000000
host_var[37] 0.000000
host_var[38] 0.000000
host_var[39] 0.000000
host_var[40] 0.000000
host_var[41] 0.000000
host_var[42] 0.000000
host_var[43] 0.000000
host_var[44] 0.000000
host_var[45] 0.000000
host_var[46] 0.000000
host_var[47] 0.000000
host_var[48] 0.000000
host_var[49] 0.000000
host_var[50] 0.000000
host_var[51] 0.000000
host_var[52] 0.000000
host_var[53] 0.000000
host_var[54] 0.000000
host_var[55] 0.000000
host_var[56] 0.000000
host_var[57] 0.000000
host_var[58] 0.000000
host_var[59] 0.000000
host_var[60] 0.000000
host_var[61] 0.000000
host_var[62] 0.000000
host_var[63] 0.000000
host_var[64] 0.000000
host_var[65] 0.000000
host_var[66] 0.000000
host_var[67] 0.000000
host_var[68] 0.000000
host_var[69] 0.000000
host_var[70] 0.000000
host_var[71] 0.000000
host_var[72] 0.000000
host_var[73] 0.000000
host_var[74] 0.000000
host_var[75] 0.000000
host_var[76] 0.000000
host_var[77] 0.000000
host_var[78] 0.000000
host_var[79] 0.000000
host_var[80] 0.000000
host_var[81] 0.000000
host_var[82] 0.000000
host_var[83] 0.000000
host_var[84] 0.000000
host_var[85] 0.000000
host_var[86] 0.000000
host_var[87] 0.000000
host_var[88] 0.000000
host_var[89] 0.000000
host_var[90] 0.000000
host_var[91] 0.000000
host_var[92] 0.000000
host_var[93] 0.000000
host_var[94] 0.000000
host_var[95] 0.000000
host_var[96] 0.000000
host_var[97] 0.000000
host_var[98] 0.000000
host_var[99] 0.000000
host_var[100] 0.000000
host_var[101] 0.000000
host_var[102] 0.000000
host_var[103] 0.000000
host_var[104] 0.000000
host_var[105] 0.000000
host_var[106] 0.000000
host_var[107] 0.000000
host_var[108] 0.000000
host_var[109] 0.000000
host_var[110] 0.000000
host_var[111] 0.000000
host_var[112] 0.000000
host_var[113] 0.000000
host_var[114] 0.000000
host_var[115] 0.000000
host_var[116] 0.000000
host_var[117] 0.000000
host_var[118] 0.000000
host_var[119] 0.000000
host_var[120] 0.000000
host_var[121] 0.000000
host_var[122] 0.000000
host_var[123] 0.000000
host_var[124] 0.000000
host_var[125] 0.000000
host_var[126] 0.000000
host_var[127] 0.000000
host_var[128] 18.927368
host_var[129] 21.664307
host_var[130] 142.432373
host_var[131] 1575.141602
host_var[132] 2901.855957
host_var[133] 4195.149902
host_var[134] 5047.758789
host_var[135] 5450.164063
host_var[136] 5249.767578
host_var[137] 4577.365234
host_var[138] 3352.496094
host_var[139] 1641.593750
host_var[140] 352.242188
host_var[141] 224.824219
host_var[142] 194.578125
host_var[143] 178.875000
host_var[144] 175.148438
host_var[145] 174.117188
host_var[146] 172.707031
host_var[147] 169.578125
host_var[148] 176.308594
host_var[149] 181.968750
host_var[150] 191.507813
host_var[151] 198.500000
host_var[152] 206.824219
host_var[153] 213.273438
host_var[154] 220.312500
host_var[155] 218.859375
host_var[156] 213.941406
host_var[157] 205.474609
host_var[158] 190.722656
host_var[159] 178.414063
host_var[160] 169.302734
host_var[161] 3.750366
host_var[162] 4.333252
host_var[163] 4.901855
host_var[164] 5.527466
host_var[165] 6.201782
host_var[166] 11.921631
host_var[167] 14.135376
host_var[168] 14.885864
host_var[169] 19.083618
host_var[170] 21.290283
host_var[171] 21.415649
host_var[172] 209.747559
host_var[173] 580.304932
host_var[174] 800.949951
host_var[175] 1119.857422
host_var[176] 1129.382324
host_var[177] 1032.616211
host_var[178] 972.797363
host_var[179] 915.440918
host_var[180] 905.890137
host_var[181] 943.649902
host_var[182] 1207.445801
host_var[183] 1345.912109
host_var[184] 1478.704590
host_var[185] 1224.895508
host_var[186] 1105.403564
host_var[187] 1031.981201
host_var[188] 914.456421
host_var[189] 835.127441
host_var[190] 1320.454102
host_var[191] 1561.439941
host_var[192] 1599.149902
host_var[193] 1912.232910
host_var[194] 1993.473145
host_var[195] 1913.377441
host_var[196] 1784.035645
host_var[197] 1554.712891
host_var[198] 1244.698242
host_var[199] 926.668945</p>

<p>

Конец результатов

1 Ответ

2 голосов
/ 08 ноября 2010

Предложения:

  • [Первый приоритет] Проверьте возвращаемые значения из функций CUDA, чтобы увидеть, сообщаются ли какие-либо ошибки.
  • Запустите это через cuda-memcheck. Я не уверен, какова связь между globalRows, globalCols, localRows, localCols, num_elts и т. Д., Но чтение за пределами границ кажется кандидатом для проблем.
  • Помните, что суммирование квадратов может привести к ошибкам округления довольно быстро, если вы не позаботитесь. Рассмотрите возможность использования скользящего среднего / дисперсии или сокращения на основе дерева.
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...