Мне нужно несколько советов, как избавиться от зависимостей по данным в трёх циклах.
Я пробовал OpenMP и OpenACC, чтобы увидеть сообщения компилятора, но мне не удалось устранить указанные в них проблемы.
for (int c = 0; c < oCr; c++) // 248, Loop carried reuse of mx-> prevents parallelization // #pragma acc loop seq
for (int i = 0; i < sir[c]; i++) // 249, Loop carried dependence of mx-> prevents parallelization Loop carried backward dependence of mx-> prevents vectorization // #pragma acc loop seq
for (int v = 1; v <= p; v++) // 250, Loop is parallelizable // #pragma acc loop vector(128) /* threadIdx.x */
if (flat2(vj3, p1, b2, b1, ls6j, v))
mx[v - 1] = flat5(ct7, e4, e3, e2, e1, a5, a4, a3, a2, a1, ls6j, c, 5, v, SmM);
int k = 0;
for (int i = ma; i >= mi; i--) // 275, Loop carried reuse of ti-> prevents parallelization Loop carried scalar dependence for k at line 279,278 // 275, #pragma acc loop seq
for (int c = 0; c < (p - nA); c++) // 276, Accelerator restriction: size of the GPU copy of ti,tm is unknown Loop carried reuse of ti-> prevents parallelization Loop carried scalar dependence for k at line 279,278 // 276, #pragma acc loop seq
if ((tm[c]) == i) { // 277
ti[k] = c; // 278
k++; // 279
}
for (int v = 1; v <= p; v++) // 287, omp Loop not vectorized: data dependency Generated 1 prefetches in scalar loop // 287 acc Loop carried reuse of px-> prevents parallelization 287, #pragma acc loop seq
for (int s = 0; s < os; s++) // 288, omp Loop unrolled 4 times (completely unrolled) // acc 288, Accelerator restriction: size of the GPU copy of px is unknown Complex loop carried dependence of px-> prevents parallelization Loop carried reuse of px-> prevents parallelization 288, #pragma acc loop seq
if (flat2(ve1, f1, l2, l1, ls6j, s) == ix[v - 1] + 1) {
int xx;
#pragma omp atomic capture
xx = x++;
flat2(px, os*ol, 0, 0, 0, xx) = v;
flat2(px, os*ol, 0, 0, 1, xx) = ls6j;
}
flat2 и flat5 — макросы для вычисления индексов в «плоских» массивах:
// Index helpers for multi-dimensional data stored in a flat array.
//
// Naming: iN is the index along dimension N, aN is the base (first valid
// index) of dimension N, and (eN - aN) is used as the extent of dimension N.
// Dimension 1 is the fastest-varying one. Both macros expand to an lvalue,
// so they can be read from and assigned to.
//
// Every parameter is parenthesized so that expression arguments (for example
// a base written as `lo + 1`) are not re-associated by operator precedence
// inside the expansion.

// flat2: element (i2, i1) of m0; the row stride is (e1 - a1).
#define flat2(m0, e1, a2, a1, i2, i1) \
    (m0)[((i2) - (a2)) * ((e1) - (a1)) + ((i1) - (a1))]

// flat5: element (i5, i4, i3, i2, i1) of m0. Written in nested (Horner) form,
// which yields the same index as the fully expanded sum of products while
// using each extent only once.
#define flat5(m0, e4, e3, e2, e1, a5, a4, a3, a2, a1, i5, i4, i3, i2, i1) \
    (m0)[(((((i5) - (a5)) * ((e4) - (a4)) + ((i4) - (a4))) * ((e3) - (a3)) + ((i3) - (a3))) * ((e2) - (a2)) + ((i2) - (a2))) * ((e1) - (a1)) + ((i1) - (a1))]
Спасибо.