Я очень плохо знаком с параллельным программированием. Я работал над проектом класса и должен реализовать гибридную модель, используя openmp и opena cc для вычисления дискретизированного двумерного уравнения Лапласа путем вычисления части строк в процессоре, а остальных - в графическом процессоре.
Компиляция прошла успешно, однако я получаю сообщение об ошибке «FATAL ERROR: переменная в предложении данных частично присутствует на устройстве: name = Tnew».
#include <omp.h>
#include <openacc.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <iostream>
#include <chrono>
#include <iomanip>
using namespace std;
int main(int argc, char *argv[])
//Total size of the grid
int grid_size = atoi(argv[1]);
// a variable to determine the row to split the entire grid between CPU and GPU
int split = atoi(argv[2]);
double * T = new double[(grid_size+2)*(grid_size+2)];
double * Tnew = new double[(grid_size+2)*(grid_size+2)];
double tol = 1e-5;
int nthreads = atoi(argv[3]);
cout << "Grid size is " << grid_size << "number of threads " << nthreads << endl;
//Initialize arrays
for (int i=0; i<grid_size+2; ++i) {
for (int j=0; j<grid_size+2; ++j) {
T[i*(grid_size+2) + j] = 0;
if (0 == i && 0 != j && grid_size+1 != j) { T[i*(grid_size+2) + j] = 100; }
else if (grid_size+1 == i) T[i*(grid_size+2) + j] = 0;
else if (0 == j && 0 != i && grid_size+1 != i) { T[i*(grid_size+2) + j] = 75; }
else if (grid_size+1 == j && 0 != i && grid_size+1 != i) { T[i*(grid_size+2) + j] = 50; }
//Print out array
if (grid_size <= 20) {
for (int i=0; i<grid_size+2; ++i) {
for (int j=0; j<grid_size+2; ++j) {
cout << T[i*(grid_size+2) + j] << '\t';
cout << endl;
double calc_time = omp_get_wtime();
#pragma omp parallel
int tid = omp_get_thread_num();
/* Select the last thread to interact with gpu. Push the contents of array T beggining
from the split location till the end to the gpu
int iteration = 0;
double error = 1.0;
// Copy Rows of T begining from a row before split location till end and copyout split location till the end.
#pragma acc enter data copyin(T[split*(grid_size+2):(grid_size+2)*(grid_size+1)]) create(Tnew[split*(grid_size+2):(grid_size+2)*(grid_size+1)])
while (error > tol && iteration <3000)
error = 0.0;
#pragma acc loop independent reduction(+:error)
for(int a = split+1; a < grid_size+1; a++){
for(int b = 1; b < grid_size+1; b++){
Tnew[a*(grid_size+2)+b] = 0.25 * (T[(a-1)*(grid_size+2)+b]
//error = fabs(Tnew[a*(grid_size+2)+b]-T[a*(grid_size+2)+b]);
error = fmaxf(error,fabs(Tnew[a*(grid_size+2)+b]-T[a*(grid_size+2)+b]));
#pragma acc loop independent
for(int ai = split+1; ai < grid_size+1; ai++){
for(int bi = 1; bi < grid_size + 1; bi++){
T[ai*(grid_size+2)+bi] = Tnew[ai*(grid_size+2)+bi];
// Update the gpu's boundary row in main memory
#pragma acc update self(T[(split+1)*(grid_size+2):((split+1)*(grid_size+2)+ grid_size)])
// Update the threads boundary row in GPU
#pragma acc update device(T[(split)*(grid_size+2):(split*(grid_size+2)+ grid_size)])
#pragma acc exit data copyout(T[(split+1)*(grid_size+2):(grid_size+2)*(grid_size+1)])
cout << "GPU Portion Completed" << iteration << " Iterations" << endl;
// The first N rows until the split location gets computed by the rest of omp threads
double error = 1.0;
int iteration = 0;
while (error > tol && iteration <3000) {
#pragma omp for collapse(2) nowait
//#pragma acc kernels
for(int a = 1; a < split+1; a++){
for(int b = 1; b < grid_size+1; b++){
Tnew[a*(grid_size+2)+b] = 0.25 * (T[(a-1)*(grid_size+2)+b]
error = fmaxf(error,fabs(Tnew[a*(grid_size+2)+b]-T[a*(grid_size+2)+b]));
#pragma omp for collapse(2) nowait
for(int ai = 1; ai < split+1; ai++){
for(int bi = 1; bi < grid_size + 1; bi++){
T[ai*(grid_size+2)+bi] = Tnew[ai*(grid_size+2)+bi];
calc_time = omp_get_wtime() - calc_time;
cout << "calc time " << calc_time << endl;
if (grid_size <= 20) {
for (int i=0; i<grid_size+2; ++i) {
for (int j=0; j<grid_size+2; ++j) {
cout << setprecision(5) << T[i*(grid_size+2) + j] << '\t';
cout << endl;
delete [] T;
delete [] Tnew;
Ниже приведено сообщение, которое я получаю при компиляции
pgc++ -mp -acc -Minfo mixed_omp_acc.cpp -o omp_acc
7, include "iostream"
35, include "iostream"
4, include "ostream"
38, include "ios"
44, include "basic_ios.h"
53, Parallel region activated
128, Parallel region terminated
64, Generating copyout(T[(grid_size+1)*(split+1):(grid_size+2)*(grid_size+1)]) [if not already present]
Generating create(iteration) [if not already present]
Generating copyin(tol) [if not already present]
Generating create(Tnew[split:(grid_size+2)*(grid_size+1)]) [if not already present]
Generating copyout(T[(grid_size+2)*split:(grid_size+2)*(grid_size+1)]) [if not already present]
Generating copyin(error) [if not already present]
94, Generating update self(T[(grid_size+2)*(split+1):(grid_size+2)*(grid_size+1)])
Generating update device(T[(grid_size+2)*split:(grid_size+2)*(grid_size+1)])
106, Parallel loop activated with static block schedule
114, Barrier
118, Parallel loop activated with static block schedule
122, Barrier
Ниже приведены ошибки, которые я получаю при запуске.
Первый аргумент - размер сетки, второй - индекс деления между openmp и opena cc, а третий - число нитей процессора. Я попытался назначить последний поток процессора для взаимодействия с графическим процессором.
T lives at 0x8cc130 size 3696 partially present
Present table dump for device[1]: NVIDIA Tesla GPU 0, compute capability 7.0, threadid=1
host:0x8cc080 device:0x7f09b3afa000 size:3696 presentcount:0+1 line:69 name:T
host:0x8ccfb0 device:0x7f09b3afb000 size:3696 presentcount:0+1 line:69 name:Tnew
allocated block device:0x7f09b3afa000 size:4096 thread:1
allocated block device:0x7f09b3afb000 size:4096 thread:1
FATAL ERROR: variable in data clause is partially present on the device: name=T
******* mixed_omp_acc.cpp main_1F252L55 line:106