Здесь сразу несколько проблем.
Во-первых, на устройстве (GPU) нельзя выполнять строковые операции и файловый ввод-вывод. Этого не хватало в вашем исходном коде. Вам нужно переосмыслить способ сохранения результатов в файл. Пока что я просто отключил эту часть с помощью макроса, когда используется OpenACC.
Во-вторых, вы используете векторы. Использовать векторы можно, но это проблематично. Они не потокобезопасны и по сути представляют собой класс с тремя указателями. Директивы данных (data clauses) выполняют поверхностное копирование, поэтому копируются только указатели, а не данные, на которые они указывают. Хотя это и сложно, можно выполнить глубокое копирование вручную или использовать унифицированную память CUDA (-ta=tesla:managed), но я просто заменил векторы обычными динамически выделяемыми массивами, поскольку вам не обязательно, чтобы это были векторы.
Наконец, производительность будет очень низкой. «nx» равно всего 10, поэтому код сильно недогружает устройство. Чтобы увидеть реальную выгоду от графического процессора, nx должно составлять тысячи или сотни тысяч.
Кроме того, расход регистров довольно велик из-за большого числа локальных переменных. Высокий расход регистров приводит к низкой занятости (occupancy) устройства, что часто снижает производительность. Единственный способ помочь в этом — разбить большой цикл на несколько маленьких циклов, сохраняя промежуточные результаты в глобальных массивах. Впрочем, вам, возможно, в любом случае придётся сделать что-то подобное, чтобы решить проблему с файловым вводом-выводом.
% cat pcc.cpp
#include<iostream>
#include<math.h>
#include<fstream>
#include<cstdlib>
#include<iomanip>
#include<ctime>
#include<vector>
// Compile using: pgc++ -acc pcc.cpp -Minfo=accel -ta=tesla:cuda9.2
using namespace std;
// Alias kept from the vector-based version; in this file it is only referenced
// by a commented-out line inside main().
typedef std::vector<double> RealVector;

// ---- Forward declarations; all definitions follow main() ----

// Predictor polynomials. Arguments: (Q, V, A, J, dtao) and (V, A, J, dtao).
double Q_pred(double,double,double,double,double);
double V_pred(double,double,double,double);
// A_k: value main() uses as the "A" term of coordinate Qk.
// Arguments: (Q1, Q2, Q3, Q4, three of the V components, r12, E); which three
// V components are taken differs per function - see each definition.
double A_1(double,double,double,double,double,double,double,double,double);
double A_2(double,double,double,double,double,double,double,double,double);
double A_3(double,double,double,double,double,double,double,double,double);
double A_4(double,double,double,double,double,double,double,double,double);
// J_k: value main() uses as the "J" term of coordinate Qk.
// Arguments: (Q1, Q2, Q3, Q4, V1, V2, V3, V4, two A components, r12, dr12, E).
// J_1/J_2 take (A3, A4); J_3/J_4 take (A1, A2).
double J_1(double,double,double,double,double,double,double,double,double,double,double, double,double);
double J_2(double,double,double,double,double,double,double,double,double,double,double, double,double);
double J_3(double,double,double,double,double,double,double,double,double,double,double, double,double);
double J_4(double,double,double,double,double,double,double,double,double,double,double, double,double);
// Cartesian (x, y) -> (Qx, Qy); main() calls Qpos when x >= 0 and Qneg otherwise.
// Results are returned through the two reference parameters.
void Qpos(double&,double&,double,double);
void Qneg(double&,double&,double,double);
// Ri = sqrt(Qx^2 + Qy^2), returned through the reference parameter.
void R(double&,double,double);
// E from (x1, x2, y1, y2, px1, px2, py1, py2), returned through the reference parameter.
void Energ(double&,double,double,double,double,double,double,double,double);
// r12 from (Q1..Q4) and dr12 from (Q1..Q4, V1..V4), returned through the reference parameter.
void r_12(double&,double,double,double,double);
void dr_12(double&,double,double,double,double,double,double,double,double);
// Step size dtao from (Q1..Q4, V1..V4, A1..A4, J1..J4), returned through the reference parameter.
void dTao(double&,double,double,double,double,double,double,double,double,double,double,double,double,double,double,double,double);
// Starting x1 as a function of (x2, L); the two functions differ only in the
// sign in front of the square-root term. Only X1_neg is called in this file.
double X1_pos(double,double);
double X1_neg(double,double);
// Program entry point.
// For each of nx starting values of x2 it integrates the state (x1,y1,px1,py1) /
// (x2,y2,px2,py2) for n0 steps, storing every fac-th step in X1/Y1/X2/Y2/T.
// When compiled without OpenACC it then writes the stored samples to
// "electron1_neg_x20_<x2>.dat" and "electron2_neg_x20_<x2>.dat".
// Returns 0.
int main(){
// nx: number of outer-loop iterations (one per starting x2); n0: inner-loop steps
// per iteration; N: length of each sample array; fac = n0/N: sampling stride.
int n0,nx,N,fac;
double x1,x2,y1,y2,x2_ini,x2_fin;
double px1,px2,py1,py2;
// Un-suffixed Q/V/A/J hold the values at the start of a step; the "_" variants
// hold the predicted and then corrected values of the same step.
double Q1,Q2,Q3,Q4;
double V1,V2,V3,V4;
double A1,A2,A3,A4;
double J1,J2,J3,J4;
double Q1_,Q2_,Q3_,Q4_;
double V1_,V2_,V3_,V4_;
double A1_,A2_,A3_,A4_;
double J1_,J2_,J3_,J4_;
double P1,P2,P3,P4;
double dtao,R1,R2,r12,dr12,r12_,dr12_;
double dt,t,E,L;
// NOTE(review): points and graph are declared but never used in this function.
ofstream points,graph;
ofstream electron1,electron2;
// Text form of the starting x2; only assigned when _OPENACC is not defined.
string x20;
nx = 10;
n0 = 2097152;
N = 262144;
fac=n0/N;
x2_ini = 7.824;
x2_fin = 10.;
// Used below as py2 = L/x2 and as the second argument of X1_neg
// (presumably an angular momentum - TODO confirm).
L = 0.28284271247461900976;
// Former std::vector version, replaced by the raw arrays below:
// RealVector X1(N),Y1(N),X2(N),Y2(N),T(N),T_(N);
double *X1 = new double[N];
double *Y1 = new double[N];
double *X2 = new double[N];
double *Y2 = new double[N];
double *T = new double[N];
double *T_ = new double[N];
// NOTE(review): every j iteration writes the same elements X1[i/fac], Y1[i/fac], ...
// (the index does not depend on j), so with the outer loop parallelised the
// writes of different j overlap - confirm which trajectory is expected to survive.
// NOTE(review): T_ is listed in copyout but is only assigned inside the
// "#ifndef _OPENACC" section below, i.e. never in an OpenACC build.
#pragma acc parallel loop copyout(X1[:N],X2[:N],Y1[:N],Y2[:N],T[:N],T_[:N])
for(int j=0;j<nx;j++){
// Initial conditions of trajectory j: x2 is spaced evenly between x2_ini and
// x2_fin, x1 comes from X1_neg, and the only non-zero momentum is py2 = L/x2.
t= 0.;
x2 = x2_ini+abs(x2_ini-x2_fin)*j/(nx-1);
y2= 0.;
px2 = 0.;
py2 = L/x2;
x1 = X1_neg(x2,L);
y1 = 0.;
px1= 0.;
py1 = 0.;
// The string conversion is compiled only for non-OpenACC builds.
#ifndef _OPENACC
x20=to_string(x2);
#endif
// Time stepping: each step uses the state left by the previous one, hence seq.
#pragma acc loop seq
for(int i=0;i<n0;i++){
// Record the state at the start of every fac-th step.
if(i%fac==0){
T[i/fac]=t;
X1[i/fac]=x1;
Y1[i/fac]=y1;
X2[i/fac]=x2;
Y2[i/fac]=y2;
}
// Cartesian -> (Q1,Q2) and (Q3,Q4); the helper is selected by the sign of x.
if(x1>=0){
Qpos(Q1, Q2, x1, y1);
}else
Qneg(Q1, Q2, x1, y1);
if(x2>=0){
Qpos(Q3, Q4, x2, y2);
}else
Qneg(Q3, Q4, x2, y2);
// Momenta expressed in the Q coordinates.
P1=2.*(px1*Q1+py1*Q2);
P2=2.*(py1*Q1-px1*Q2);
P3=2.*(px2*Q3+py2*Q4);
P4=2.*(py2*Q3-px2*Q4);
R(R1,Q1,Q2);
R(R2,Q3,Q4);
// E is recomputed from the Cartesian state at the start of every step.
Energ(E,x1,x2,y1,y2,px1,px2,py1,py2);
// V of one pair is scaled by the squared norm of the other pair's Q.
V1=(Q3*Q3+Q4*Q4)*P1/4.;
V2=(Q3*Q3+Q4*Q4)*P2/4.;
V3=(Q1*Q1+Q2*Q2)*P3/4.;
V4=(Q1*Q1+Q2*Q2)*P4/4.;
r_12(r12,Q1,Q2,Q3,Q4);
dr_12(dr12,Q1,Q2,Q3,Q4,V1,V2,V3,V4);
// A and J terms at the start of the step, then the step size dtao.
A1=A_1(Q1,Q2,Q3,Q4,V1,V3,V4,r12,E);
A2=A_2(Q1,Q2,Q3,Q4,V2,V3,V4,r12,E);
A3=A_3(Q1,Q2,Q3,Q4,V1,V2,V3,r12,E);
A4=A_4(Q1,Q2,Q3,Q4,V1,V2,V4,r12,E);
J1=J_1(Q1,Q2,Q3,Q4,V1,V2,V3,V4,A3,A4,r12,dr12,E);
J2=J_2(Q1,Q2,Q3,Q4,V1,V2,V3,V4,A3,A4,r12,dr12,E);
J3=J_3(Q1,Q2,Q3,Q4,V1,V2,V3,V4,A1,A2,r12,dr12,E);
J4=J_4(Q1,Q2,Q3,Q4,V1,V2,V3,V4,A1,A2,r12,dr12,E);
dTao(dtao,Q1,Q2,Q3,Q4,V1,V2,V3,V4,A1,A2,A3,A4,J1,J2,J3,J4);
// Predictor: polynomial extrapolation of Q and V over dtao.
Q1_=Q_pred(Q1,V1,A1,J1,dtao);
Q2_=Q_pred(Q2,V2,A2,J2,dtao);
Q3_=Q_pred(Q3,V3,A3,J3,dtao);
Q4_=Q_pred(Q4,V4,A4,J4,dtao);
V1_=V_pred(V1,A1,J1,dtao);
V2_=V_pred(V2,A2,J2,dtao);
V3_=V_pred(V3,A3,J3,dtao);
V4_=V_pred(V4,A4,J4,dtao);
// Re-evaluate r12, dr12, A and J at the predicted state (E is left unchanged).
r_12(r12_,Q1_,Q2_,Q3_,Q4_);
dr_12(dr12_,Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_);
A1_=A_1(Q1_,Q2_,Q3_,Q4_,V1_,V3_,V4_,r12_,E);
A2_=A_2(Q1_,Q2_,Q3_,Q4_,V2_,V3_,V4_,r12_,E);
A3_=A_3(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,r12_,E);
A4_=A_4(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V4_,r12_,E);
J1_=J_1(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_,A3_,A4_,r12_,dr12_,E);
J2_=J_2(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_,A3_,A4_,r12_,dr12_,E);
J3_=J_3(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_,A1_,A2_,r12_,dr12_,E);
J4_=J_4(Q1_,Q2_,Q3_,Q4_,V1_,V2_,V3_,V4_,A1_,A2_,r12_,dr12_,E);
// Corrector: combine start-of-step and predicted-point A/J values. V is corrected
// first because the Q correction uses the corrected V.
V1_=V1+(A1_+A1)*dtao/2.-(J1_-J1)*dtao*dtao/12.;
V2_=V2+(A2_+A2)*dtao/2.-(J2_-J2)*dtao*dtao/12.;
V3_=V3+(A3_+A3)*dtao/2.-(J3_-J3)*dtao*dtao/12.;
V4_=V4+(A4_+A4)*dtao/2.-(J4_-J4)*dtao*dtao/12.;
Q1_=Q1+(V1_+V1)*dtao/2.-(A1_-A1)*dtao*dtao/10.+(J1_+J1)*dtao*dtao*dtao/120.;
Q2_=Q2+(V2_+V2)*dtao/2.-(A2_-A2)*dtao*dtao/10.+(J2_+J2)*dtao*dtao*dtao/120.;
Q3_=Q3+(V3_+V3)*dtao/2.-(A3_-A3)*dtao*dtao/10.+(J3_+J3)*dtao*dtao*dtao/120.;
Q4_=Q4+(V4_+V4)*dtao/2.-(A4_-A4)*dtao*dtao/10.+(J4_+J4)*dtao*dtao*dtao/120.;
// Accept the corrected values as the new state.
Q1=Q1_;
Q2=Q2_;
Q3=Q3_;
Q4=Q4_;
V1=V1_;
V2=V2_;
V3=V3_;
V4=V4_;
// Inverse of the V = |Q_other|^2 * P / 4 relation used above.
P1=4.*V1/(Q3*Q3+Q4*Q4);
P2=4.*V2/(Q3*Q3+Q4*Q4);
P3=4.*V3/(Q1*Q1+Q2*Q2);
P4=4.*V4/(Q1*Q1+Q2*Q2);
// Elapsed time for this step; R1 and R2 are the values computed at the start
// of the step (before Q was updated).
dt=R1*R1*R2*R2*dtao;
t=t+dt;
// Back to Cartesian coordinates and momenta for the next step.
x1=Q1*Q1-Q2*Q2;
x2=Q3*Q3-Q4*Q4;
y1=2.*Q1*Q2;
y2=2.*Q3*Q4;
px1=(Q1*P1-Q2*P2)/(2.*(Q1*Q1+Q2*Q2));
py1=(Q2*P1+Q1*P2)/(2.*(Q1*Q1+Q2*Q2));
px2=(Q3*P3-Q4*P4)/(2.*(Q3*Q3+Q4*Q4));
py2=(Q4*P3+Q3*P4)/(2.*(Q3*Q3+Q4*Q4));
}
// From here on dt is the total elapsed time divided into N-1 equal intervals.
dt=t/(N-1);
// Host-only output: string handling and file I/O are compiled out for OpenACC.
#ifndef _OPENACC
// NOTE(review): the files pair the evenly spaced T_[i] with samples that were
// recorded at times T[i]; T itself is never written out. Confirm that the
// per-step dt above is uniform enough for this to be intended.
for(int i=0; i<N; i++)
T_[i]=i*dt;
electron1.open(("electron1_neg_x20_"+x20+".dat").c_str());
electron2.open(("electron2_neg_x20_"+x20+".dat").c_str());
for(int i=0; i<N ;i++){
electron1<<T_[i]<<" "<<X1[i]<<" "<<Y1[i]<<endl;
electron2<<T_[i]<<" "<<X2[i]<<" "<<Y2[i]<<endl;
}
electron1.close();
electron2.close();
#endif
}
// Release the sample buffers allocated with new[] above.
delete [] X1;
delete [] X2;
delete [] Y1;
delete [] Y2;
delete [] T;
delete [] T_;
return 0;
}
// Cubic extrapolation of a coordinate over a step dtao:
//   Q + V*dtao + A*dtao^2/2 + J*dtao^3/6
// The terms are summed left to right, lowest order first.
double Q_pred(double Q, double V, double A, double J, double dtao){
    const double firstOrder = V*dtao;
    const double secondOrder = A*dtao*dtao/2.;
    const double thirdOrder = J*dtao*dtao*dtao/6.;
    return Q+firstOrder+secondOrder+thirdOrder;
}
// Quadratic extrapolation of V over a step dtao:
//   V + A*dtao + J*dtao^2/2
double V_pred(double V, double A, double J, double dtao){
    const double firstOrder = A*dtao;
    const double secondOrder = J*dtao*dtao/2.;
    return V+firstOrder+secondOrder;
}
// "A" term for coordinate Q1 (main() feeds it to Q_pred/V_pred as the A argument).
// Parameters: Q1..Q4 coordinates, V1/V3/V4 the V components that enter the
// formula, r12 as produced by r_12(), E as produced by Energ().
// Repeated sub-expressions are named; the arithmetic and its order are unchanged.
double A_1(double Q1, double Q2, double Q3, double Q4, double V1, double V3, double V4,double r12, double E){
    const double rho1 = Q1*Q1+Q2*Q2;   // squared norm of (Q1,Q2)
    const double rho2 = Q3*Q3+Q4*Q4;   // squared norm of (Q3,Q4)
    const double qv2 = Q3*V3+Q4*V4;
    const double vv2 = V3*V3+V4*V4;
    const double shifted = -E+1./r12;
    const double cubic = rho1*Q1+(Q4*Q4-Q3*Q3)*Q1-2.*Q2*Q3*Q4;
    const double bracket = 4.*Q1*vv2/pow(rho1,2.)-4.*Q1+2.*Q1*rho2*shifted-2.*rho1*rho2*cubic/pow(r12,3.);
    return 2.*qv2*V1/rho2+rho2*(-bracket)/4.;
}
// "A" term for coordinate Q2 (main() feeds it to Q_pred/V_pred as the A argument).
// Parameters: Q1..Q4 coordinates, V2/V3/V4 the V components that enter the
// formula, r12 as produced by r_12(), E as produced by Energ().
// Repeated sub-expressions are named; the arithmetic and its order are unchanged.
double A_2(double Q1, double Q2, double Q3, double Q4, double V2, double V3, double V4,double r12, double E){
    const double rho1 = Q1*Q1+Q2*Q2;   // squared norm of (Q1,Q2)
    const double rho2 = Q3*Q3+Q4*Q4;   // squared norm of (Q3,Q4)
    const double qv2 = Q3*V3+Q4*V4;
    const double vv2 = V3*V3+V4*V4;
    const double shifted = -E+1./r12;
    const double cubic = rho1*Q2-(Q4*Q4-Q3*Q3)*Q2-2.*Q1*Q3*Q4;
    const double bracket = -4.*Q2*vv2/pow(rho1,2.)+4.*Q2-2.*Q2*rho2*shifted+2.*rho1*rho2*cubic/pow(r12,3.);
    return 2.*qv2*V2/rho2+rho2*bracket/4.;
}
// "A" term for coordinate Q3 (main() feeds it to Q_pred/V_pred as the A argument).
// Parameters: Q1..Q4 coordinates, V1/V2/V3 the V components that enter the
// formula, r12 as produced by r_12(), E as produced by Energ().
// Repeated sub-expressions are named; the arithmetic and its order are unchanged.
double A_3(double Q1, double Q2, double Q3, double Q4, double V1, double V2, double V3,double r12, double E){
    const double rho1 = Q1*Q1+Q2*Q2;   // squared norm of (Q1,Q2)
    const double rho2 = Q3*Q3+Q4*Q4;   // squared norm of (Q3,Q4)
    const double qv1 = Q1*V1+Q2*V2;
    const double vv1 = V1*V1+V2*V2;
    const double shifted = -E+1./r12;
    const double cubic = rho2*Q3+(Q2*Q2-Q1*Q1)*Q3-2.*Q1*Q2*Q4;
    const double bracket = -4.*Q3*vv1/pow(rho2,2.)+4.*Q3-2.*Q3*rho1*shifted+2.*rho1*rho2*cubic/pow(r12,3.);
    return 2.*qv1*V3/rho1+rho1*bracket/4.;
}
// "A" term for coordinate Q4 (main() feeds it to Q_pred/V_pred as the A argument).
// Parameters: Q1..Q4 coordinates, V1/V2/V4 the V components that enter the
// formula, r12 as produced by r_12(), E as produced by Energ().
// Repeated sub-expressions are named; the arithmetic and its order are unchanged.
double A_4(double Q1, double Q2, double Q3, double Q4, double V1, double V2, double V4,double r12, double E){
    const double rho1 = Q1*Q1+Q2*Q2;   // squared norm of (Q1,Q2)
    const double rho2 = Q3*Q3+Q4*Q4;   // squared norm of (Q3,Q4)
    const double qv1 = Q1*V1+Q2*V2;
    const double vv1 = V1*V1+V2*V2;
    const double shifted = -E+1./r12;
    const double cubic = rho2*Q4-(Q2*Q2-Q1*Q1)*Q4-2.*Q1*Q2*Q3;
    const double bracket = -4.*Q4*vv1/pow(rho2,2.)+4.*Q4-2.*Q4*rho1*shifted+2.*rho1*rho2*cubic/pow(r12,3.);
    return 2.*qv1*V4/rho1+rho1*bracket/4.;
}
// "J" term for coordinate Q1 (main() feeds it to Q_pred/V_pred as the J argument).
// Parameters: Q1..Q4 and V1..V4 (current state), A3/A4 (A terms of the other
// pair), r12 and dr12 as produced by r_12()/dr_12(), E as produced by Energ().
// The expression has three summands: a term proportional to V1, a term
// proportional to (Q3*V3+Q4*V4), and a (Q3^2+Q4^2)*(...)/4 term whose bracket
// contains 1/r12^2 and 1/r12^4 contributions.
// It presumably is the tau-derivative of A_1 - TODO confirm against the derivation.
double J_1(double Q1, double Q2, double Q3, double Q4, double V1,double V2, double V3, double V4, double A3, double A4, double r12,double dr12 ,double E){
return 2.*(V3*V3+V4*V4+Q3*A3+Q4*A4)*V1/(Q3*Q3+Q4*Q4)+(Q3*V3+Q4*V4)*(-(4.*Q1*(V3*V3+V4*V4)/pow(Q1*Q1+Q2*Q2,2.)-4.*Q1+2.*Q1*(Q3*Q3+Q4*Q4)*(-E+1./r12)-2.*(Q1*Q1+Q2*Q2)*(Q3*Q3+Q4*Q4)*((Q1*Q1+Q2*Q2)*Q1+(Q4*Q4-Q3*Q3)*Q1-2.*Q2*Q3*Q4)/pow(r12,3.)))+(Q3*Q3+Q4*Q4)*(4 *(V1 - ((V3*V3 + V4*V4) *V1)/pow(Q1*Q1 + Q2*Q2,2.) + (4 *Q1 *(V3*V3 + V4*V4) *(Q1 *V1 + Q2 *V2))/pow(Q1*Q1 + Q2*Q2,3.) - (2 *Q1 *(V3 *A3 + V4 *A4))/pow(Q1*Q1 + Q2*Q2,2.))+(1./(r12*r12))*(2 *r12 *(-1 + E *r12) *(Q3*Q3 *V1 +2 *Q1 *Q3 *V3 + Q4 *(Q4 *V1 + 2 *Q1 *V4)) + 2 *Q1 *(Q3*Q3 + Q4*Q4)* dr12)+ (1./(pow(r12,4)))*2 *(2 *(Q3*Q3 + Q4*Q4) *(Q1*Q1*Q1 - 2 *Q2 *Q3 *Q4 + Q1 *(Q2*Q2 - Q3*Q3 + Q4*Q4)) *r12 *(Q1 *V1 + Q2 *V2) + 2 *(Q1*Q1 + Q2*Q2) *(Q1*Q1*Q1 - 2 *Q2 *Q3 *Q4 + Q1 *(Q2*Q2 - Q3*Q3 + Q4*Q4)) *r12 *(Q3 *V3 + Q4 *V4) + (Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *r12 *(3 *Q1*Q1 *V1 + Q2*Q2 *V1 + (-Q3*Q3 + Q4*Q4) *V1 - 2 *Q3 *Q4 *V2 - 2 *Q2 *(Q4 *V3 + Q3 *V4) + 2 *Q1 *(Q2 *V2 - Q3 *V3 + Q4 *V4)) - 3 *(Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *(Q1*Q1*Q1 - 2 *Q2 *Q3 *Q4 + Q1 *(Q2*Q2 - Q3*Q3 + Q4*Q4))* dr12))/4.;
}
// "J" term for coordinate Q2 (main() feeds it to Q_pred/V_pred as the J argument).
// Parameters: Q1..Q4 and V1..V4 (current state), A3/A4 (A terms of the other
// pair), r12 and dr12 as produced by r_12()/dr_12(), E as produced by Energ().
// Same three-summand layout as J_1, with Q2/V2 taking the role of Q1/V1.
// It presumably is the tau-derivative of A_2 - TODO confirm against the derivation.
double J_2(double Q1, double Q2, double Q3, double Q4, double V1,double V2, double V3, double V4, double A3, double A4, double r12,double dr12 , double E){
return 2.*(V3*V3+V4*V4+Q3*A3+Q4*A4)*V2/(Q3*Q3+Q4*Q4)+(Q3*V3+Q4*V4)*(-(4.*Q2*(V3*V3+V4*V4)/pow(Q1*Q1+Q2*Q2,2.)-4.*Q2+2.*Q2*(Q3*Q3+Q4*Q4)*(-E+1./r12)-2.*(Q1*Q1+Q2*Q2)*(Q3*Q3+Q4*Q4)*((Q1*Q1+Q2*Q2)*Q2-(Q4*Q4-Q3*Q3)*Q2-2.*Q1*Q3*Q4)/pow(r12,3.)))+(Q3*Q3+Q4*Q4)*(4 *(V2 - ((V3*V3 + V4*V4) *V2)/pow(Q1*Q1 + Q2*Q2,2.) + (4 *Q2 *(V3*V3 + V4*V4) *(Q1 *V1 + Q2 *V2))/pow(Q1*Q1 + Q2*Q2,3.) - ( 2 *Q2 *(V3 *A3 + V4 *A4))/pow(Q1*Q1 + Q2*Q2,2.))+(1./(r12*r12))*(2 *r12 *(-1 + E *r12) *(Q3*Q3 *V2 + 2 *Q2 *Q3 *V3 + Q4 *(Q4 *V2 + 2 *Q2 *V4)) + 2 *Q2 *(Q3*Q3 + Q4*Q4) *dr12) + (1./(pow(r12,4)))*2 *(2 *(Q3*Q3 + Q4*Q4) *(Q2 *(Q1*Q1 + Q2*Q2 + Q3*Q3) - 2 *Q1 *Q3 *Q4 - Q2 *Q4*Q4) *r12 *(Q1 *V1 + Q2 *V2) + 2 *(Q1*Q1 + Q2*Q2) *(Q2 *(Q1*Q1 + Q2*Q2 + Q3*Q3) -2 *Q1 *Q3 *Q4 - Q2 *Q4*Q4) *r12 *(Q3 *V3 + Q4 *V4) + (Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *r12 *(Q1*Q1 *V2 + Q3*Q3 *V2 + (3 *Q2*Q2 - Q4*Q4) *V2 +Q3 *(-2 *Q4 *V1 + 2 *Q2 *V3) -2 *Q2 *Q4 *V4 + 2 *Q1 *(Q2 *V1 - Q4 *V3 - Q3 *V4)) - 3 *(Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *(Q2 *(Q1*Q1 + Q2*Q2 + Q3*Q3) - 2 *Q1 *Q3 *Q4 - Q2 *Q4*Q4)* dr12))/4.;
}
// "J" term for coordinate Q3 (main() feeds it to Q_pred/V_pred as the J argument).
// Parameters: Q1..Q4 and V1..V4 (current state), A1/A2 (A terms of the other
// pair), r12 and dr12 as produced by r_12()/dr_12(), E as produced by Energ().
// The expression has three summands: a term proportional to V3, a term
// proportional to (Q1*V1+Q2*V2), and a (Q1^2+Q2^2)*(...)/4 term whose bracket
// contains 1/r12^2 and 1/r12^4 contributions.
// It presumably is the tau-derivative of A_3 - TODO confirm against the derivation.
double J_3(double Q1, double Q2, double Q3, double Q4, double V1,double V2, double V3, double V4, double A1, double A2, double r12,double dr12 , double E){
return 2.*(V1*V1+V2*V2+Q1*A1+Q2*A2)*V3/(Q1*Q1+Q2*Q2)+(Q1*V1+Q2*V2)*(-(4.*Q3*(V1*V1+V2*V2)/pow(Q3*Q3+Q4*Q4,2.)-4.*Q3+2.*Q3*(Q1*Q1+Q2*Q2)*(-E+1./r12)-2.*(Q1*Q1+Q2*Q2)*(Q3*Q3+Q4*Q4)*((Q3*Q3+Q4*Q4)*Q3+(Q2*Q2-Q1*Q1)*Q3-2.*Q1*Q2*Q4)/pow(r12,3.)))+(Q1*Q1+Q2*Q2)*(4 *(V3 - ((V1*V1 + V2*V2) *V3)/pow(Q3*Q3 + Q4*Q4,2.) + (4 *Q3 *(V1*V1 + V2*V2) *(Q3 *V3 + Q4 *V4))/pow(Q3*Q3 + Q4*Q4,3.) - (2 *Q3 *(V1 *A1 + V2 *A2))/pow(Q3*Q3 + Q4*Q4,2.))+ (1./(r12*r12))*(2 *r12 *(-1 + E *r12) *(2 *Q3 *(Q1 *V1 + Q2 *V2) + (Q1*Q1 + Q2*Q2) *V3) + 2 *(Q1*Q1 + Q2*Q2) *Q3 *dr12) + (1./(pow(r12,4.)))*2 *(2 *(Q3*Q3 + Q4*Q4) *(-Q1*Q1 *Q3 - 2 *Q1 *Q2 *Q4 + Q3 *(Q2*Q2 + Q3*Q3 + Q4*Q4)) *r12 *(Q1 *V1 + Q2 *V2) + 2 *(Q1*Q1 + Q2*Q2) *(-Q1*Q1 *Q3 - 2 *Q1 *Q2 *Q4 + Q3 *(Q2*Q2 + Q3*Q3 + Q4*Q4)) *r12 *(Q3 *V3 + Q4 *V4) + (Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *r12 *(Q2 *(-2 *Q4 *V1 + 2 *Q3 *V2) - Q1*Q1 *V3 + Q2*Q2 *V3 + (3 *Q3*Q3 + Q4*Q4) *V3 + 2 *Q3 *Q4 *V4 - 2 *Q1 *(Q3 *V1 + Q4 *V2 + Q2 *V4)) - 3 *(Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *(-Q1*Q1 *Q3 - 2 *Q1 *Q2 *Q4 + Q3 *(Q2*Q2 + Q3*Q3 + Q4*Q4)) *dr12))/4.;
}
// "J" term for coordinate Q4 (main() feeds it to Q_pred/V_pred as the J argument).
// Parameters: Q1..Q4 and V1..V4 (current state), A1/A2 (A terms of the other
// pair), r12 and dr12 as produced by r_12()/dr_12(), E as produced by Energ().
//
// Fix: the 1/r12^2 contribution used to be
//   (-2*r12*(-1+E*r12)*(...) - 2*(Q1^2+Q2^2)*Q4*dr12),
// i.e. with both signs opposite to the corresponding contribution in J_1, J_2
// and J_3. Differentiating the -2*Q4*(Q1^2+Q2^2)*(-E+1/r12) term of A_4 with
// respect to tau gives the "+" form used by the other three functions, so the
// signs are now "+" here as well. Everything else is unchanged; the expression
// is only split over several lines.
double J_4(double Q1, double Q2, double Q3, double Q4, double V1,double V2, double V3, double V4, double A1, double A2, double r12,double dr12 , double E){
    return
        // summand proportional to V4
        2.*(V1*V1+V2*V2+Q1*A1+Q2*A2)*V4/(Q1*Q1+Q2*Q2)
        // summand proportional to (Q1*V1+Q2*V2)
        +(Q1*V1+Q2*V2)*(-(4.*Q4*(V1*V1+V2*V2)/pow(Q3*Q3+Q4*Q4,2.)-4.*Q4+2.*Q4*(Q1*Q1+Q2*Q2)*(-E+1./r12)-2.*(Q1*Q1+Q2*Q2)*(Q3*Q3+Q4*Q4)*((Q3*Q3+Q4*Q4)*Q4-(Q2*Q2-Q1*Q1)*Q4-2.*Q1*Q2*Q3)/pow(r12,3.)))
        // (Q1^2+Q2^2)*(...)/4 summand
        +(Q1*Q1+Q2*Q2)*(
            4 *(V4 - ((V1*V1 + V2*V2) *V4)/pow(Q3*Q3 + Q4*Q4,2.) + (4 *Q4 *(V1*V1 + V2*V2) *(Q3 *V3 + Q4 *V4))/pow(Q3*Q3 + Q4*Q4,3.) - ( 2 *Q4 *(V1 *A1 + V2 *A2))/pow(Q3*Q3 + Q4*Q4,2.))
            // 1/r12^2 contribution (signs corrected, see header comment)
            + (1./(r12*r12))*(2 *r12 *(-1 + E* r12) *(2 *Q4 *(Q1 *V1 + Q2 *V2) + (Q1*Q1 + Q2*Q2) *V4) + 2 *(Q1*Q1 + Q2*Q2) *Q4 *dr12)
            // 1/r12^4 contribution
            + (1./(pow(r12,4)))*2 *(2 *(Q3*Q3 + Q4*Q4) *(-2 *Q1 *Q2 *Q3 + (Q1*Q1 - Q2*Q2 + Q3*Q3) *Q4 + Q4*Q4*Q4) *r12 *(Q1 *V1 + Q2 *V2) + 2 *(Q1*Q1 + Q2*Q2) *(-2 *Q1 *Q2 *Q3 + (Q1*Q1 - Q2*Q2 + Q3*Q3) *Q4 + Q4*Q4*Q4) *r12 *(Q3 *V3 + Q4 *V4) + (Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *r12 *(2 *Q1 *(Q4 *V1 -Q3 *V2) + 2 *Q3 *Q4 *V3 - 2 *Q2 *(Q3 *V1 +Q4 *V2 + Q1 *V3) + (Q1*Q1 - Q2*Q2 + Q3*Q3 + 3 *Q4*Q4) *V4) - 3 *(Q1*Q1 + Q2*Q2) *(Q3*Q3 + Q4*Q4) *(-2 *Q1 *Q2 *Q3 + (Q1*Q1 - Q2*Q2 + Q3*Q3) *Q4 + Q4*Q4*Q4) *dr12)
        )/4.;
}
// Converts Cartesian (x, y) into (Qx, Qy) such that x = Qx^2 - Qy^2 and
// y = 2*Qx*Qy; main() uses this branch when x >= 0.
//   Qx = sqrt((sqrt(x^2+y^2) + x) / 2),  Qy = y / (2*Qx)
// The square root used to be evaluated twice through pow(...,0.5); it is now
// computed once with sqrt and reused.
// Note: for x == 0 and y == 0 the division is 0/0 (as in the original).
void Qpos(double& Qx, double& Qy, double x, double y){
    const double root = sqrt(0.5*(sqrt(x*x+y*y)+x));
    Qx = root;
    Qy = y/(2.*root);
}
// Converts Cartesian (x, y) into (Qx, Qy) such that x = Qx^2 - Qy^2 and
// y = 2*Qx*Qy; main() uses this branch when x < 0.
//   Qy = sqrt((sqrt(x^2+y^2) - x) / 2),  Qx = y / (2*Qy)
// The square root used to be evaluated twice through pow(...,0.5); it is now
// computed once with sqrt and reused.
void Qneg(double& Qx, double& Qy, double x, double y){
    const double root = sqrt(0.5*(sqrt(x*x+y*y)-x));
    Qy = root;
    Qx = y/(2.*root);
}
// Stores the Euclidean norm of (Qx, Qy) in Ri.
// Uses sqrt instead of the general-purpose pow(...,0.5).
void R(double& Ri, double Qx, double Qy){
    Ri = sqrt(Qx*Qx+Qy*Qy);
}
// Stores in E the value
//   (px1^2+py1^2)/2 + (px2^2+py2^2)/2 - 2/|r1| - 2/|r2| + 1/|r1-r2|
// where r1 = (x1, y1) and r2 = (x2, y2).
// Square roots and squares are written with sqrt and plain products instead of
// pow(...,0.5) / pow(...,2.).
void Energ(double& E, double x1, double x2, double y1, double y2, double px1, double px2, double py1, double py2){
    const double dx = x1-x2;
    const double dy = y1-y2;
    const double kinetic1 = (px1*px1+py1*py1)/2.;
    const double kinetic2 = (px2*px2+py2*py2)/2.;
    const double dist1 = sqrt(x1*x1+y1*y1);    // |r1|
    const double dist2 = sqrt(x2*x2+y2*y2);    // |r2|
    const double dist12 = sqrt(dx*dx+dy*dy);   // |r1 - r2|
    E = kinetic1+kinetic2-2./dist1-2./dist2+1./dist12;
}
// Stores in r12 the value
//   sqrt( (Q1^2+Q2^2)^2 + (Q3^2+Q4^2)^2 - 2*(Q1*Q3+Q2*Q4)^2 + 2*(Q1*Q4-Q2*Q3)^2 )
// With x = Qa^2 - Qb^2, y = 2*Qa*Qb (the mapping main() applies) this equals the
// distance between the points built from (Q1,Q2) and (Q3,Q4).
// Squares are plain products and the root is sqrt instead of pow.
void r_12(double& r12,double Q1, double Q2, double Q3, double Q4){
    const double rho1 = Q1*Q1+Q2*Q2;
    const double rho2 = Q3*Q3+Q4*Q4;
    const double dot = Q1*Q3+Q2*Q4;
    const double cross = Q1*Q4-Q2*Q3;
    r12 = sqrt(rho1*rho1+rho2*rho2-2.*(dot*dot)+2.*(cross*cross));
}
// Stores in dr12 the rate of change of the r_12() distance when Q1..Q4 change
// at rates V1..V4: numerator / r12, where the denominator is the same
// expression r_12() evaluates.
// Squares are plain products and the root is sqrt instead of pow.
void dr_12(double& dr12,double Q1, double Q2, double Q3, double Q4, double V1,double V2, double V3, double V4){
    const double rho1 = Q1*Q1 + Q2*Q2;
    const double rho2 = Q3*Q3 + Q4*Q4;
    const double cross = Q2 *Q3 - Q1 *Q4;
    const double dot = Q1 *Q3 + Q2 *Q4;
    const double numerator = 2.*(rho1*(Q1 *V1 + Q2 *V2)
                                 + cross *(-Q4 *V1 + Q3 *V2 + Q2 *V3 - Q1 *V4)
                                 - dot *(Q3 *V1 + Q4 *V2 + Q1 *V3 + Q2 *V4)
                                 + rho2 *(Q3 *V3 + Q4 *V4));
    const double distance = sqrt(rho1*rho1 + 2*(cross*cross) - 2*(dot*dot) + rho2*rho2);
    dr12 = numerator/distance;
}
// Stores in dtao the step size
//   sqrt( 1e-7 * (|A|^2*|Q|^2 + |V|^4) / (|J|^2*|V|^2 + |A|^4) )
// where |.|^2 is the sum of squares of the four components.
// The root is taken with sqrt instead of pow(...,0.5).
// Note: a zero denominator (all A and either all J or all V zero) is not
// guarded against, as in the original.
void dTao(double& dtao,double Q1,double Q2,double Q3,double Q4,double V1,double V2,double V3,double V4,double A1,double A2,double A3,double A4,double J1,double J2,double J3,double J4){
    // Dimensionless factor under the root (presumably an accuracy parameter - TODO confirm).
    const double scale = 0.0000001;
    const double modQ = Q1*Q1+Q2*Q2+Q3*Q3+Q4*Q4;
    const double modV = V1*V1+V2*V2+V3*V3+V4*V4;
    const double modA = A1*A1+A2*A2+A3*A3+A4*A4;
    const double modJ = J1*J1+J2*J2+J3*J3+J4*J4;
    dtao = sqrt(scale*(modA*modQ+modV*modV)/(modJ*modV+modA*modA));
}
// Returns
//   (-2 + L^2/x2 + 2*x2 + sqrt(D)/x2) / (2*(2 + L^2/x2^2 - 4/x2))
// with D = L^4 - 20*L^2*x2 + 68*x2^2 + 4*L^2*x2^2 - 40*x2^3 + 4*x2^4.
// This is the "+" root; X1_neg is the same expression with "-" in front of sqrt(D).
// The root is taken with sqrt instead of pow(...,0.5).
// Note: a negative D yields NaN and a zero denominator is not guarded against,
// as in the original.
double X1_pos(double x2, double L){
    const double discriminant = L*L*L*L - 20.*L*L*x2 + 68.*x2*x2 + 4.*L*L *x2*x2 - 40.*x2*x2*x2 + 4. *x2*x2*x2*x2;
    const double numerator = -2. + L*L/x2 + 2.*x2 + sqrt(discriminant)/x2;
    const double denominator = 2.*(2.+ L*L/(x2*x2) - 4./x2);
    return numerator/denominator;
}
// Returns
//   (-2 + L^2/x2 + 2*x2 - sqrt(D)/x2) / (2*(2 + L^2/x2^2 - 4/x2))
// with D = L^4 - 20*L^2*x2 + 68*x2^2 + 4*L^2*x2^2 - 40*x2^3 + 4*x2^4.
// This is the "-" root; main() uses it for the starting x1. X1_pos is the same
// expression with "+" in front of sqrt(D).
// The root is taken with sqrt instead of pow(...,0.5).
// Note: a negative D yields NaN and a zero denominator is not guarded against,
// as in the original.
double X1_neg(double x2, double L){
    const double discriminant = L*L*L*L - 20.*L*L*x2 + 68.*x2*x2 + 4.*L*L *x2*x2 - 40.*x2*x2*x2 + 4. *x2*x2*x2*x2;
    const double numerator = -2. + L*L/x2 + 2.*x2 - sqrt(discriminant)/x2;
    const double denominator = 2.*(2.+ L*L/(x2*x2) - 4./x2);
    return numerator/denominator;
}
% pgc++ -ta=tesla -Minfo=accel pcc.cpp -o pcc.exe ; ./pcc.exe
main:
74, Generating copyout(T_[:N],X1[:N],X2[:N],Y2[:N],Y1[:N],T[:N])
Generating Tesla code
77, #pragma acc loop gang, vector(10) /* blockIdx.x threadIdx.x */
94, #pragma acc loop seq
94, Complex loop carried dependence of T->,X1-> prevents parallelization
Loop carried dependence of T-> prevents parallelization
Loop carried backward dependence of T-> prevents vectorization
Complex loop carried dependence of Y1-> prevents parallelization
Loop carried dependence of X2-> prevents parallelization
Loop carried backward dependence of X2-> prevents vectorization
Complex loop carried dependence of X2-> prevents parallelization
Loop carried dependence of X1-> prevents parallelization
Loop carried backward dependence of X1-> prevents vectorization
Loop carried dependence of Y2->,Y1-> prevents parallelization
Loop carried backward dependence of Y2->,Y1-> prevents vectorization
Complex loop carried dependence of Y2-> prevents parallelization
Loop carried scalar dependence for y2 at line 111
Loop carried scalar dependence for py1 at line 113
Loop carried scalar dependence for py2 at line 116
Loop carried scalar dependence for Q4 at line 122
Loop carried scalar dependence for Q2 at line 123,124
Loop carried scalar dependence for Q4 at line 126,127,131,132,133,134,135,136,137,138,140,145
Loop carried scalar dependence for Q1 at line 113,114,117,123,124,126,127,131,132,133,134,135,136,137,138,140,142,171
Loop carried scalar dependence for py1 at line 114
Loop carried scalar dependence for py2 at line 115
Loop carried scalar dependence for Q2 at line 117
Loop carried scalar dependence for y2 at line 119
Loop carried scalar dependence for Q4 at line 121
Loop carried scalar dependence for Q2 at line 126,127,131,132,133,134,135,136,137,138,140,143,172
Loop carried scalar dependence for Q3 at line 115,116,118,121,122,126,127,131,132,133,134,135,136,137,138,140,144
Loop carried scalar dependence for Q4 at line 174
Loop carried scalar dependence for Q3 at line 173
Loop carried scalar dependence for t at line 96
Loop carried scalar dependence for y2 at line 100
Loop carried scalar dependence for x1 at line 103
Loop carried scalar dependence for y1 at line 104
Loop carried scalar dependence for py2 at line 119
Loop carried scalar dependence for y1 at line 98,106
Loop carried scalar dependence for x2 at line 108
Loop carried scalar dependence for y2 at line 109
Loop carried scalar dependence for x1 at line 119,97,104,106
Loop carried scalar dependence for x2 at line 119,99,109,111
Loop carried scalar dependence for y1 at line 119
Loop carried scalar dependence for Q2 at line 113,114
Loop carried scalar dependence for Q4 at line 118
Loop carried scalar dependence for px1 at line 113,114
Loop carried scalar dependence for px2 at line 119
Loop carried scalar dependence for Q4 at line 115,116
Loop carried scalar dependence for px1 at line 119
Loop carried scalar dependence for px2 at line 115,116
Loop carried scalar dependence for py1 at line 119
Q_pred(double, double, double, double, double):
240, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
V_pred(double, double, double, double):
244, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
A_1(double, double, double, double, double, double, double, double, double):
248, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
A_2(double, double, double, double, double, double, double, double, double):
252, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
A_3(double, double, double, double, double, double, double, double, double):
256, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
A_4(double, double, double, double, double, double, double, double, double):
260, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
J_1(double, double, double, double, double, double, double, double, double, double, double, double, double):
264, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
J_2(double, double, double, double, double, double, double, double, double, double, double, double, double):
268, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
J_3(double, double, double, double, double, double, double, double, double, double, double, double, double):
272, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
J_4(double, double, double, double, double, double, double, double, double, double, double, double, double):
276, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
Qpos(double &, double &, double, double):
282, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
Qneg(double &, double &, double, double):
288, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
R(double &, double, double):
294, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
Energ(double &, double, double, double, double, double, double, double, double):
299, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
r_12(double &, double, double, double, double):
306, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
dr_12(double &, double, double, double, double, double, double, double, double):
313, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
dTao(double &, double, double, double, double, double, double, double, double, double, double, double, double, double, double, double, double):
320, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
X1_neg(double, double):
338, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
std::abs(double):
1, include "iostream"
35, include "iostream"
4, include "ostream"
38, include "ios"
42, include "ios_base.h"
41, include "locale_classes.h"
40, include "string"
52, include "basic_string.h"
6391, include "string_conversions.h"
41, include "cstdlib"
77, include "std_abs.h"
71, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
__gnu_cxx::__promote_2<T1, T2, __gnu_cxx::__promote<T1, std::__is_integer<T1>::__value>::__type, __gnu_cxx::__promote<T2, std::__is_integer<T2>::__value>::__type>::__type std::pow<double, int>(T1, T2):
2, include "math.h"
59, include "math.h"
36, include "cmath"
416, Generating implicit acc routine seq
Generating acc routine seq
Generating Tesla code
Accelerator Kernel Timing data
pcc.cpp
main NVIDIA devicenum=0
time(us): 35,476,363
74: compute region reached 1 time
74: kernel launched 1 time
grid: [1] block: [10]
device time(us): total=35,475,345 max=35,475,345 min=35,475,345 avg=35,475,345
elapsed time(us): total=35,475,684 max=35,475,684 min=35,475,684 avg=35,475,684
74: data region reached 2 times
227: data copyout transfers: 6
device time(us): total=1,018 max=186 min=166 avg=169