Я перевожу код из устаревшего cblas / lapacke в cudaBLAS / cudaSOLVER и у меня возникли некоторые проблемы. Я сделал тестовую программу, чтобы разобраться в этом. Прикрепленный код, который я использую:
#include <stdio.h>
#include <assert.h>
#include <string.h>
#ifdef __CUDA
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "cusolverDn.h"
#else
#include <complex.h>
#include "cblas.h"
#include "lapacke.h"
#endif /* __CUDA */
int main()
{
/* INPUT MATRIX
*
* 0 1-i 0
* 1+i 0 1-i
* 0 1+i 0
*/
int row = 3;
int col = 3;
#ifdef __CUDA
cuDoubleComplex input[9];
input[0] = make_cuDoubleComplex(0.0, 0.0);
input[1] = make_cuDoubleComplex(1.0, -1.0);
input[2] = make_cuDoubleComplex(0.0, 0.0);
input[3] = make_cuDoubleComplex(1.0, 1.0);
input[4] = make_cuDoubleComplex(0.0, 0.0);
input[5] = make_cuDoubleComplex(1.0, -1.0);
input[6] = make_cuDoubleComplex(0.0, 0.0);
input[7] = make_cuDoubleComplex(1.0, 1.0);
input[8] = make_cuDoubleComplex(0.0, 0.0);
cublasHandle_t blasHandle;
cublasCreate(&blasHandle); // initialize CUBLAS context
cusolverDnHandle_t solverHandle;
cusolverStatus_t statusSolver = cusolverDnCreate(&solverHandle);
assert(CUSOLVER_STATUS_SUCCESS == statusSolver);
const cuDoubleComplex cmplx_1 = make_cuDoubleComplex(1.0, 0.0);
const cuDoubleComplex cmplx_0 = make_cuDoubleComplex(0.0, 0.0);
//input host/device matrix
cuDoubleComplex *inputMatrix;
cudaError_t cudaStat = cudaMallocManaged((void**)&inputMatrix, sizeof(cuDoubleComplex)*row*col, cudaMemAttachGlobal);
assert(cudaSuccess == cudaStat);
//ROW MAJOR TO COL MAJOR
cuDoubleComplex * inputMatrixClone;
cudaMallocManaged((void**)&inputMatrixClone, sizeof(cuDoubleComplex)*row*col, cudaMemAttachGlobal);
memcpy(inputMatrixClone, input, sizeof(cuDoubleComplex)*row*col);
cublasStatus_t status = cublasZgeam(blasHandle, CUBLAS_OP_T, CUBLAS_OP_N, row, col, &cmplx_1, inputMatrixClone, row, &cmplx_0, inputMatrixClone, row, inputMatrix, row );
assert(CUBLAS_STATUS_SUCCESS == status);
cudaFree(inputMatrixClone);
// done row major to col major conversion
int lwork = 0;
int *devInfo = NULL;
double* eig;
cuDoubleComplex *d_work = NULL;
cudaStat = cudaMallocManaged((void**)&eig, sizeof(double)*row, cudaMemAttachGlobal);
assert(cudaSuccess == cudaStat);
cudaStat = cudaMallocManaged((void**)&devInfo, sizeof(int), cudaMemAttachGlobal);
assert(cudaSuccess == cudaStat);
cusolverStatus_t cusolver_status = cusolverDnZheevd_bufferSize(solverHandle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, row, inputMatrix, col, eig, &lwork);
assert (cusolver_status == CUSOLVER_STATUS_SUCCESS);
cudaStat = cudaMallocManaged((void**)&d_work, sizeof(cuDoubleComplex)*lwork, cudaMemAttachGlobal);
assert(cudaSuccess == cudaStat);
cusolver_status = cusolverDnZheevd(solverHandle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, row, inputMatrix, col, eig, d_work, lwork, devInfo);
cudaDeviceSynchronize();
assert(CUSOLVER_STATUS_SUCCESS == cusolver_status);
FILE *fp;
fp = fopen("with_cuda.txt", "w+");
//eigenvalues
for (int i=0; i<row; i++)
fprintf(fp, "eig[%d] = %0.16f\n", i, eig[i]);
//eigenvectors
for(int i=0; i<row*col; i++)
fprintf(fp, "eiv[%d] = %0.16f, %0.16f\n", i, cuCreal(inputMatrix[i]), cuCimag(inputMatrix[i]));
fclose(fp);
cudaFree(inputMatrix);
cudaFree(eig);
cudaFree(devInfo);
cudaFree(d_work);
cusolverDnDestroy(solverHandle);
cublasDestroy(blasHandle); // destroy CUBLAS context
cudaDeviceReset();
#else
double complex input[9];
input[0] = 0;
input[1] = 1 - _Complex_I;
input[2] = 0;
input[3] = 1 + _Complex_I;
input[4] = 0;
input[5] = 1 - _Complex_I;
input[6] = 0;
input[7] = 1 + _Complex_I;
input[8] = 0;
double eig[3];
LAPACKE_zheev(LAPACK_ROW_MAJOR, 'V', 'U', row, input, col, eig);
//'V': Compute eigenvalues and eigenvectors. 'U': Upper triangle of input;
FILE *fp;
fp = fopen("without_cuda.txt", "w+");
//eigenvalues
for (int i=0; i<row; i++)
fprintf(fp, "eig[%d] = %0.16f\n", i, eig[i]);
//eigenvectors
for(int i=0; i<row*col; i++)
fprintf(fp, "eiv[%d] = %0.16f, %0.16f\n", i, creal(input[i]), cimag(input[i]));
fclose(fp);
#endif
return 0;
}
Программа может быть скомпилирована с использованием #define __CUDA для получения сборки cuda или без #define __CUDA для получения сборки не-cuda. сборка без cuda дает мне следующий вывод:
eig[0] = -2.0000000000000000
eig[1] = -0.0000000000000002
eig[2] = 2.0000000000000000
eiv[0] = 0.0000000000000001, 0.5000000000000000
eiv[1] = 0.0000000000000002, 0.7071067811865476
eiv[2] = -0.0000000000000001, -0.5000000000000000
eiv[3] = 0.4999999999999998, -0.4999999999999999
eiv[4] = 0.0000000000000000, -0.0000000000000001
eiv[5] = 0.4999999999999998, -0.4999999999999999
eiv[6] = -0.4999999999999999, 0.0000000000000000
eiv[7] = 0.7071067811865475, 0.0000000000000000
eiv[8] = 0.4999999999999999, 0.0000000000000000
сборка cuda дает мне следующий вывод:
eig[0] = -2.0000000000000000
eig[1] = 0.0000000000000002
eig[2] = 2.0000000000000000
eiv[0] = 0.0000000000000001, 0.5000000000000000
eiv[1] = 0.4999999999999997, -0.4999999999999998
eiv[2] = -0.4999999999999999, 0.0000000000000000
eiv[3] = 0.0000000000000001, 0.7071067811865476
eiv[4] = -0.0000000000000000, 0.0000000000000000
eiv[5] = 0.7071067811865475, 0.0000000000000000
eiv[6] = 0.0000000000000001, 0.5000000000000000
eiv[7] = -0.4999999999999998, 0.4999999999999999
eiv[8] = -0.4999999999999999, 0.0000000000000000
Кто-нибудь может пролить свет на этот вопрос, пожалуйста, почему я получаю другой результат, в основном собственные векторы? Порядок собственных значений также кажется обратным. Почему это?