CUSPARSE решает трехдиагональный `cusparseDgtsv` медленно - PullRequest
2 голосов
/ 23 мая 2019

Я решаю систему линейных уравнений с выделенным решателем cusparseDgtsv() из библиотеки CUSPARSE и не вижу ускорения. Я попытался запустить тесты на:

  • Устройство 0: "Tesla K40s"
  • Версия драйвера CUDA / Runtime Version 9.2 / 9.1
  • CUDA Capability Major / Minor version number: 3.5
  • Intel Xeon E5-2697 v3 2.60 ГГц

Ниже приведён тестовый код, который я компилирую командой nvcc -lcusparse main.cu -o dgtsv.app -gencode arch=compute_35,code=sm_35 для Tesla K40s.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <limits.h>
#include <cuda_runtime.h>
#include <cusparse.h>

void run_Dgtsv_test(int n);
void run_Managed_Dgtsv_test(int n);

int tridiagonalmatrixalgorithm(double* a, double* b, double* c, double* d, double** X, int size);

/**
 * Entry point: sweep problem sizes in powers of two and benchmark
 * cusparseDgtsv, first with plain device memory, then with managed memory.
 */
int main(int argc, char** argv) {

    const int growth = 2;
    // Cap the sweep so the largest allocation stays well below INT_MAX bytes.
    int limit = (INT_MAX / 16) / (sizeof(double));

    printf("Dgtsv speed test\n SLAE size, Dgtsv time, Serial \n");
    for (int size = 512; size < limit; size *= growth)
        run_Dgtsv_test(size);

    printf("####################### Testing MANAGED Malloc ###########################\n");

    printf("Managed malloc Dgtsv speed test\n SLAE size, Dgtsv time, Serial \n");
    for (int size = 512; size < limit; size *= growth)
        run_Managed_Dgtsv_test(size);

    return 0;
}

/**
 * Benchmark the cuSPARSE tridiagonal solver (cusparseDgtsv) against the
 * serial Thomas algorithm on an n-by-n diagonally dominant system and
 * print the average wall-clock time per solve for both implementations.
 *
 * Device buffers are allocated with plain cudaMalloc and filled with an
 * explicit host-to-device copy.
 *
 * @param n  number of unknowns (rows of the tridiagonal matrix)
 */
void run_Dgtsv_test(int n) {
    // {{{
    cusparseHandle_t cusparseHandle;
    cusparseStatus_t cusparseStatus = cusparseCreate(&cusparseHandle);
    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
        fprintf(stderr, "Failed to create cuSPARSE handle\n");
        return;
    }

    const int num_of_tests = 50;

    double* a = new double[n];   // lower diagonal (a[0] unused)
    double* b = new double[n];   // main diagonal
    double* c = new double[n];   // upper diagonal (c[n-1] unused)
    double* d = new double[n];   // right-hand side
    double* tmp = new double[n]; // output buffer for the serial solver

    for (int i = 0; i < n; i++) {
        a[i] = 0;
        c[i] = 0;
        // The main diagonal must be non-zero everywhere: the original
        // b[i] = 4*i made the first pivot 0, i.e. a singular matrix and
        // NaNs in the serial solve. 4*(i+1) also keeps the matrix
        // diagonally dominant (|4(i+1)| > |i| + |2i|).
        b[i] = 4.0 * (i + 1);
        d[i] = 8.0 * i;
        tmp[i] = i;
    }
    for (int i = 1; i < n - 1; i++) {
        a[i] = i;
        c[i] = 2.0 * i;
    }

    size_t size = sizeof(double) * n;
    double* a_dev = NULL;
    double* b_dev = NULL;
    double* c_dev = NULL;
    double* d_dev = NULL;
    if (cudaMalloc((void**)&a_dev, size) != cudaSuccess ||
        cudaMalloc((void**)&b_dev, size) != cudaSuccess ||
        cudaMalloc((void**)&c_dev, size) != cudaSuccess ||
        cudaMalloc((void**)&d_dev, size) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed for n = %d: %s\n",
                n, cudaGetErrorString(cudaGetLastError()));
        throw;
    }

    cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice);
    cudaMemcpy(c_dev, c, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dev, d, size, cudaMemcpyHostToDevice);

    /* ------------------------------ cuSPARSE TEST ---------------------------------------------- */
    // NOTE: cusparseDgtsv overwrites d_dev with the solution, so every
    // iteration after the first solves a system whose RHS is the previous
    // solution. The amount of work per solve is unchanged, so the timing
    // stays meaningful.
    clock_t t1 = clock();
    for (int i = 0; i < num_of_tests; i++) {
        cusparseStatus = cusparseDgtsv(cusparseHandle, n, 1,
                                       a_dev, b_dev, c_dev, d_dev, n);
        if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
            fprintf(stderr,"Failed to perform CUSPARSE Dgtsv: int Dgtsv test\n");
            throw;
        }
        cudaDeviceSynchronize(); // make each solve's GPU time visible to clock()
    }
    clock_t t2 = clock();
    /* ----------------------------------- serial test ------------------------------------------- */
    // tridiagonalmatrixalgorithm() takes (main, lower, upper, rhs): its
    // first parameter is the main diagonal, so pass b before a.
    for (int i = 0; i < num_of_tests; i++) {
        tridiagonalmatrixalgorithm(b, a, c, d, &tmp, n);
    }
    clock_t t3 = clock();

    double time1 = ((double)(t2 - t1)) / CLOCKS_PER_SEC / num_of_tests;
    double time2 = ((double)(t3 - t2)) / CLOCKS_PER_SEC / num_of_tests;

    printf("(%9.d, %9.6f, %9.6f) \n",n,time1,time2);

    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    cudaFree(d_dev);
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    delete[] tmp; // was leaked in the original
    cusparseDestroy(cusparseHandle);
}
// }}}

/**
 * Benchmark the cuSPARSE tridiagonal solver (cusparseDgtsv) against the
 * serial Thomas algorithm on an n-by-n diagonally dominant system, using
 * unified (managed) memory for the device buffers, and print the average
 * wall-clock time per solve for both implementations.
 *
 * @param n  number of unknowns (rows of the tridiagonal matrix)
 */
void run_Managed_Dgtsv_test(int n) {
    // {{{
    cusparseHandle_t cusparseHandle;
    cusparseStatus_t cusparseStatus = cusparseCreate(&cusparseHandle);
    if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
        fprintf(stderr, "Failed to create cuSPARSE handle\n");
        return;
    }

    const int num_of_tests = 50;

    double* a = new double[n];   // lower diagonal (a[0] unused)
    double* b = new double[n];   // main diagonal
    double* c = new double[n];   // upper diagonal (c[n-1] unused)
    double* d = new double[n];   // right-hand side
    double* tmp = new double[n]; // output buffer for the serial solver

    for (int i = 0; i < n; i++) {
        a[i] = 0;
        c[i] = 0;
        // Non-zero, diagonally dominant main diagonal; the original
        // b[i] = 4*i gave a zero first pivot (singular matrix).
        b[i] = 4.0 * (i + 1);
        d[i] = 8.0 * i;
        tmp[i] = i;
    }
    for (int i = 1; i < n - 1; i++) {
        a[i] = i;
        c[i] = 2.0 * i;
    }

    size_t size = sizeof(double) * n;
    double* a_dev = NULL;
    double* b_dev = NULL;
    double* c_dev = NULL;
    double* d_dev = NULL;
    if (cudaMallocManaged((void**)&a_dev, size) != cudaSuccess ||
        cudaMallocManaged((void**)&b_dev, size) != cudaSuccess ||
        cudaMallocManaged((void**)&c_dev, size) != cudaSuccess ||
        cudaMallocManaged((void**)&d_dev, size) != cudaSuccess) {
        fprintf(stderr, "cudaMallocManaged failed for n = %d: %s\n",
                n, cudaGetErrorString(cudaGetLastError()));
        throw;
    }

    // Fill the managed buffers. The original code passed &a_dev (the address
    // of the pointer variable itself) as the destination, which clobbered
    // host stack memory and left the managed allocations unfilled.
    cudaMemcpy(a_dev, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, size, cudaMemcpyHostToDevice);
    cudaMemcpy(c_dev, c, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dev, d, size, cudaMemcpyHostToDevice);

    /* ------------------------------ cuSPARSE TEST ---------------------------------------------- */
    // NOTE: cusparseDgtsv overwrites d_dev with the solution; later
    // iterations reuse the previous solution as the RHS, which keeps the
    // per-solve workload (and hence the timing) unchanged.
    clock_t t1 = clock();
    for (int i = 0; i < num_of_tests; i++) {
        cusparseStatus = cusparseDgtsv(cusparseHandle, n, 1,
                                       a_dev, b_dev, c_dev, d_dev, n);
        if (cusparseStatus != CUSPARSE_STATUS_SUCCESS) {
            fprintf(stderr,"Failed to perform CUSPARSE Dgtsv: int Dgtsv test\n");
            throw;
        }
        cudaDeviceSynchronize(); // make each solve's GPU time visible to clock()
    }
    clock_t t2 = clock();
    /* ----------------------------------- serial test ------------------------------------------- */
    // tridiagonalmatrixalgorithm() takes (main, lower, upper, rhs): its
    // first parameter is the main diagonal, so pass b before a.
    for (int i = 0; i < num_of_tests; i++) {
        tridiagonalmatrixalgorithm(b, a, c, d, &tmp, n);
    }
    clock_t t3 = clock();

    double time1 = ((double)(t2 - t1)) / CLOCKS_PER_SEC / num_of_tests;
    double time2 = ((double)(t3 - t2)) / CLOCKS_PER_SEC / num_of_tests;

    printf("(%9.d, %9.6f, %9.6f) \n",n,time1,time2);

    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    cudaFree(d_dev);
    delete[] a;
    delete[] b;
    delete[] c;
    delete[] d;
    delete[] tmp; // was leaked in the original
    cusparseDestroy(cusparseHandle);
}
// }}}


/** Serial tridiagonal solver
 *
 * Tridiagonal algorithm, same array size for all input vars.
 * @var X, @var a, @var b, @var c, @var d should all be of size @var n
 * **************************************************************************************************
 * **** Warning! b[0] and c[n-1] is a trash element, but they should be for array size constancy ****
 * **************************************************************************************************
 * @input double**              X       -- pointer to array where the answer will be written
 *
 * ⬇︎ b[0]
 *      | a c 0 0 0 0 ... 0 0 0 | | x | = | d |
 *      | b a c 0 0 0 ... 0 0 0 | | x | = | d |
 *      | 0 b a c 0 0 ... 0 0 0 | | x | = | d |
 *      | 0 0 b a c 0 ... 0 0 0 | | x | = | d |
 *      | :                   : | | x | = | d |
 *      | :                   : | | x | = | d |
 *      | 0 ...           b a c | | x | = | d |
 *      | 0 ...           0 b a | | x | = | d |
 *                                                               ⬆︎ c[n-1]
 */

int tridiagonalmatrixalgorithm(double* a, double* b, double* c, double* d, double** X, int size) {
    // {{{

    // Scratch buffers: `sol` accumulates the solution, `upper` holds the
    // modified upper diagonal produced by the forward sweep.
    double* sol   = (double*) calloc(size, sizeof(double));
    double* upper = (double*) calloc(size, sizeof(double));

    // Forward sweep: eliminate the sub-diagonal, normalising each row by
    // the running pivot. Here `a` is the main diagonal, `b` the lower and
    // `c` the upper (see the convention documented above).
    double pivot = a[0];
    sol[0] = d[0] / pivot;
    for (int row = 1; row < size; row++) {
        upper[row - 1] = c[row - 1] / pivot;
        pivot = a[row] - b[row] * upper[row - 1];
        sol[row] = (d[row] - b[row] * sol[row - 1]) / pivot;
    }

    // Back substitution over the upper-triangular remainder.
    for (int row = size - 2; row >= 0; row--)
        sol[row] = sol[row] - upper[row] * sol[row + 1];

    // Copy the result into the caller-provided output array.
    for (int row = 0; row < size; row++)
        (*X)[row] = sol[row];

    free(upper);
    free(sol);
    return 0;
}
// }}}

Это дает мне следующий вывод:

Dgtsv speed test
 SLAE size, Dgtsv time, Serial
(      512,  0.000600,  0.000200)
(     1024,  0.001000,  0.000000)
(     2048,  0.000800,  0.000200)
(     4096,  0.001000,  0.000200)
(     8192,  0.000800,  0.000200)
(    16384,  0.001000,  0.000200)
(    32768,  0.001600,  0.000600)
(    65536,  0.001800,  0.001400)
(   131072,  0.002000,  0.002600)
(   262144,  0.003000,  0.005400)
(   524288,  0.005000,  0.011600)
(  1048576,  0.008400,  0.023000)
(  2097152,  0.015800,  0.045400)
(  4194304,  0.030400,  0.090200)
(  8388608,  0.059600,  0.192800)
####################### Testing MANAGED Malloc ###########################
Managed malloc Dgtsv speed test
 SLAE size, Dgtsv time, Serial
(      512,  0.001000,  0.000000)
(     1024,  0.001400,  0.000000)
(     2048,  0.001200,  0.000000)
(     4096,  0.001400,  0.000000)
(     8192,  0.001200,  0.000200)
(    16384,  0.001200,  0.000400)
(    32768,  0.001400,  0.000600)
(    65536,  0.001800,  0.001400)
(   131072,  0.002400,  0.002600)
(   262144,  0.003000,  0.005600)
(   524288,  0.004800,  0.011000)
(  1048576,  0.008600,  0.022000)
(  2097152,  0.015800,  0.045800)
(  4194304,  0.030600,  0.091200)
(  8388608,  0.059800,  0.179600)

Версии компилятора

$ nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2016 NVIDIA Corporation
Built on Tue_Jan_10_13:22:03_CST_2017
Cuda compilation tools, release 8.0, V8.0.61


$ g++ --version
g++ (GCC) 4.8.5 20150623 (Red Hat 4.8.5-4)
Copyright (C) 2015 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



Чего мне не хватает? Спасибо!

...