c_stride = 0 в CLBlastSgemmStridedBatched() из CLBlast даёт неправильные результаты? - PullRequest
0 голосов
/ 04 января 2019

Функция CLBlastSgemmStridedBatched() из библиотеки CLBlast (https://github.com/CNugteren/CLBlast) выполняет пакетное умножение матриц. Параметр c_stride задаёт (фиксированный) шаг между соседними матрицами C в пакете и тем самым определяет расположение выходных матриц C. Таким образом, функция вычисляет:

for(int i=0; i< batch_count; i++){     
C + i * c_stride = α * ( A + i * a_stride ) * ( B + i * b_stride ) + β * ( C + i * c_stride )
}     

Когда c_stride = 0, на двух разных устройствах получаются разные результаты. Напротив, когда c_stride = M * N, результаты на обоих устройствах совпадают. Поддерживает ли функция CLBlastSgemmStridedBatched() из CLBlast значение c_stride = 0 или нет?

#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include "math.h"
#include <clblast_c.h>

/* OpenCL context shared by the functions in this file; assigned in init_CL(). */
cl_context ctx;  
/* Command queue created on `ctx`; assigned in init_CL(). */
cl_command_queue queue;  

/*
 * Return the current time reported by gettimeofday() as seconds, with the
 * microsecond part folded in as a fraction.
 *
 * Returns 0 when gettimeofday() reports a failure.
 */
double what_time_is_it_now()
{
    struct timeval now;
    const int failed = gettimeofday(&now, NULL);

    if (failed != 0) {
        return 0;
    }

    const double whole_seconds = (double)now.tv_sec;
    const double fraction = (double)now.tv_usec * .000001;
    return whole_seconds + fraction;
}

void init_CL(){  
    cl_int err;   
    cl_platform_id platform = 0;   
    cl_device_id device = 0;   
    cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 0, 0 };   

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);

    props[1] = (cl_context_properties) platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    queue = clCreateCommandQueue(ctx, device, 0, &err);

}

double clSBatchedSgemm(int M, int N, int K, int batch_count, int a_stride){

    int i,j,b;

// Allocate host storage for batch_count A,B,C matrices
float* A = malloc(sizeof(float) * M * K * batch_count);
float* B = malloc(sizeof(float) * K * N * batch_count);
float* C = malloc(sizeof(float) * M * N * batch_count);
for(b=0; b<batch_count; b++) {
    for(j=0; j<M; j++) {
        for(i=0; i<K; i++) {
            int index = i*M + j + b*M*K;
            A[index] = index*index + 0.0f;
            B[index] = index + 1.0f;
            C[index] = 0.0f;
        }
    }
}

cl_float alpha = 1;
cl_float beta = 2;
size_t lda = K;
size_t ldb = N;
size_t ldc = N;
const size_t b_stride = N*K, c_stride = M*N;
if(c_stride != M*N){
    printf("[Error!] c_stride must be the size of matrix A, the function CLBlastSgemmStridedBatched does not support that c_stride is zero \n");
}

cl_mem bufA, bufB, bufC;
cl_event event = NULL;
cl_int  err;
/* Prepare OpenCL memory objects and place matrices inside them. */
if(a_stride==M*K){
    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, batch_count*M * K * sizeof(*A),NULL, &err);
    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,batch_count*M * K * sizeof(*A), A, 0, NULL, NULL);
}else{
    bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),NULL, &err);
    err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,M * K * sizeof(*A), A, 0, NULL, NULL);
}
bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, batch_count*K * N * sizeof(*B),NULL, &err);
bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, batch_count*M * N * sizeof(*C),NULL, &err);

err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,batch_count*K * N * sizeof(*B), B, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,batch_count*M * N * sizeof(*C), C, 0, NULL, NULL);

double start_t = what_time_is_it_now();
/* Call clBLAS extended function.*/
CLBlastStatusCode status = CLBlastSgemmStridedBatched(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeNo,
                                         M, N, K,
                                         alpha,
                                         bufA, 0, lda, a_stride,
                                         bufB, 0, ldb, b_stride,
                                         beta,
                                         bufC, 0, ldc, c_stride,
                                         batch_count,
                                         &queue, &event);

/* Wait for calculations to be finished. */
if (status == CLBlastSuccess) {
    err = clWaitForEvents(1, &event);
    clReleaseEvent(event);
}

clFinish(queue);
double btime = what_time_is_it_now()-start_t;

printf("Batch_size: %6d,  batchedGEMMTime: %3.6f ms, time/GEMM: %3.6f \n", batch_count, btime, btime/batch_count);

// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed SGEMM with status %d\n", status);

/* Fetch results of calculations from GPU memory. */
err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, batch_count*M * N * sizeof(*C), C, 0, NULL, NULL);

/* Release OpenCL memory objects. */
clReleaseMemObject(bufC);
clReleaseMemObject(bufB);
clReleaseMemObject(bufA);

free(A);
free(B);
free(C);

return btime/batch_count;
}

/*
 * Benchmark driver: times clSBatchedSgemm for batch sizes 1, 2, 4, ... up to
 * (but not including) batch_Num, re-measures batch size 1, then prints the
 * per-GEMM time of each size against the next one and against batch size 1.
 *
 * Returns 0 on success, EXIT_FAILURE if the timing table cannot be allocated.
 */
int main(void) {
    init_CL();
    int M=128, N=5184, K=256;
    int batch_Num = 256;
    int strideA = 0;

    /* One double slot per possible batch size; sized from the element type so
     * the power-of-two indices written below stay inside the allocation. */
    double *times = calloc((size_t)batch_Num, sizeof *times);
    if (times == NULL) {
        fprintf(stderr, "main: out of memory for timing table\n");
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return EXIT_FAILURE;
    }

    for(int batch_index=1;batch_index<batch_Num;batch_index=batch_index*2){
        times[batch_index]= clSBatchedSgemm(M,N,K,batch_index,strideA);
    }

    /* Measure the single-GEMM case again after the sweep above has run. */
    times[1]= clSBatchedSgemm(M,N,K,1,strideA);

    for(int i=1;i<batch_Num/2-1;i=i*2){
        printf("i=%3d \t i*2=%3d \t %f, \t %f, \t speedup: %.3f, \t speedup/1: %.3f\n", i, i*2, times[i],times[i*2], times[i]/times[i*2], times[1]/times[i*2]);
    }

    free(times);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
    return 0;
}
...