Функция CLBlastSgemmStridedBatched() из библиотеки CLBlast (https://github.com/CNugteren/CLBlast) выполняет умножение матриц для пакета матриц. Параметр c_stride задаёт (фиксированный) шаг между двумя соседними матрицами C в пакете и тем самым определяет расположение выходных матриц C. Таким образом, функция выполняет:
for(int i=0; i< batch_count; i++){
C + i * c_stride = α * ( A + i * a_stride ) * ( B + i * b_stride ) + β * ( C + i * c_stride )
}
Когда c_stride = 0, на двух разных устройствах получаются разные результаты. Напротив, когда c_stride = M * N, результаты на обоих устройствах совпадают. Поддерживает ли функция CLBlastSgemmStridedBatched() из CLBlast значение c_stride = 0 или нет?
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include "math.h"
#include <clblast_c.h>
/* OpenCL context shared by the whole program; assigned in init_CL() and released in main(). */
cl_context ctx;
/* Command queue created on ctx in init_CL(); all buffer transfers and the SGEMM call are enqueued on it. */
cl_command_queue queue;
/*
 * Returns the current time reported by gettimeofday() as a double:
 * whole seconds plus the microsecond part scaled to a fraction of a second.
 * Returns 0 when gettimeofday() reports failure.
 */
double what_time_is_it_now()
{
    struct timeval now;
    const int rc = gettimeofday(&now, NULL);

    if (rc != 0) {
        return 0;
    }

    const double whole_seconds = (double)now.tv_sec;
    const double sub_second = (double)now.tv_usec * .000001;
    return whole_seconds + sub_second;
}
void init_CL(){
cl_int err;
cl_platform_id platform = 0;
cl_device_id device = 0;
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, 0, 0 };
/* Setup OpenCL environment. */
err = clGetPlatformIDs(1, &platform, NULL);
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
props[1] = (cl_context_properties) platform;
ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
queue = clCreateCommandQueue(ctx, device, 0, &err);
}
double clSBatchedSgemm(int M, int N, int K, int batch_count, int a_stride){
int i,j,b;
// Allocate host storage for batch_count A,B,C matrices
float* A = malloc(sizeof(float) * M * K * batch_count);
float* B = malloc(sizeof(float) * K * N * batch_count);
float* C = malloc(sizeof(float) * M * N * batch_count);
for(b=0; b<batch_count; b++) {
for(j=0; j<M; j++) {
for(i=0; i<K; i++) {
int index = i*M + j + b*M*K;
A[index] = index*index + 0.0f;
B[index] = index + 1.0f;
C[index] = 0.0f;
}
}
}
cl_float alpha = 1;
cl_float beta = 2;
size_t lda = K;
size_t ldb = N;
size_t ldc = N;
const size_t b_stride = N*K, c_stride = M*N;
if(c_stride != M*N){
printf("[Error!] c_stride must be the size of matrix A, the function CLBlastSgemmStridedBatched does not support that c_stride is zero \n");
}
cl_mem bufA, bufB, bufC;
cl_event event = NULL;
cl_int err;
/* Prepare OpenCL memory objects and place matrices inside them. */
if(a_stride==M*K){
bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, batch_count*M * K * sizeof(*A),NULL, &err);
err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,batch_count*M * K * sizeof(*A), A, 0, NULL, NULL);
}else{
bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),NULL, &err);
err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,M * K * sizeof(*A), A, 0, NULL, NULL);
}
bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, batch_count*K * N * sizeof(*B),NULL, &err);
bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, batch_count*M * N * sizeof(*C),NULL, &err);
err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,batch_count*K * N * sizeof(*B), B, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,batch_count*M * N * sizeof(*C), C, 0, NULL, NULL);
double start_t = what_time_is_it_now();
/* Call clBLAS extended function.*/
CLBlastStatusCode status = CLBlastSgemmStridedBatched(CLBlastLayoutRowMajor, CLBlastTransposeNo, CLBlastTransposeNo,
M, N, K,
alpha,
bufA, 0, lda, a_stride,
bufB, 0, ldb, b_stride,
beta,
bufC, 0, ldc, c_stride,
batch_count,
&queue, &event);
/* Wait for calculations to be finished. */
if (status == CLBlastSuccess) {
err = clWaitForEvents(1, &event);
clReleaseEvent(event);
}
clFinish(queue);
double btime = what_time_is_it_now()-start_t;
printf("Batch_size: %6d, batchedGEMMTime: %3.6f ms, time/GEMM: %3.6f \n", batch_count, btime, btime/batch_count);
// Example completed. See "clblast_c.h" for status codes (0 -> success).
printf("Completed SGEMM with status %d\n", status);
/* Fetch results of calculations from GPU memory. */
err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, batch_count*M * N * sizeof(*C), C, 0, NULL, NULL);
/* Release OpenCL memory objects. */
clReleaseMemObject(bufC);
clReleaseMemObject(bufB);
clReleaseMemObject(bufA);
free(A);
free(B);
free(C);
return btime/batch_count;
}
/*
 * Benchmark driver: initialises OpenCL, times clSBatchedSgemm for batch
 * sizes 1, 2, 4, ... (powers of two below batch_Num) with a_stride = 0,
 * then prints the per-GEMM time of each batch size next to the time of the
 * doubled batch size and the resulting speedups.
 *
 * Returns 0 on success, EXIT_FAILURE if the timing table cannot be allocated.
 */
int main(void) {
    init_CL();
    int M=128, N=5184, K=256;
    int batch_Num = 256;
    int strideA = 0;

    /* One slot per possible batch size; must be sized for double, since the
     * largest index written is batch_Num/2. Unused slots stay zero. */
    double *times = calloc((size_t)batch_Num, sizeof *times);
    if (times == NULL) {
        fprintf(stderr, "main: out of memory for timing table\n");
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return EXIT_FAILURE;
    }

    for(int batch_index=1;batch_index<batch_Num;batch_index=batch_index*2){
        times[batch_index]= clSBatchedSgemm(M,N,K,batch_index,strideA);
    }
    /* NOTE(review): batch size 1 is measured a second time, presumably so the
     * baseline excludes one-time warm-up cost of the very first call - confirm. */
    times[1]= clSBatchedSgemm(M,N,K,1,strideA);

    /* Only compare against batch sizes that were actually measured above. */
    for(int i=1;i*2<batch_Num;i=i*2){
        printf("i=%3d \t i*2=%3d \t %f, \t %f, \t speedup: %.3f, \t speedup/1: %.3f\n", i, i*2, times[i],times[i*2], times[i]/times[i*2], times[1]/times[i*2]);
    }

    free(times);
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);
    return 0;
}