Я пытаюсь изучить CUDA, и ниже приведены мои файлы кода. Есть ожидаемая ошибка выражения, которую я не могу решить. Что здесь не так? На простом примере Hello World я получаю ту же ошибку. Ошибка в строке ниже. Даже когда я копирую код из github, я получаю ту же ошибку при вызове функций угловой скобки.
cudamult << <blocksPerGrid, threadsPerBlock >> > (n, p, m, A, B, C);
main.cpp
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <iostream>
#include <cuda_runtime.h>
#include<mycuda.cu>
using namespace std;
//extern void use_saxpy_cuda(int n, float a, float * x, float * y);
extern void use_cudamult(int n, int p, int m, float* A, float* B, float* C);
void matmult(int n, int p, int m, float* A, float* B, float* C)
{
use_cudamult(n, p, m, A, B, C);
}
int main()
{
int n = 3;
int p = 4;
int m = 5;
float * A;
float * B;
float * C;
// Allocate memory
cudaMallocManaged(&A, n*p*sizeof(float));
cudaMallocManaged(&B, p*m * sizeof(float));
cudaMallocManaged(&C, n*m * sizeof(float));
for (int i = 0; i < n*p; i++) {
A[i] = 1.0f;
}
for (int i = 0; i < p*m; i++) {
B[i] = 1.0f;
}
cout << "A: ";
for (int i = 0; i < n*p; i++) {
if (i%p == 0)
cout << endl;
cout << A[i] << " ";
}
cout << endl;
cout << "B: ";
for (int i = 0; i < p*m; i++) {
if (i%m == 0)
cout << endl;
cout << B[i] << " ";
}
cout << endl;
matmult(n, p , m, A, B, C);
cudaDeviceSynchronize();
cout << "C: ";
for (int i = 0; i < n*m; i++) {
if (i%m == 0 )
cout << endl;
cout << C[i] << " ";
}
cudaFree(A);
cudaFree(B);
cudaFree(C);
}
mycuda.cu
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include<cuda.h>
#include<cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#ifndef __CUDACC__
#define __CUDACC__
#endif
#include <device_functions.h>
#include <iostream>
#include <chrono>
#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>
#include <algorithm>
using namespace std;
__global__ void cudamult(int n, int p, int m, float* A, float* B, float* C)
{
//int index = threadIdx.x;
int ROW = blockIdx.y*blockDim.y + threadIdx.y;
int COL = blockIdx.x*blockDim.x + threadIdx.x;
float tmpSum = 0;
if (ROW > n || COL > m)
return;
// each thread computes one element of the block sub-matrix
for (int i = 0; i < p; ++i) {
tmpSum += A[ROW * p + i] * B[i * m + COL];
}
printf("ROW:%d COL:%d sum: %d \n", ROW, COL, tmpSum); // DEBUG
C[ROW * m + COL] = tmpSum;
}
extern "C"
void use_cudamult(int n, int p, int m, float* A, float* B, float* C)
{
//int threadsPerBlock = 512;
//int numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;
int numThreads = max(max(n, p), m);
dim3 threadsPerBlock(numThreads, numThreads);
dim3 blocksPerGrid(1, 1);
//threadsPerBlock.x = 512;
//threadsPerBlock.y = 512;
//blocksPerGrid.x = ceil(double(N) / double(threadsPerBlock.x));
//blocksPerGrid.y = ceil(double(N) / double(threadsPerBlock.y));
cudamult << <blocksPerGrid, threadsPerBlock >> > (n, p, m, A, B, C);
cudaDeviceSynchronize();
}
cuda_t.pro
QT -= gui
CONFIG += c++14 console
CONFIG -= app_bundle
# The following define makes your compiler emit warnings if you use
# any feature of Qt which as been marked deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
# deprecated API in order to know how to port your code away from it.
DEFINES += QT_DEPRECATED_WARNINGS
# You can also make your code fail to compile if you use deprecated APIs.
# In order to do so, uncomment the following line.
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000 # disables all the APIs deprecated before Qt 6.0.0
SOURCES += main.cpp
# This makes the .cu files appear in your project
OTHER_FILES += ./mycuda.cu
CUDA_DIR = /usr/local/cuda-10.1
CUDA_ARCH = sm_32 # as supported by the Tegra K1
INCLUDEPATH += $$CUDA_DIR/include
#LIBS += -L $$CUDA_DIR/lib64 -lcudart -lcuda
#osx: LIBS += -F/Library/Frameworks -framework CUDA
cuda.commands = $$CUDA_DIR/bin/nvcc -c -arch=$$CUDA_ARCH -o ${QMAKE_FILE_OUT} ${QMAKE_FILE_NAME}
cuda.dependency_type = TYPE_C
cuda.depend_command = $$CUDA_DIR/bin/nvcc -M ${QMAKE_FILE_NAME}
cuda.input = CUDA_SOURCES
cuda.output = ${QMAKE_FILE_BASE}_cuda.o
QMAKE_EXTRA_COMPILERS += cuda
DISTFILES +=
HEADERS += \
timer.h \
matrixmul.h \
hip_matrix_mul.h \
cu_matrix_mul.h