Я пытаюсь ускорить код на python, используя cuda \ numba. Код работает с большими массивами комплексных, чисел с плавающей точкой и целых чисел. Я включил и версию Python, и версию Numba-Cuda здесь. Версия numba-cuda не компилируется.
Я пытался выполнить вычисление комплексных чисел как отдельных действительных и мнимых чисел, хотя я мог подумать о сложном формате.
def random_choice_noreplace(m,n, axis=-1):
# m, n are the number of rows, cols of output
return np.random.rand(m,n).argsort(axis=axis)
@cuda.jit
def cuda_kernel (d_npart, d_npts, d_data, d_data_index, d_coef, d_datasum, d_tmp):
row, col = cuda.grid(2)
if row < d_npart and col < d_npts :
d_tmp[row, col] = d_data[d_data_index[row, col]]
d_tmp[row, col] =d_tmp[row, col] * d_coef[row, col]
# All threads get to this point ===============================
cuda.syncthreads()
if row == 0 and col ==0 :
d_datasum = np.sum(d_tmp, axis=0)
def calculate_cuda (data, data_index, coef):
npart, npts = data_index.shape
# arrays to copy to GPU memory =====================================
d_npart = cuda.to_device(npart)
d_npts = cuda.to_device(npts)
d_data = cuda.to_device(data)
d_data_index = cuda.to_device(data_index)
d_coef = cuda.to_device(coef)
d_datasum = cuda.device_array(npts, np.complex64)
d_tmp = cuda.device_array((npart,npts), np.complex64)
threadsperblock = (16, 16)
blockspergrid_x = int(math.ceil(npts / threadsperblock[0]))+1
blockspergrid_y = int(math.ceil(npart / threadsperblock[1]))+1
blockspergrid = (blockspergrid_x, blockspergrid_y)
cuda_kernel[blockspergrid, threadsperblock](d_npart, d_npts, d_data, d_data_index, d_coef, d_datasum, d_tmp)
# Copy data from GPU to CPU ========================================
final_data_sum = d_datasum.copy_to_host()
return final_data_sum
def calculate_python (data, data_index, coef):
npart, npts = data_index.shape
data_sum = np.zeros(npts, dtype=np.complex64)
tmp = np.zeros(npts, dtype=np.complex64)
print(" Calling python function...")
start_time = time.time()
for i in range(npart):
tmp[:] = data[data_index[i]]
data_sum += tmp * coef[i]
return data_sum
if __name__ == '__main__':
data_size = 1200
rows = 31
cols = 1000
rand_float1 = np.random.randn(data_size)
rand_float2 = np.random.randn(data_size)
data = rand_float1 + 1j * rand_float2
coef = np.random.randn(rows, cols)
data_index = random_choice_noreplace(rows, cols)
start_time = time.time()
gpu_data_sum_python = calculate_python (data, data_index, coef)
python_time = time.time() - start_time #print("gpu c : ", c_gpu)
print("---- %s seconds for python ----:" % (python_time))
start_time = time.time()
gpu_data_sum = calculate_cuda (data, data_index, coef)
gpu_time = time.time() - start_time
print("---- %s seconds for gpu ----:" % (gpu_time))
Когда я запускаю код, я получаю эту ошибку:
Calling python function...
---- 0.000344038009644 seconds for python ----:
Traceback (most recent call last):
File "GPU_Fake_PA_partial.py", line 82, in <module>
gpu_data_sum = calculate_cuda (data, data_index, coef)
File "GPU_Fake_PA_partial.py", line 44, in calculate_cuda
cuda_kernel[blockspergrid, threadsperblock](d_npart, d_npts, d_data, d_data_index, d_coef, d_datasum, d_tmp)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 765, in __call__
kernel = self.specialize(*args)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 776, in specialize
kernel = self.compile(argtypes)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 792, in compile
**self.targetoptions)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler_lock.py", line 32, in _acquire_compile_lock
return func(*args, **kwargs)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 62, in compile_kernel
cres = compile_cuda(pyfunc, types.void, args, debug=debug, inline=inline)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler_lock.py", line 32, in _acquire_compile_lock
return func(*args, **kwargs)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/cuda/compiler.py", line 51, in compile_cuda
locals={})
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 926, in compile_extra
return pipeline.compile_extra(func)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 374, in compile_extra
return self._compile_bytecode()
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 857, in _compile_bytecode
return self._compile_core()
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 844, in _compile_core
res = pm.run(self.status)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler_lock.py", line 32, in _acquire_compile_lock
return func(*args, **kwargs)
File "/disk/home/ajooya/software/venv/lib/python2.7/site-packages/numba/compiler.py", line 255, in run
raise patched_exception
numba.errors.TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Invalid use of Function(<built-in function lt>) with argument(s) of type(s): (int32, array(int64, 1d, A))
Known signatures:
* (bool, bool) -> bool
* (int8, int8) -> bool
* (int16, int16) -> bool
* (int32, int32) -> bool
* (int64, int64) -> bool
* (uint8, uint8) -> bool
* (uint16, uint16) -> bool
* (uint32, uint32) -> bool
* (uint64, uint64) -> bool
* (float32, float32) -> bool
* (float64, float64) -> bool
* parameterized
In definition 0:
All templates rejected with literals.
In definition 1:
All templates rejected without literals.
In definition 2:
All templates rejected with literals.
In definition 3:
All templates rejected without literals.
In definition 4:
All templates rejected with literals.
In definition 5:
All templates rejected without literals.
In definition 6:
All templates rejected with literals.
In definition 7:
All templates rejected without literals.
This error is usually caused by passing an argument of a type that is unsupported by the named function.
[1] During: typing of intrinsic-call at GPU_Fake_PA_partial.py (15)
File "GPU_Fake_PA_partial.py", line 15:
def cuda_kernel (d_npart, d_npts, d_data, d_data_index, d_coef, d_datasum, d_tmp):
<source elided>
row, col = cuda.grid(2)
if row < d_npart and col < d_npts :
Любая помощь высоко ценится.