Numpy решение
import numpy as np
sensor_points=np.random.randn(1002001,3)#.astype(np.float32)
rayorigins=np.random.rand(36,3)#.astype(np.float32)
sp_min_ro = sensor_points[:, np.newaxis, :] - rayorigins
norm=np.linalg.norm(sp_min_ro,axis=2)
sp_min_ro/=norm[:,:,np.newaxis]
Задержка
np.float64: 1.76 s ± 26.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
np.float32: 1.42 s ± 9.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
раствор Numba
import numba as nb
@nb.njit(fastmath=True,error_model="numpy",parallel=True)
def normalized_vec(sensor_points,rayorigins):
res=np.empty((sensor_points.shape[0],rayorigins.shape[0],3),dtype=sensor_points.dtype)
for i in nb.prange(sensor_points.shape[0]):
for j in range(rayorigins.shape[0]):
vec_x=sensor_points[i,0]-rayorigins[j,0]
vec_y=sensor_points[i,1]-rayorigins[j,1]
vec_z=sensor_points[i,2]-rayorigins[j,2]
dist=np.sqrt(vec_x**2+vec_y**2+vec_z**2)
res[i,j,0]=vec_x/dist
res[i,j,1]=vec_y/dist
res[i,j,2]=vec_z/dist
return res
Задержка
%timeit res=normalized_vec(sensor_points,rayorigins)
np.float64: 208 ms ± 4.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
np.float32: 104 ms ± 515 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Решение Numba с предварительно выделенной памятью
Распределение памяти может быть очень дорогостоящим. Этот пример должен показать, почему иногда лучше избегать больших временных массивов, если это возможно.
@nb.njit(fastmath=True,error_model="numpy",parallel=True)
def normalized_vec(sensor_points,rayorigins,res):
for i in nb.prange(sensor_points.shape[0]):
for j in range(rayorigins.shape[0]):
vec_x=sensor_points[i,0]-rayorigins[j,0]
vec_y=sensor_points[i,1]-rayorigins[j,1]
vec_z=sensor_points[i,2]-rayorigins[j,2]
dist=np.sqrt(vec_x**2+vec_y**2+vec_z**2)
res[i,j,0]=vec_x/dist
res[i,j,1]=vec_y/dist
res[i,j,2]=vec_z/dist
return res
Задержка
res=np.empty((sensor_points.shape[0],rayorigins.shape[0],3),dtype=sensor_points.dtype)
%timeit res=normalized_vec(sensor_points,rayorigins)
np.float64: 66.6 ms ± 131 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
np.float32: 33.8 ms ± 375 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)