Решение с использованием Numba
- Распараллелено (на очень маленьких выборках, например (24x24) распараллеленная версия будет медленнее из-за накладных расходов на создание потоков)
- Внутренний цикл SIMD-векторизован
Exmaple
import numpy as np
import numba as nb
#Debug output for SIMD-vectorization
import llvmlite.binding as llvm
llvm.set_option('', '--debug-only=loop-vectorize')
########################################
#Your solution
#You can also use Numba on this, but apart from parallization
#it is often better to write out the inner loop
def f_cdist(X1, X2):
res = np.zeros(shape=(X1.shape[0], X2.shape[0]), dtype=np.float64)
for ix1 in range(X1.shape[0]):
for ix2 in range(X2.shape[0]):
res[ix1, ix2] = np.abs(X1[ix1, :] - X2[ix2, :]).sum()
return res
@nb.njit(fastmath=True,parallel=True)
def f_cdist_nb(X1, X2):
#Some safety, becuase there is no bounds-checking
assert X1.shape[1]==X2.shape[1]
res = np.empty(shape=(X1.shape[0], X2.shape[0]), dtype=X1.dtype)
for ix1 in nb.prange(X1.shape[0]):
for ix2 in range(X2.shape[0]):
#Writing out the inner loop often leads to better performance
sum=0.
for i in range(X1.shape[1]):
sum+=np.abs(X1[ix1, i] - X2[ix2, i])
res[ix1, ix2] = sum
return res
Perfomance
from scipy import spatial
#4096x1024
X1=np.random.rand(4096,1024)
X2=np.random.rand(4096,1024)
res1=f_cdist_nb(X1,X2)
res2=f_cdist(X1,X2)
res3=spatial.distance.cdist(X1, X2, 'cityblock')
#Check the results
np.allclose(res1,res2)
True
np.allclose(res1,res3)
True
%timeit res1=f_cdist_nb(X1,X2)
1.38 s ± 64.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res2=f_cdist(X1,X2)
1min 25s ± 483 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res3=spatial.distance.cdist(X1, X2, 'cityblock')
17.6 s ± 18.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#1024x1024
X1=np.random.rand(1024,1024)
X2=np.random.rand(1024,1024)
%timeit res1=f_cdist_nb(X1,X2)
63.5 ms ± 3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit res3=spatial.distance.cdist(X1, X2, 'cityblock')
1.09 s ± 3.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#512x512
X1=np.random.rand(512,512)
X2=np.random.rand(512,512)
%timeit res1=f_cdist_nb(X1,X2)
4.91 ms ± 280 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit res3=spatial.distance.cdist(X1, X2, 'cityblock')
130 ms ± 150 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Редактировать: оптимизированная для рук версия Numba
#Unroll and Jam loops
@nb.njit(fastmath=True,parallel=True)
def f_cdist_nb_3(X1, X2):
assert X1.shape[1]==X2.shape[1]
res = np.empty(shape=(X1.shape[0], X2.shape[0]), dtype=X1.dtype)
for ix1 in nb.prange(X1.shape[0]//4):
for ix2 in range(X2.shape[0]//4):
sum_1,sum_2,sum_3,sum_4,sum_5,sum_6 =0.,0.,0.,0.,0.,0.
sum_7,sum_8,sum_9,sum_10,sum_11,sum_12=0.,0.,0.,0.,0.,0.
sum_13,sum_14,sum_15,sum_16=0.,0.,0.,0.
for i in range(X1.shape[1]):
sum_1+=np.abs(X1[ix1*4+0, i] - X2[ix2*4+0, i])
sum_2+=np.abs(X1[ix1*4+0, i] - X2[ix2*4+1, i])
sum_3+=np.abs(X1[ix1*4+0, i] - X2[ix2*4+2, i])
sum_4+=np.abs(X1[ix1*4+0, i] - X2[ix2*4+3, i])
sum_5+=np.abs(X1[ix1*4+1, i] - X2[ix2*4+0, i])
sum_6+=np.abs(X1[ix1*4+1, i] - X2[ix2*4+1, i])
sum_7+=np.abs(X1[ix1*4+1, i] - X2[ix2*4+2, i])
sum_8+=np.abs(X1[ix1*4+1, i] - X2[ix2*4+3, i])
sum_9+=np.abs(X1[ix1*4+2, i] - X2[ix2*4+0, i])
sum_10+=np.abs(X1[ix1*4+2, i] - X2[ix2*4+1, i])
sum_11+=np.abs(X1[ix1*4+2, i] - X2[ix2*4+2, i])
sum_12+=np.abs(X1[ix1*4+2, i] - X2[ix2*4+3, i])
sum_13+=np.abs(X1[ix1*4+3, i] - X2[ix2*4+0, i])
sum_14+=np.abs(X1[ix1*4+3, i] - X2[ix2*4+1, i])
sum_15+=np.abs(X1[ix1*4+3, i] - X2[ix2*4+2, i])
sum_16+=np.abs(X1[ix1*4+3, i] - X2[ix2*4+3, i])
res[ix1*4+0, ix2*4+0] = sum_1
res[ix1*4+0, ix2*4+1] = sum_2
res[ix1*4+0, ix2*4+2] = sum_3
res[ix1*4+0, ix2*4+3] = sum_4
res[ix1*4+1, ix2*4+0] = sum_5
res[ix1*4+1, ix2*4+1] = sum_6
res[ix1*4+1, ix2*4+2] = sum_7
res[ix1*4+1, ix2*4+3] = sum_8
res[ix1*4+2, ix2*4+0] = sum_9
res[ix1*4+2, ix2*4+1] = sum_10
res[ix1*4+2, ix2*4+2] = sum_11
res[ix1*4+2, ix2*4+3] = sum_12
res[ix1*4+3, ix2*4+0] = sum_13
res[ix1*4+3, ix2*4+1] = sum_14
res[ix1*4+3, ix2*4+2] = sum_15
res[ix1*4+3, ix2*4+3] = sum_16
#Rest of the loop
for ix1 in range(X1.shape[0]//4*4,X1.shape[0]):
for ix2 in range(X2.shape[0]):
sum_1=0.
for i in range(X1.shape[1]):
sum_1+=np.abs(X1[ix1, i] - X2[ix2, i])
res[ix1, ix2] = sum_1
for ix1 in range(X1.shape[0]):
for ix2 in range(X2.shape[0]//4*4,X2.shape[0]):
sum_1=0.
for i in range(X1.shape[1]):
sum_1+=np.abs(X1[ix1, i] - X2[ix2, i])
res[ix1, ix2] = sum_1
return res
Задержка
#4096x1024
X1=np.random.rand(4096,1024)
X2=np.random.rand(4096,1024)
res1=f_cdist_nb(X1,X2)
res2=f_cdist_nb_3(X1,X2)
res3=spatial.distance.cdist(X1, X2, 'cityblock')
#Check the results
print(np.allclose(res1,res2))
print(np.allclose(res1,res3))
%timeit res1=f_cdist_nb(X1,X2)
1.6 s ± 199 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res2=f_cdist_nb_3(X1,X2)
497 ms ± 50.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res3=spatial.distance.cdist(X1, X2, 'cityblock')
17.7 s ± 118 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)