SciPy — самый быстрый!
Я провёл несколько тестов с приведённым выше кодом, а также с версией, которую нашёл у себя на компьютере; результаты и код см. ниже:
pearson 14.7597990757
sim_pearson 15.6806837987
scipy:pearsonr 0.451986019188
try:
import psyco
psyco.full()
except ImportError:
pass
from math import sqrt
def sim_pearson(set1, set2):
    """Return the Pearson correlation over the items two containers share.

    Only items that are present in both ``set1`` and ``set2`` contribute.

    Args:
        set1: Container that supports iteration, ``in`` and indexing by the
            iterated items (e.g. a dict mapping item -> numeric score).
        set2: Container of the same kind as ``set1``.

    Returns:
        The correlation coefficient as a float, or 0 when there are no
        shared items or when the product of the two variance terms is not
        positive (the coefficient is undefined in that case).
    """
    # Collect each shared item once, even if set1 yields it repeatedly.
    shared = dict.fromkeys(item for item in set1 if item in set2)

    # If nothing is shared there is nothing to correlate.
    if not shared:
        return 0

    # Use a float count: under Python 2, int / int truncates, which would
    # skew both the numerator and the variance terms for integer scores.
    n = float(len(shared))

    # Accumulate sums, sums of squares and the sum of products in one pass.
    sum1 = sum2 = 0.0
    sum_sq1 = sum_sq2 = 0.0
    sum_p = 0.0
    for item in shared:
        x = set1[item]
        y = set2[item]
        sum1 += x
        sum2 += y
        sum_sq1 += x * x
        sum_sq2 += y * y
        sum_p += x * y

    nom = sum_p - (sum1 * sum2) / n
    radicand = (sum_sq1 - sum1 ** 2 / n) * (sum_sq2 - sum2 ** 2 / n)

    # Rounding can push a zero variance slightly negative; sqrt() would
    # raise ValueError on that, so treat it like the zero-variance case.
    if radicand <= 0:
        return 0
    return nom / sqrt(radicand)
# from /899238/otsenka-shodstva-pirsona-kak-ya-mogu-optimizirovat-eto-dalshe
def pearson(v1, v2):
    """Compute the Pearson score for the keys that v1 and v2 have in common.

    Args:
        v1: Container that supports iteration, ``in`` and indexing by the
            iterated keys (e.g. a dict mapping key -> numeric value).
        v2: Container of the same kind as ``v1``.

    Returns:
        0.0 when the two containers share no keys; 1.0 when the product of
        the two variance terms is zero or negative; otherwise the
        correlation coefficient as a float.
    """
    # Pair up the values stored under every key found in both containers.
    pairs = []
    for key in v1:
        if key in v2:
            pairs.append((v1[key], v2[key]))

    count = len(pairs)
    if not count:
        return 0.0

    # Float accumulators keep all later divisions in floating point.
    total_a = total_b = 0.0
    squares_a = squares_b = 0.0
    cross = 0.0
    for a, b in pairs:
        total_a += a
        total_b += b
        squares_a += a * a
        squares_b += b * b
        cross += a * b

    # Calculate Pearson score
    numerator = cross - (total_a * total_b / count)
    spread_a = squares_a - total_a ** 2 / count
    spread_b = squares_b - total_b ** 2 / count

    # Clamp at zero so a slightly negative rounding error never reaches sqrt.
    radicand = max(spread_a * spread_b, 0)
    if not radicand:
        return 1.0
    return numerator / sqrt(radicand)
if __name__ == "__main__":
    # Micro-benchmark: time the two pure-Python implementations against
    # scipy.stats.pearsonr on 1000 random integer sample pairs.
    import timeit

    # The setup code runs once per Timer, outside the timed statement.
    # pearson/sim_pearson look values up by key, so they get dicts keyed by
    # sample position; pearsonr takes the raw sequences. Passing the lists
    # to the pure-Python functions would make every `in` test a linear scan
    # and would use the sample values as indices, so the three functions
    # would not be correlating the same data.
    tsetup = """
from random import randrange
from __main__ import pearson, sim_pearson
from scipy.stats import pearsonr
v1 = [randrange(0,1000) for x in range(1000)]
v2 = [randrange(0,1000) for x in range(1000)]
d1 = dict(enumerate(v1))
d2 = dict(enumerate(v2))
"""
    t1 = timeit.Timer(stmt="pearson(d1,d2)", setup=tsetup)
    t2 = timeit.Timer(stmt="sim_pearson(d1,d2)", setup=tsetup)
    t3 = timeit.Timer(stmt="pearsonr(v1,v2)", setup=tsetup)

    # Number of executions of each statement per measurement.
    tt = 1000

    # A single pre-formatted argument prints the same text whether print
    # is a statement (Python 2) or a function (Python 3).
    print('pearson %s' % t1.timeit(tt))
    print('sim_pearson %s' % t2.timeit(tt))
    print('scipy:pearsonr %s' % t3.timeit(tt))