Не уверен, есть ли способ обойти хотя бы один DataFrame
, но вот один из вариантов, который может ускорить процесс.Он допускает случайное сравнение FirstName с LastName, хотя этого можно избежать, добавив уникальный префикс к значениям (например, «@» для имени и «&» для фамилии)
import numpy as np
s1 = [set(x) for x in df1.values]
s2 = [set(x) for x in df2.values]
masks = np.reshape([len(x & y) >= 3 for x in s1 for y in s2], (len(df1), -1))
concat_all = [df2[m] for m in masks]
Выход concat_all
[ FirstName LastName Birthday ResidenceZip
0 John Doe 1/1/2000 99999
1 John Doe 1/1/2000 99999
2 John Doex 1/1/2000 99999,
FirstName LastName Birthday ResidenceZip
5 Rob A 9/9/2009 19499]
Время
def Alollz(df1, df2):
s1 = [set(x) for x in df1.values]
s2 = [set(x) for x in df2.values]
masks = np.reshape([len(x & y) >= 3 for x in s1 for y in s2], (len(df1), -1))
concat_all = [df2[m] for m in masks]
return concat_all
def SharpObject(df1, df2):
concat_all = []
for i, row in df1.iterrows():
c = {'ResidenceZip': row['ResidenceZip'], 'FirstName':row['FirstName'],
'LastName': row['LastName'],'Birthday': row['Birthday']}
df2['count'] = df2.apply(lambda x: partialMatch(x, c), axis = 1)
x1 = df2[df2['count']>=3]
concat_all.append(x1)
return concat_all
%timeit Alollz(df1, df2)
#785 µs ± 5.26 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit SharpObject(df1, df2)
#3.56 ms ± 44.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
И больше:
# you should never append dfs like this in a loop
for i in range(7):
df1 = df1.append(df1)
df2 = df2.append(df2)
%timeit Alollz(df1, df2)
#132 ms ± 248 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit SharpObject(df1, df2)
#6.88 s ± 11.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)