Данные
# Benchmark inputs: two frames with columns 'column_a' and 'column_b', built by
# repeating a 5-element pattern n times (5 * n rows each).
# n must be an integer: the float literal 10e3 makes `list * n` raise
# "TypeError: can't multiply sequence by non-int of type 'float'".
n = 10000

# Integer-valued columns; the values present in both columns are 2 and 3.
ints = pd.DataFrame({
    'column_a': [1, 2, 3, 4, 5] * n,
    'column_b': [2, 10, 9, 3, 8] * n,
})

# String-valued columns; the values present in both columns are 'b' and 'c'.
strings = pd.DataFrame({
    'column_a': ['a', 'b', 'c', 'd', 'e'] * n,
    'column_b': ['b', 'x', 'y', 'c', 'z'] * n,
})
Методы
def using_isin(df):  # @timgeb
    """Return the ``column_a`` entries whose value also appears in ``column_b``.

    The result is a Series selected from ``column_a`` with a boolean mask, so
    the original index labels and any repeated values are kept.
    """
    col_a = df['column_a']
    found_in_b = col_a.isin(df['column_b'])
    return col_a[found_in_b]
def using_isin_loc_tolist(df):  # @SandeepKadapa
    """Return the ``column_a`` entries whose value also appears in ``column_b``.

    ``column_b`` is first converted to a plain Python list, then the matching
    rows of ``column_a`` are picked with ``.loc`` (index and repeats are kept).
    """
    b_values = df['column_b'].tolist()
    row_mask = df['column_a'].isin(b_values)
    return df.loc[row_mask, 'column_a']
def using_melt_groupby(df):  # @W-B
    """Return an Index of the values that occur in more than one column.

    The frame is melted to long form (``variable`` = source column name,
    ``value`` = cell value); a value is kept when the number of distinct
    source columns it was seen in is greater than one.
    """
    long_form = df.melt()
    columns_per_value = long_form.groupby('value')['variable'].nunique()
    shared = columns_per_value.loc[lambda counts: counts > 1]
    return shared.index
def using_set_intersection(df):  # @GergesDib, @TBurgins
    """Return the set of values present in both ``column_a`` and ``column_b``."""
    a_values = set(df['column_a'])
    b_values = set(df['column_b'])
    return a_values & b_values
def using_set_intersection_map(df):  # @piRSquared
    """Return the set of values common to every column of ``df``.

    Iterating the frame yields its column labels; each column is fetched with
    ``df.get``, turned into a set, and all the sets are intersected.
    """
    column_sets = [set(df.get(label)) for label in df]
    return set.intersection(*column_sets)
def using_reduce_np_intersect(df):  # @JonClements
    """Return the values common to every column of ``df``.

    The frame's values are transposed so that each row of the array is one
    column of the frame, and ``np.intersect1d`` is folded across those rows.
    """
    columns_as_rows = df.values.T
    return reduce(np.intersect1d, columns_as_rows)
def using_np_any(df):  # @W-B
    """Return the ``column_a`` entries whose value also appears in ``column_b``.

    Every ``column_a`` value is compared with every ``column_b`` value by
    broadcasting, which builds a len(column_b) x len(column_a) boolean matrix,
    so memory and time grow with the product of the two lengths.
    """
    a_row = df['column_a'].values
    b_column = df['column_b'].values[:, None]
    # matches[i, j] is True when column_b[i] == column_a[j].
    matches = a_row == b_column
    # Collapse over column_b: keep column_a[j] if any column_b value matched it.
    return df['column_a'][np.any(matches, axis=0)]
Производительность, если столбцы содержат целые числа
%timeit -n 10 using_isin(ints)
977 µs ± 164 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit -n 10 using_reduce_np_intersect(ints)
1.31 ms ± 145 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit -n 10 using_set_intersection(ints)
1.54 ms ± 68.6 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit -n 10 using_set_intersection_map(ints)
1.59 ms ± 59.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit -n 10 using_isin_loc_tolist(ints)
2.39 ms ± 921 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit -n 10 using_melt_groupby(ints)
34.2 ms ± 988 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit -n 10 using_np_any(ints)
4.35 s ± 148 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Производительность, если столбцы содержат строки
%timeit -n 10 using_set_intersection_map(strings)
1.16 ms ± 35.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit -n 10 using_set_intersection(strings)
1.2 ms ± 71.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit -n 10 using_isin(strings)
1.69 ms ± 310 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit -n 10 using_isin_loc_tolist(strings)
2.15 ms ± 299 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit -n 10 using_melt_groupby(strings)
35.6 ms ± 1.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit -n 10 using_reduce_np_intersect(strings)
43 ms ± 1.98 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit -n 10 using_np_any(strings)
# too slow to count