Вот одно из решений, использующее itertools.chain и collections.Counter:
import pandas as pd
from collections import Counter
from itertools import chain
# Sample data: short texts, each containing one or more "#tag" words.
s = pd.Series([
    'This is an example #tag1',
    'This too is an example #tag1 #tag2',
    'Yup, still an example #tag1 #tag1 #tag3',
])


def _tags_in(text):
    """Return the set of '#'-prefixed words in ``text``, with the '#' removed."""
    found = set()
    for word in text.split():
        if word.startswith('#'):
            found.add(word[1:])
    return found


# One set per entry, so a tag repeated inside a single entry counts once.
tags = s.map(_tags_in)

# Flatten the per-entry sets and tally how many entries mention each tag.
res = Counter(chain.from_iterable(tags))
print(res)
Counter({'tag1': 3, 'tag2': 1, 'tag3': 1})
Тест производительности
Решение на collections.Counter примерно в 2 раза быстрее, чем pd.Series.str.extractall, на большой серии:
import pandas as pd
from collections import Counter
from itertools import chain
# Three-entry sample used as the seed for the benchmark data.
s = pd.Series(
    [
        'This is an example #tag1',
        'This too is an example #tag1 #tag2',
        'Yup, still an example #tag1 #tag1 #tag3',
    ]
)
def hal(s):
    """Count, per hashtag, how many entries of ``s`` contain it (pandas-only).

    Args:
        s: pandas Series of text entries.

    Returns:
        pandas Series indexed by hashtag *including* its leading '#'; each
        value is the number of entries in which that tag appears. A tag
        repeated within one entry is counted once for that entry.
    """
    # One row per regex match; the index is (entry label, match number) and
    # column 0 holds the single capture group, i.e. the tag text.
    matches = s.str.extractall(r'(\#\w+)')
    # Turn the entry label into a column so duplicates are detected per
    # (entry, tag) pair rather than across the whole Series.
    per_entry = matches.reset_index(level=0).drop_duplicates()
    return per_entry[0].value_counts()
def jp(s):
    """Count, per hashtag, how many entries of ``s`` contain it.

    A hashtag is a whitespace-delimited word that starts with '#' and has at
    least one character after it; the '#' is stripped from the reported tag.

    Args:
        s: pandas Series of text entries. Non-string entries (e.g. None/NaN)
            are treated as containing no tags.

    Returns:
        collections.Counter mapping tag (without '#') to the number of
        entries in which it appears. A tag repeated within one entry is
        counted once for that entry.
    """
    def extract_tags(text):
        # Missing values are not strings and have no .split(); skip them.
        if not isinstance(text, str):
            return set()
        # A set collapses repeats within one entry. The length check drops a
        # bare "#", which would otherwise be reported as an empty tag ''.
        return {word[1:] for word in text.split()
                if word.startswith('#') and len(word) > 1}

    return Counter(chain.from_iterable(s.map(extract_tags)))
# Repeat the current `s` 100000 times, renumbering the index from 0, so the
# benchmark below runs on a large Series.
s = pd.concat([s]*100000, ignore_index=True)
# IPython-only %timeit magics (not valid in a plain .py script); the trailing
# comments record the timings reported alongside this snippet.
%timeit hal(s) # 2.76 s per loop
%timeit jp(s) # 1.25 s per loop