Использование инструментов из сторонней библиотеки more_itertools:
Дано
import itertools as it
import collections as ct
import more_itertools as mit
# Sample corpus: one free-form sentence per entry. Punctuation, casing and
# typos ("4or", ":water", "&and") are part of the data and kept verbatim.
data = [
    "Ragu ate lunch but didnt have Water for drinks",
    "Rams ate lunch but didnt have Gatorade for drinks",
    "Saya ate lunch but didnt have :water for drinks",
    "Raghu ate lunch but didnt have water for drinks",
    "Hanu ate lunch but didnt have -water for drinks",
    "Wayu ate lunch but didnt have water for drinks",
    "Viru ate lunch but didnt have .water 4or drinks",
    "kk ate lunch & icecream but did have Water for drinks",
    "M ate lunch &and icecream but did have Gatorade for drinks",
    "Parker ate lunch icecream but didnt have :water for drinks",
    "Sassy ate lunch and icecream but didnt have water for drinks",
    "John ate lunch and icecream but didnt have -water for drinks",
    "Pokey ate lunch and icecream but didnt have Water for drinks",
    "Laila ate lunch and icecream but did have water 4or drinks",
]
Код
# Collect every word n-gram of length 3 up to the full sentence length,
# sentence by sentence, then count how often each n-gram occurs.
ngrams = []
for sentence in data:
    words = sentence.split()
    # Sentences shorter than 3 words give an empty range and add nothing.
    for n in range(3, len(words) + 1):
        # windowed() yields tuples, which are hashable and can be counted.
        ngrams.extend(mit.windowed(words, n))
counts = ct.Counter(ngrams)
# The five most frequent n-grams as {ngram_tuple: count}.
dict(counts.most_common(5))
Вывод
{('but', 'didnt', 'have'): 11,
('ate', 'lunch', 'but'): 7,
('lunch', 'but', 'didnt'): 7,
('ate', 'lunch', 'but', 'didnt'): 7,
('lunch', 'but', 'didnt', 'have'): 7}
В качестве альтернативы
# Tokenize each sentence once, then lazily chain the sliding windows of
# every size from 3 up to that sentence's own word count.
sentences = [sentence.split() for sentence in data]
ngrams = mit.flatten(
    mit.windowed(words, n)
    for words in sentences
    # Bound n by this sentence's length, not by the number of sentences:
    # windowed() pads with None when n exceeds the number of words, which
    # would add bogus None-padded n-grams to the counts.
    for n in range(3, len(words) + 1)
)
counts = ct.Counter(ngrams)
# The five most frequent n-grams as {ngram_tuple: count}.
dict(counts.most_common(5))