TL;DR

Use ngrams or everygrams:
>>> from itertools import chain
>>> import pandas as pd
>>> from nltk import word_tokenize
>>> from nltk import FreqDist
>>> from nltk import everygrams
>>> df = pd.read_csv('x')
>>> df['Description']
0            Here is a sentence.
1    This is a foo bar sentence.
Name: Description, dtype: object
>>> df['Description'].map(word_tokenize)
0               [Here, is, a, sentence, .]
1    [This, is, a, foo, bar, sentence, .]
Name: Description, dtype: object
>>> sents = df['Description'].map(word_tokenize).tolist()
>>> FreqDist(list(chain(*[everygrams(sent, 1, 3) for sent in sents])))
FreqDist({('sentence',): 2, ('is', 'a'): 2, ('sentence', '.'): 2, ('is',): 2, ('.',): 2, ('a',): 2, ('Here', 'is', 'a'): 1, ('a', 'foo'): 1, ('a', 'sentence'): 1, ('bar', 'sentence', '.'): 1, ...})
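If you'd rather use ngrams directly, chaining the 1-, 2- and 3-grams yourself gives the same counts as everygrams(sent, 1, 3). A minimal sketch, reusing the tokenized sentences shown above (ngram_fd is just an illustrative name):

>>> from itertools import chain
>>> from nltk import ngrams, FreqDist
>>> # same token lists as produced by df['Description'].map(word_tokenize).tolist() above
>>> sents = [['Here', 'is', 'a', 'sentence', '.'],
...          ['This', 'is', 'a', 'foo', 'bar', 'sentence', '.']]
>>> # everygrams(sent, 1, 3) is equivalent to chaining the 1-, 2- and 3-grams of each sent
>>> ngram_fd = FreqDist(chain(*[ngrams(sent, n) for sent in sents for n in range(1, 4)]))
>>> ngram_fd[('is', 'a')]
2
>>> ngram_fd[('Here', 'is', 'a')]
1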