Вы можете переопределить аргумент analyzer в CountVectorizer, например:
from functools import partial
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
from nltk.util import skipgrams
# Pre-configured skip-gram extractor handed to the analyzer below.
skip4_bigrams = partial(
    skipgrams,
    n=2,  # emit pairs of tokens
    k=4,  # allow up to four skipped tokens between the two
)
def my_analyzer(text: str):
    """Tokenize ``text`` and return its skip-bigrams.

    Meant to be passed as the ``analyzer`` callable of ``CountVectorizer``,
    replacing the vectorizer's built-in tokenization and n-gram extraction.

    Args:
        text: One raw document.

    Returns:
        The result of ``skip4_bigrams`` applied to the ``word_tokenize``
        tokens of ``text``; each item becomes one vocabulary feature.
    """
    return skip4_bigrams(word_tokenize(text))
# Two-sentence toy corpus used to build the vocabulary.
x_train = [
    'Apply singular value decomposition to obtain word embeddings and compare with word2vec',
    'This is another sentence with singular word thingy',
]

# A callable analyzer makes the vectorizer use my_analyzer's output as features.
v = CountVectorizer(analyzer=my_analyzer)
v.fit(x_train)

# Mapping of each feature to its column index.
print(v.vocabulary_)
[вывод]:
{('Apply', 'singular'): 2,
('Apply', 'value'): 4,
('Apply', 'decomposition'): 0,
('Apply', 'to'): 3,
('Apply', 'obtain'): 1,
('singular', 'value'): 47,
('singular', 'decomposition'): 43,
('singular', 'to'): 46,
('singular', 'obtain'): 44,
('singular', 'word'): 48,
('value', 'decomposition'): 54,
('value', 'to'): 57,
('value', 'obtain'): 56,
('value', 'word'): 58,
('value', 'embeddings'): 55,
('decomposition', 'to'): 23,
('decomposition', 'obtain'): 22,
('decomposition', 'word'): 24,
('decomposition', 'embeddings'): 21,
('decomposition', 'and'): 20,
('to', 'obtain'): 52,
('to', 'word'): 53,
('to', 'embeddings'): 51,
('to', 'and'): 49,
('to', 'compare'): 50,
('obtain', 'word'): 38,
('obtain', 'embeddings'): 36,
('obtain', 'and'): 34,
('obtain', 'compare'): 35,
('obtain', 'with'): 37,
('word', 'embeddings'): 65,
('word', 'and'): 63,
('word', 'compare'): 64,
('word', 'with'): 67,
('word', 'word2vec'): 68,
('embeddings', 'and'): 25,
('embeddings', 'compare'): 26,
('embeddings', 'with'): 27,
('embeddings', 'word2vec'): 28,
('and', 'compare'): 10,
('and', 'with'): 11,
('and', 'word2vec'): 12,
('compare', 'with'): 18,
('compare', 'word2vec'): 19,
('with', 'word2vec'): 62,
('This', 'is'): 6,
('This', 'another'): 5,
('This', 'sentence'): 7,
('This', 'with'): 9,
('This', 'singular'): 8,
('is', 'another'): 29,
('is', 'sentence'): 30,
('is', 'with'): 32,
('is', 'singular'): 31,
('is', 'word'): 33,
('another', 'sentence'): 13,
('another', 'with'): 16,
('another', 'singular'): 14,
('another', 'word'): 17,
('another', 'thingy'): 15,
('sentence', 'with'): 41,
('sentence', 'singular'): 39,
('sentence', 'word'): 42,
('sentence', 'thingy'): 40,
('with', 'singular'): 59,
('with', 'word'): 61,
('with', 'thingy'): 60,
('singular', 'thingy'): 45,
('word', 'thingy'): 66}