Подробная информация о том, как Scikit-Learn рассчитывает TF-IDF, доступна здесь, и вот пример его реализации с использованием словесных n-грамм.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Train the vectorizer on a single reference sentence.
# ngram_range=(1, 2) builds a vocabulary of both unigrams and word bigrams.
text = "this is a simple example"
singleTFIDF = TfidfVectorizer(ngram_range=(1, 2)).fit([text])
singleTFIDF.vocabulary_  # show the word-matrix position pairs

# Analyse the training string - text
single = singleTFIDF.transform([text])
single.toarray()  # displays the resulting matrix - all values are equal because all terms are present

# Analyse three new strings with the trained vectorizer.
# Terms never seen during fit() are ignored, so the third string maps to an all-zero row.
doc_1 = ['is this example working', 'hopefully it is a good example', 'no matching words here']
query = singleTFIDF.transform(doc_1)
query.toarray()  # displays the resulting matrix - only matched terms have non-zero values

# Compute the cosine similarity between text and each string in doc_1 - the second
# string has only two matching terms, therefore it has a lower similarity value.
# cosine_similarity accepts sparse matrices directly; the deprecated `.A` shortcut
# (removed for SciPy sparse arrays) is not needed.
cos_similarity = cosine_similarity(single, query)
Вывод:
singleTFIDF.vocabulary_
Out[297]:
{'this': 5,
'is': 1,
'simple': 3,
'example': 0,
'this is': 6,
'is simple': 2,
'simple example': 4}
single.toarray()
Out[299]:
array([[0.37796447, 0.37796447, 0.37796447, 0.37796447, 0.37796447,
0.37796447, 0.37796447]])
query.toarray()
Out[311]:
array([[0.57735027, 0.57735027, 0. , 0. , 0. ,
0.57735027, 0. ],
[0.70710678, 0.70710678, 0. , 0. , 0. ,
0. , 0. ],
[0. , 0. , 0. , 0. , 0. ,
0. , 0. ]])
np.sum(np.square(query.toarray()), axis=1) # note how all rows with non-zero scores have been normalised to 1.
Out[3]: array([1., 1., 0.])
cos_similarity
Out[313]: array([[0.65465367, 0.53452248, 0. ]])