Вместо того чтобы обучать два отдельных TfidfVectorizer и затем пытаться объединить их результаты, объедините текстовые столбцы построчно в одну строку и передайте результат одному TfidfVectorizer.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
# Toy dataset: two text columns plus one integer column (1..4).
fruit = ['apple', 'banana', 'pear', 'kiwi']
vegetables = ['tomatoes', 'peppers', 'broccoli', 'carrots']
df = pd.DataFrame(
    data=dict(Fruit=fruit, Vegetables=vegetables, Integers=np.arange(1, 5)),
)
# Select text data and join them along each row
def prepare_text_data(data):
    """Join the text columns of ``data`` into one string per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing one or more text columns. Columns whose dtype is
        not object/string (e.g. integers) are ignored.

    Returns
    -------
    pandas.Series
        One entry per row of ``data``: the row's text values joined with a
        single space, in column order.
    """
    # Inspect the frame that was passed in rather than a module-level ``df``,
    # so the transformer also works on frames other than the training one.
    # ``is_string_dtype`` accepts object columns as well as pandas' dedicated
    # string dtypes.
    text_cols = [
        col for col in data.columns
        if pd.api.types.is_string_dtype(data[col].dtype)
    ]
    return data[text_cols].apply(' '.join, axis=1)
# Two-step pipeline: first collapse the text columns of the frame into one
# string per row, then vectorize those strings with a single TfidfVectorizer.
pipeline = Pipeline(steps=[
    ('text_selector',
     FunctionTransformer(prepare_text_data, validate=False)),
    ('vectorizer', TfidfVectorizer()),
])
# ``fit`` returns the pipeline itself, so there is no need to rebind the name.
pipeline.fit(df)
tfidf = pipeline.transform(df)
# Inspect the fitted vocabulary to confirm it holds every token from df
pipeline.named_steps['vectorizer'].vocabulary_
Out[39]:
{'apple': 0,
'tomatoes': 7,
'banana': 1,
'peppers': 6,
'pear': 5,
'broccoli': 2,
'kiwi': 4,
'carrots': 3}
# Dense view of the resulting TF-IDF matrix: 4 rows (one per row of df) and
# 8 columns (one per token in the TF-IDF vocabulary). ``toarray()`` is the
# documented conversion; the ``.A`` shorthand may be unavailable on newer
# SciPy sparse containers.
tfidf.toarray()
Out[40]:
array([[0.70710678, 0. , 0. , 0. , 0. ,
0. , 0. , 0.70710678],
[0. , 0.70710678, 0. , 0. , 0. ,
0. , 0.70710678, 0. ],
[0. , 0. , 0.70710678, 0. , 0. ,
0.70710678, 0. , 0. ],
[0. , 0. , 0. , 0.70710678, 0.70710678,
0. , 0. , 0. ]])