Я выполняю отбор признаков с помощью критерия хи-квадрат на Python, однако у меня возникла проблема с последним блоком кода. Я использую набор данных, состоящий только из категориальных переменных: в исходном наборе данных столбцы — это имена переменных, а строки содержат значения 'yes' и 'no' для каждого наблюдения.
Вот ошибка:
TypeError Traceback (most recent call last)
<ipython-input-14-3b4cc24c7499> in <module>
1 fs = SelectKBest(score_func=chi2, k='all')
----> 2 fs.fit(X_train, y_train)
3 X_train_fs = fs.transform(X_train)
4 X_test_fs = fs.transform(X_test)
~\Documents\Nueva carpeta\lib\site-packages\sklearn\feature_selection\univariate_selection.py in fit(self, X, y)
347
348 self._check_params(X, y)
--> 349 score_func_ret = self.score_func(X, y)
350 if isinstance(score_func_ret, (list, tuple)):
351 self.scores_, self.pvalues_ = score_func_ret
~\Documents\Nueva carpeta\lib\site-packages\sklearn\feature_selection\univariate_selection.py in chi2(X, y)
213 # numerical stability.
214 X = check_array(X, accept_sparse='csr')
--> 215 if np.any((X.data if issparse(X) else X) < 0):
216 raise ValueError("Input X must be non-negative.")
217
TypeError: '<' not supported between instances of 'numpy.ndarray' and 'int'
А вот код, который я сейчас использую:
# example of chi squared feature selection for categorical data
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from matplotlib import pyplot
# load the dataset
def load_dataset(filename, header=None):
    """Load a CSV file and split it into input features and a target column.

    The last column is taken as the target ``y``; every other column is an
    input feature in ``X``.

    Args:
        filename: Path (or any source accepted by ``pandas.read_csv``) of
            the CSV file to read.
        header: Forwarded to ``pandas.read_csv``. The default ``None``
            treats every row, including the first, as data. Pass ``0``
            when the first row holds the column names so that it is not
            read as an observation.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a 2-D NumPy array of strings and
        ``y`` is a 1-D NumPy array with the values of the last column.
    """
    # Load the dataset as a pandas DataFrame and take its NumPy array.
    dataset = read_csv(filename, header=header).values
    # Split into input (X) and output (y) variables.
    X = dataset[:, :-1]
    y = dataset[:, -1]
    # Format all input fields as strings so they are treated as categories.
    X = X.astype(str)
    return X, y
# prepare input data
def prepare_inputs(X_train, X_test):
    """Ordinal-encode the categorical input features.

    The encoder is fitted on ``X_train`` only and then applied to both
    splits, so the test data does not influence the learned mapping.

    Args:
        X_train: Training input features.
        X_test: Test input features.

    Returns:
        A tuple ``(X_train_enc, X_test_enc)`` with the encoded features.
    """
    # NOTE(review): categories that occur only in X_test are unknown to the
    # fitted encoder - confirm how the installed scikit-learn handles them.
    oe = OrdinalEncoder()
    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    X_test_enc = oe.transform(X_test)
    return X_train_enc, X_test_enc
# prepare target
def prepare_targets(y_train, y_test):
    """Label-encode the target variable.

    The encoder is fitted on ``y_train`` only and then applied to both
    splits.

    Args:
        y_train: Training target values.
        y_test: Test target values.

    Returns:
        A tuple ``(y_train_enc, y_test_enc)`` with the encoded targets.
    """
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc
# feature selection
def select_features(X_train, y_train, X_test):
    """Score every input feature against the target with the chi-squared test.

    ``k='all'`` keeps every feature, so the transformed arrays contain the
    same columns as the inputs; the fitted selector is returned as well.

    chi2 checks ``X < 0`` and rejects negative values, so the inputs must
    already be encoded as non-negative numbers (for example the output of
    ``prepare_inputs``). Passing the raw string features instead is the
    likely cause of ``TypeError: '<' not supported between instances of
    'numpy.ndarray' and 'int'`` raised from ``fs.fit``.

    Args:
        X_train: Encoded training input features.
        y_train: Training target values.
        X_test: Encoded test input features.

    Returns:
        A tuple ``(X_train_fs, X_test_fs, fs)``: the transformed training
        features, the transformed test features, and the fitted
        ``SelectKBest`` instance.
    """
    fs = SelectKBest(score_func=chi2, k='all')
    # Fit on the training split only; the test split is just transformed.
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
Заранее спасибо