Цель: Предсказать результат на основе int и объектных функций с помощью sklearn.
Я использую следующий набор данных от Kaggle: Soccer Dataset
Вот мой блокнот: Блокнот Kaggle
У меня есть создал конвейер, который почти работает:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
# Read the data
df = total_df.copy()
# Remove rows with missing target
df.dropna(axis=0, subset=['result'], inplace=True)
# Separate target from predictors
y = df.result
X = df.drop(['result'], axis=1)
# Break off validation set from training data
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y,
integer_features = list(X.columns[X.dtypes == 'int64'])
#continuous_features = list(X.columns[X.dtypes == 'float64'])
categorical_features = list(X.columns[X.dtypes == 'object'])
# Keep selected columns only
my_cols = categorical_features + integer_features
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()
integer_transformer = Pipeline(steps = [
('imputer', SimpleImputer(strategy = 'most_frequent')),
('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
preprocessor = ColumnTransformer(
('ints', integer_transformer, integer_features),
('cat', categorical_transformer, categorical_features)])
base = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', RandomForestClassifier())])
# Preprocessing of training data, fit model
base.fit(X_train, y_train)
Я получил сообщение об ошибке:
ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed
Вот полная трассировка:
KeyError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/sklearn/utils/__init__.py in _determine_key_type(key, accept_slice)
255 try:
--> 256 return dtype_to_str[type(key)]
257 except KeyError:
KeyError: <class 'sqlalchemy.sql.elements.quoted_name'>
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-13-702987dff390> in <module>
48 # Preprocessing of training data, fit model
---> 49 base.fit(X_train, y_train)
51 base.predict(X_test)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
348 This estimator
349 """
--> 350 Xt, fit_params = self._fit(X, y, **fit_params)
351 with _print_elapsed_time('Pipeline',
352 self._log_message(len(self.steps) - 1)):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
313 message_clsname='Pipeline',
314 message=self._log_message(step_idx),
--> 315 **fit_params_steps[name])
316 # Replace the transformer of the step with the fitted
317 # transformer. This is necessary when loading the transformer
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
357 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
726 with _print_elapsed_time(message_clsname, message):
727 if hasattr(transformer, 'fit_transform'):
--> 728 res = transformer.fit_transform(X, y, **fit_params)
729 else:
730 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
514 self._validate_transformers()
515 self._validate_column_callables(X)
--> 516 self._validate_remainder(X)
518 result = self._fit_transform(X, y, _fit_transform_one)
/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _validate_remainder(self, X)
316 if (hasattr(X, 'columns') and
317 any(_determine_key_type(cols) == 'str'
--> 318 for cols in self._columns)):
319 self._df_columns = X.columns
/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in <genexpr>(.0)
316 if (hasattr(X, 'columns') and
317 any(_determine_key_type(cols) == 'str'
--> 318 for cols in self._columns)):
319 self._df_columns = X.columns
/opt/conda/lib/python3.7/site-packages/sklearn/utils/__init__.py in _determine_key_type(key, accept_slice)
275 if isinstance(key, (list, tuple)):
276 unique_key = set(key)
--> 277 key_type = {_determine_key_type(elt) for elt in unique_key}
278 if not key_type:
279 return None
/opt/conda/lib/python3.7/site-packages/sklearn/utils/__init__.py in <setcomp>(.0)
275 if isinstance(key, (list, tuple)):
276 unique_key = set(key)
--> 277 key_type = {_determine_key_type(elt) for elt in unique_key}
278 if not key_type:
279 return None
/opt/conda/lib/python3.7/site-packages/sklearn/utils/__init__.py in _determine_key_type(key, accept_slice)
256 return dtype_to_str[type(key)]
257 except KeyError:
--> 258 raise ValueError(err_msg)
259 if isinstance(key, slice):
260 if not accept_slice:
ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed
Любая помощь будет принята с благодарностью!
РЕДАКТИРОВАТЬ: Сообщение об ошибке «разрешен только скаляр, список или фрагмент всех целых чисел или всех строк или логическая маска». integer_features
и categorical_features
- это списки, содержащие только строковые имена столбцов.