Question

Цель: предсказать результат на основе целочисленных (int) и категориальных (object) признаков с помощью sklearn.

Я использую следующий набор данных с Kaggle: Soccer Dataset

Библиотеки

scikit-learn == 0.22.1

Я создал конвейер (pipeline), который почти работает:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Build and fit a preprocessing + random-forest pipeline that predicts the
# `result` column of `total_df` from its int64 and object columns.

# Work on a copy so that `total_df` itself is not modified by the steps below.
df = total_df.copy()

# Remove rows with missing target
df.dropna(axis=0, subset=['result'], inplace=True)

# Separate target from predictors
y = df.result         
X = df.drop(['result'], axis=1)

# Break off validation set from training data (80% train / 20% test);
# random_state is fixed so the split is reproducible.
X_train_full, X_test_full, y_train, y_test = train_test_split(X, y,
                                                                train_size=0.8,
                                                                test_size=0.2,
                                                                random_state=0)

# Column labels grouped by dtype. The lists contain the label objects exactly
# as they are stored in `X.columns`.
# NOTE(review): if the frame was loaded through SQLAlchemy, these labels may be
# `sqlalchemy.sql.elements.quoted_name` instances rather than plain `str` --
# confirm with `{type(c) for c in X.columns}`; they are passed unchanged to the
# ColumnTransformer below.
integer_features = list(X.columns[X.dtypes == 'int64'])
#continuous_features = list(X.columns[X.dtypes == 'float64'])
categorical_features = list(X.columns[X.dtypes == 'object'])

# Keep selected columns only (float64 columns are left out, see above)
my_cols = categorical_features + integer_features
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# Integer columns: fill missing values with the most frequent value, then
# standardize.
integer_transformer = Pipeline(steps = [
   ('imputer', SimpleImputer(strategy = 'most_frequent')),
   ('scaler', StandardScaler())])

# Object columns: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown='ignore' avoids an error on categories that
# were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
   transformers=[
       ('ints', integer_transformer, integer_features),
       ('cat', categorical_transformer, categorical_features)])

# Full model: preprocessing followed by a RandomForestClassifier with default
# settings.
base = Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', RandomForestClassifier())])

# Preprocessing of training data, fit model
base.fit(X_train, y_train)

Я получил сообщение об ошибке:

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

Вот полная трассировка:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/sklearn/utils/__init__.py in _determine_key_type(key, accept_slice)
    255         try:
--> 256             return dtype_to_str[type(key)]
    257         except KeyError:

KeyError: <class 'sqlalchemy.sql.elements.quoted_name'>

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-13-702987dff390> in <module>
     47 
     48 # Preprocessing of training data, fit model
---> 49 base.fit(X_train, y_train)
     50 
     51 base.predict(X_test)

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    348             This estimator
    349         """
--> 350         Xt, fit_params = self._fit(X, y, **fit_params)
    351         with _print_elapsed_time('Pipeline',
    352                                  self._log_message(len(self.steps) - 1)):

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
    313                 message_clsname='Pipeline',
    314                 message=self._log_message(step_idx),
--> 315                 **fit_params_steps[name])
    316             # Replace the transformer of the step with the fitted
    317             # transformer. This is necessary when loading the transformer

/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    353 
    354     def __call__(self, *args, **kwargs):
--> 355         return self.func(*args, **kwargs)
    356 
    357     def call_and_shelve(self, *args, **kwargs):

/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    726     with _print_elapsed_time(message_clsname, message):
    727         if hasattr(transformer, 'fit_transform'):
--> 728             res = transformer.fit_transform(X, y, **fit_params)
    729         else:
    730             res = transformer.fit(X, y, **fit_params).transform(X)

/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
    514         self._validate_transformers()
    515         self._validate_column_callables(X)
--> 516         self._validate_remainder(X)
    517 
    518         result = self._fit_transform(X, y, _fit_transform_one)

/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _validate_remainder(self, X)
    316         if (hasattr(X, 'columns') and
    317                 any(_determine_key_type(cols) == 'str'
--> 318                     for cols in self._columns)):
    319             self._df_columns = X.columns
    320 

/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in <genexpr>(.0)
    316         if (hasattr(X, 'columns') and
    317                 any(_determine_key_type(cols) == 'str'
--> 318                     for cols in self._columns)):
    319             self._df_columns = X.columns
    320 

/opt/conda/lib/python3.7/site-packages/sklearn/utils/__init__.py in _determine_key_type(key, accept_slice)
    275     if isinstance(key, (list, tuple)):
    276         unique_key = set(key)
--> 277         key_type = {_determine_key_type(elt) for elt in unique_key}
    278         if not key_type:
    279             return None

/opt/conda/lib/python3.7/site-packages/sklearn/utils/__init__.py in <setcomp>(.0)
    275     if isinstance(key, (list, tuple)):
    276         unique_key = set(key)
--> 277         key_type = {_determine_key_type(elt) for elt in unique_key}
    278         if not key_type:
    279             return None

/opt/conda/lib/python3.7/site-packages/sklearn/utils/__init__.py in _determine_key_type(key, accept_slice)
    256             return dtype_to_str[type(key)]
    257         except KeyError:
--> 258             raise ValueError(err_msg)
    259     if isinstance(key, slice):
    260         if not accept_slice:

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

Любая помощь будет принята с благодарностью!

ПРАВКА: В сообщении об ошибке говорится: «допускается только скаляр, список или срез из одних целых чисел или одних строк, либо булева маска». При этом integer_features и categorical_features — это списки, содержащие только строковые имена столбцов.

Vishal Tennyson · Answer 1 · 06 мая 2020

Вы использовали списки для integer_features и categorical_features, тогда как трансформеру требуется тип Index.

# Take the column labels as pandas Index objects instead of Python lists.
categorical_features = X.select_dtypes(include="object").columns
# NOTE(review): exclude="object" selects every non-object column (e.g. float64
# as well), not only the int64 columns -- confirm this is intended.
integer_features = X.select_dtypes(exclude="object").columns

Это изменение должно устранить вашу ошибку. :)

thomaskolasa · Answer 2 · 06 мая 2020

Оказывается, в параметре transformers у ColumnTransformer нельзя использовать списки строковых имён столбцов для integer_features или categorical_features. Если заменить их списками числовых индексов столбцов, например integer_features = [5, 6] и categorical_features = [0, 1, 2, 3, 4], это должно сработать.

Какая «действительная спецификация столбцов» необходима для конвейера классификатора sklearn?

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Какая «действительная спецификация столбцов» необходима для конвейера классификатора sklearn?

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

Ответы [ 2 ]

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Похожие темы