ValueError: Number of features of the model must match the input. Model n_features is 256 and input n_features is 240

I get this error when trying to make a prediction with my model. The model trains fine, but it raises the error at prediction time. I built the model in scikit-learn. I know there are plenty of similar questions and I have tried many of the suggestions, but I have not found a solution. Here are the relevant parts of my code.

import pandas as pd

data_set = pd.read_csv('train.csv')
### Drop columns that are mostly null.
data_set.drop(['Alley','MiscFeature','Fence','PoolQC','FireplaceQu'], axis=1, inplace=True)
### Convert year columns to datetime
data_set['YearBuilt']= pd.to_datetime(data_set['YearBuilt'],format='%Y')
data_set['YearRemodAdd'] = pd.to_datetime(data_set['YearRemodAdd'],format='%Y')
data_set['GarageYrBlt'] = pd.to_datetime(data_set['GarageYrBlt'],format='%Y')
### Merge month and year columns into a new column
data_set['SoldYr'] = data_set['YrSold'].astype('str')+'-'+data_set['MoSold'].astype('str')
### Convert new column to datetime
data_set['SoldYr'] = pd.to_datetime(data_set['SoldYr'])
### create new columns from year columns
from datetime import timedelta
days = data_set['SoldYr']-data_set['YearBuilt']
data_set['Age'] = days/timedelta(days=365)
data_set['Age'] = data_set['Age'].round(2)

days = data_set['SoldYr']-data_set['GarageYrBlt']
data_set['GarageAge'] = days/timedelta(days=365)
data_set['GarageAge'] = data_set['GarageAge'].round(2)

days = data_set['SoldYr']-data_set['YearRemodAdd']
data_set['YearRemodAge'] = days/timedelta(days=365)
data_set['YearRemodAge'] = data_set['YearRemodAge'].round(2)

### Drop the year columns after creating the new features
data_set.drop(['YearBuilt','YearRemodAdd','GarageYrBlt','MoSold','YrSold','SoldYr'],axis=1, inplace=True)

data_labels = data_set['SalePrice'].copy()
data_set.drop(['Id','SalePrice'], axis=1, inplace=True)

###Create dataframe of categorical variables. 
data_cat = data_set.drop(['LotFrontage','LotArea','MasVnrArea','BsmtUnfSF','TotalBsmtSF',
                             '1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFinSF1','BsmtFinSF2','GarageArea',
                             'WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal',
                             'Age','GarageAge','YearRemodAge'], axis=1)

###Create dataframe of numerical variables.
data_num = data_set[['LotArea','BsmtUnfSF','TotalBsmtSF',
                         '1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFinSF1','BsmtFinSF2','GarageArea',
                         'WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']].copy()

###Create dataframe of continuous variable
data_cont = data_set[['LotFrontage','MasVnrArea','Age','GarageAge','YearRemodAge']].copy()

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
cat_pipeline = Pipeline([('imputer',SimpleImputer(strategy='most_frequent')),('ohe',OneHotEncoder())])

from sklearn.preprocessing import StandardScaler
cont_pipeline = Pipeline([('imputer',SimpleImputer(strategy='most_frequent')),('scaler',StandardScaler())])

num_pipeline = Pipeline([('imputer',SimpleImputer(strategy='median')),('scaler',StandardScaler())])

from sklearn.compose import ColumnTransformer
num_attr = list(data_num.columns[data_num.dtypes == 'int64'])
cont_attr = list(data_cont.columns[data_cont.dtypes == 'float64'])
cat_attr = list(data_cat.columns[data_cat.dtypes == 'object'])

full_pipeline = ColumnTransformer([('num', num_pipeline, num_attr)
                                   ,('cont',cont_pipeline, cont_attr)
                                  ,('cat',cat_pipeline, cat_attr)])

data_prepared = full_pipeline.fit_transform(data_set)
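
### My own side check (an assumption of mine, not part of the original recipe):
### the number of columns OneHotEncoder produces depends on the categories it
### sees during fit, so fitting it on a different subset of rows can change the
### output width.
demo = pd.DataFrame({'col': ['a', 'b', 'c', 'c']})
OneHotEncoder().fit_transform(demo).shape           # (4, 3) -> one column per category seen
OneHotEncoder().fit_transform(demo.iloc[:1]).shape  # (1, 1) -> fewer categories, fewer columns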

from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(max_features='auto')
dtr.fit(data_prepared,data_labels)
###Output after model training
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
                          max_leaf_nodes=None, min_impurity_decrease=0.0,
                          min_impurity_split=None, min_samples_leaf=1,
                          min_samples_split=2, min_weight_fraction_leaf=0.0,
                          presort=False, random_state=None, splitter='best')

some_data = data_set.iloc[:5]
some_label = data_labels.iloc[:5]
some_data_prepared = full_pipeline.fit_transform(some_data)
dtr.predict(some_data_prepared)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-115-ef297a636aa8> in <module>
----> 1 dtr.predict(test_data_prepared)

~\Anaconda3\lib\site-packages\sklearn\tree\tree.py in predict(self, X, check_input)
    428         """
    429         check_is_fitted(self, 'tree_')
--> 430         X = self._validate_X_predict(X, check_input)
    431         proba = self.tree_.predict(X)
    432         n_samples = X.shape[0]

~\Anaconda3\lib\site-packages\sklearn\tree\tree.py in _validate_X_predict(self, X, check_input)
    400                              "match the input. Model n_features is %s and "
    401                              "input n_features is %s "
--> 402                              % (self.n_features_, n_features))
    403 
    404         return X

ValueError: Number of features of the model must match the input. Model n_features is 256 and input n_features is 240 
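
For reference, as far as I understand the error just means that the matrix passed to predict() has a different number of columns from the one the model was fitted on. Here is a tiny self-contained reproduction on random data (made-up numbers, nothing to do with my dataset):

import numpy as np
from sklearn.tree import DecisionTreeRegressor

X_train = np.random.rand(100, 256)   # model fitted on 256 features
y_train = np.random.rand(100)
X_test = np.random.rand(5, 240)      # but asked to predict on 240 features

model = DecisionTreeRegressor().fit(X_train, y_train)
model.predict(X_test)                # raises the same kind of ValueError (256 vs 240)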

Up to that point everything was going well. A regression model was also trained, but it did not predict any values and did not give any errors. The code and the dataset with the data descriptions are available on GitHub, link below. https://github.com/sudhanshusaurav/House-Prices-Advanced-Regression-Techniques Please help as soon as you can.
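
One thing I am not sure about: I call full_pipeline.fit_transform() a second time on the 5-row slice, which (if I understand it correctly) refits the OneHotEncoder on whatever categories happen to be in those 5 rows and could therefore produce fewer columns. A minimal sketch of what I would try instead, reusing the already-fitted pipeline with transform() only (this is my guess, not something I have confirmed yet):

some_data = data_set.iloc[:5]
some_label = data_labels.iloc[:5]

### transform() reuses the encoders/scalers fitted on the full training set,
### so the prepared slice keeps the same 256 columns the model was trained on.
some_data_prepared = full_pipeline.transform(some_data)
print(dtr.predict(some_data_prepared))
print(list(some_label))

For the real test.csv I would presumably also need OneHotEncoder(handle_unknown='ignore') so that categories unseen during training do not break transform(), but I am not certain that is the right approach.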
