Я получаю эту ошибку при попытке прогнозирования с использованием модели. Модель обучена, но дает ошибку в прогнозе. Я построил модель в scikit учиться. Я знаю, что есть множество подобных вопросов и перепробовал много вариантов, но не нашел решения. Вот мои важные коды.
data_set = pd.read_csv('train.csv')
###dropped columns having highly null values.
data_set.drop(['Alley','MiscFeature','Fence','PoolQC','FireplaceQu'], axis= 1, inplace= True)
###Converts columns to datetime
data_set['YearBuilt']= pd.to_datetime(data_set['YearBuilt'],format='%Y')
data_set['YearRemodAdd'] = pd.to_datetime(data_set['YearRemodAdd'],format='%Y')
data_set['GarageYrBlt'] = pd.to_datetime(data_set['GarageYrBlt'],format='%Y')
### merge month and year columns and created new column
data_set['SoldYr'] = data_set['YrSold'].astype('str')+'-'+data_set['MoSold'].astype('str')
### Convert new column to datetime
data_set['SoldYr'] = pd.to_datetime(data_set['SoldYr'])
### create new columns from year columns
from datetime import timedelta
days = data_set['SoldYr']-data_set['YearBuilt']
data_set['Age'] = days/timedelta(days=365)
data_set['Age'] = data_set['Age'].round(2)
days = data_set['SoldYr']-data_set['GarageYrBlt']
data_set['GarageAge'] = days/timedelta(days=365)
data_set['GarageAge'] = data_set['GarageAge'].round(2)
days = data_set['SoldYr']-data_set['YearRemodAdd']
data_set['YearRemodAge'] = days/timedelta(days=365)
data_set['YearRemodAge'] = data_set['YearRemodAge'].round(2)
###drop year columns after creating new columns
data_set.drop(['YearBuilt','YearRemodAdd','GarageYrBlt','MoSold','YrSold','SoldYr'],axis=1, inplace=True)
data_labels = data_set['SalePrice'].copy()
data_set.drop(['Id','SalePrice'], axis=1, inplace=True)
###Create dataframe of categorical variables.
data_cat = data_set.drop(['LotFrontage','LotArea','MasVnrArea','BsmtUnfSF','TotalBsmtSF',
'1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFinSF1','BsmtFinSF2','GarageArea',
'WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal',
'Age','GarageAge','YearRemodAge'], axis=1)
###Create dataframe of numerical variables.
data_num = data_set[['LotArea','BsmtUnfSF','TotalBsmtSF',
'1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','BsmtFinSF1','BsmtFinSF2','GarageArea',
'WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']].copy()
###Create dataframe of continuous variable
data_cont = data_set[['LotFrontage','MasVnrArea','Age','GarageAge','YearRemodAge']].copy()
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
cat_pipeline = Pipeline([('imputer',SimpleImputer(strategy='most_frequent')),('ohe',OneHotEncoder())])
from sklearn.preprocessing import StandardScaler
cont_pipeline = Pipeline([('imputer',SimpleImputer(strategy='most_frequent')),('scaler',StandardScaler())])
num_pipeline = Pipeline([('imputer',SimpleImputer(strategy='median')),('scaler',StandardScaler())])
from sklearn.compose import ColumnTransformer
num_attr = list(data_num.columns[data_num.dtypes == 'int64'])
cont_attr = list(data_cont.columns[data_cont.dtypes == 'float64'])
cat_attr = list(data_cat.columns[data_cat.dtypes == 'object'])
full_pipeline = ColumnTransformer([('num', num_pipeline, num_attr)
,('cont',cont_pipeline, cont_attr)
,('cat',cat_pipeline, cat_attr)])
data_prepared = full_pipeline.fit_transform(data_set)
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor(max_features='auto')
dtr.fit(data_prepared,data_labels)
###Output after model training
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
max_leaf_nodes=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
some_data = data_set.iloc[:5]
some_label = data_set.iloc[:5]
some_data_prepared = full_pipeline.fit_transform(some_data)
dtr.predict(some_data_prepared)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-115-ef297a636aa8> in <module>
----> 1 dtr.predict(test_data_prepared)
~\Anaconda3\lib\site-packages\sklearn\tree\tree.py in predict(self, X, check_input)
428 """
429 check_is_fitted(self, 'tree_')
--> 430 X = self._validate_X_predict(X, check_input)
431 proba = self.tree_.predict(X)
432 n_samples = X.shape[0]
~\Anaconda3\lib\site-packages\sklearn\tree\tree.py in _validate_X_predict(self, X, check_input)
400 "match the input. Model n_features is %s and "
401 "input n_features is %s "
--> 402 % (self.n_features_, n_features))
403
404 return X
ValueError: Number of features of the model must match the input. Model n_features is 256 and input n_features is 240
все шло хорошо. Была также обучена модель регрессии, но она не предсказывала никаких значений и не давала ошибок коды и набор данных с описаниями данных доступны на Github. ссылка ниже. https://github.com/sudhanshusaurav/House-Prices-Advanced-Regression-Techniques PLZ, помогите в ближайшее время ..