Я кодирую категориальные данные, используя sklearn LabelEncoder для запуска регрессии дерева решений / регрессии случайных лесов. Однако когда я запускаю его, он меняет мою первую строку данных (метки столбцов) на числа, которые мне не нужны. Как мне избежать этого? Мой код выглядит следующим образом:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv("..Desktop/train.csv")
print(df.describe())
# impute nulls for GarageYrBlt with column mean
df['GarageYrBlt'] = df['GarageYrBlt'].fillna((df['GarageYrBlt'].mean()))
# double check for nulls
print(df.isnull().sum())
# set dependent variable
y = df.SalePrice
# columns to use for independent variables
col = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition']
features = df.iloc[:,:-1].values
print('Features: ', features)
# encoding categorical variables
encode = LabelEncoder()
features[:,1] = encode.fit_transform(features[:,1])
features[:,4] = encode.fit_transform(features[:,4])
features[:,5] = encode.fit_transform(features[:,5])
features[:,6] = encode.fit_transform(features[:,6])
features[:,7] = encode.fit_transform(features[:,7])
features[:,8] = encode.fit_transform(features[:,8])
features[:,9] = encode.fit_transform(features[:,9])
features[:,10] = encode.fit_transform(features[:,10])
# etc etc
print('New features test: ', features)
# set new dataframe
df1 = pd.DataFrame(features)
# check new dataframe
df1.head()
# set independent variable
x = df1[col]
# Specify Model
ames_model = DecisionTreeRegressor(random_state=1)
# Fit Model
ames_model.fit(x, y)