Я пытаюсь запустить QSAR, используя sklearn и pandas. Вот мой код:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
FC_data = pd.read_excel('C:\\Users\\Dre\\Desktop\\My Papers\\Furocoumarins_paper_2018\\Furocoumarins_NEW1.xlsx', index_col=0)
FC_data.head()
# Create correlation matrix
corr_matrix = FC_data.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# Drop features
FC_data1 = FC_data.drop(FC_data[to_drop], axis=1)
y = FC_data1.LogFiT
X = FC_data1.drop(['LogFiT', 'LogS'], axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
randomforest = RandomForestRegressor(n_jobs=-1)
selector = SelectFromModel(randomforest)
features_important = selector.fit_transform(X_train, y_train)
model = randomforest.fit(features_important, y_train)
from sklearn.model_selection import GridSearchCV
clf_rf = RandomForestRegressor()
parameters = {"n_estimators":[1, 2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 50], "max_depth":[1, 2, 3, 4, 5, 10, 15]}
grid_search_cv_clf = GridSearchCV(model, parameters, cv=5)
grid_search_cv_clf.fit(features_important, y_train)
from sklearn.metrics import r2_score
y_pred = grid_search_cv_clf.predict(features_important)
r2_score(y_train, y_pred)
grid_search_cv_clf.best_params_
best_clf = grid_search_cv_clf.best_estimator_
best_clf.score(X_test, y_test)
Ошибка: ValueError: Количество характеристик модели должно совпадать со входом. Model n_features - 22, а input n_features - 114
feature_importances = best_clf.feature_importances_
feature_importances_df = pd.DataFrame({'features':list(X_train),
'feature_importances':feature_importances})
importances = feature_importances_df.sort_values('feature_importances', ascending=False)
importances.head(20)
Это выдает ошибку: ValueError: все массивы должны быть одинаковой длины
Я понимаю, что проблема заключается в различном количестве функций в features_important (X_train) и x_test, но я не знаю, как это исправить. Пожалуйста, помогите!