Well, your model may simply not be a good fit for the dataset you are using. You can add more data or tune the parameters, as you suggested. You may also want to consider overfitting and underfitting. As for the parameters, see my code example below.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
df = pd.read_csv('C:\\your_path\\heart.csv')
df.head()
df.info()
df.isnull().sum()
#Univariate analysis target.
sns.countplot(df['target'])
#Univariate analysis age.
f = plt.figure(figsize=(20,4))
f.add_subplot(1,2,1)
sns.distplot(df['age'])
f.add_subplot(1,2,2)
sns.boxplot(df['age'])
#Univariate analysis resting blood pressure (mm Hg), i.e. trestbps.
f = plt.figure(figsize=(20,4))
f.add_subplot(1,2,1)
sns.distplot(df['trestbps'])
f.add_subplot(1,2,2)
sns.boxplot(df['trestbps'])
#Create KNN object.
# all parameters not specified are set to their defaults
knn = KNeighborsClassifier()

#Create x and y variables.
x = df.drop(columns=['target'])
y = df['target']

#Split the data into training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)

#Train the model.
knn.fit(x_train, y_train)

#Predict the test data set (returns a NumPy array).
predictions = knn.predict(x_test)
y_pred = predictions

#Check the performance of our model with a classification report.
print(classification_report(y_test, y_pred))

#Check the performance of our model with the ROC AUC score.
roc_auc_score(y_test, y_pred)
Results:

              precision    recall  f1-score   support

           0       0.91      0.84      0.87        25
           1       0.89      0.94      0.92        36

    accuracy                           0.90        61
   macro avg       0.90      0.89      0.90        61
weighted avg       0.90      0.90      0.90        61
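As a side note, the roc_auc_score call above is fed hard 0/1 labels, which understates the AUC, and this is also a convenient point to probe the over/underfitting question raised at the top of this answer. A small sketch using the knn and split objects defined above (my addition, not part of the original run):

# Rough over/underfitting check: a train score far above the test score
# suggests overfitting; both scores being low suggests underfitting.
print('train accuracy:', knn.score(x_train, y_train))
print('test accuracy:', knn.score(x_test, y_test))

# Threshold-independent ROC AUC from class probabilities.
y_proba = knn.predict_proba(x_test)[:, 1]  # probability of class 1
print('ROC AUC:', roc_auc_score(y_test, y_proba))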
The performance above is fine, over 90%. But let's try using hyperparameter tuning to improve our model's performance.
#List the hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p = [1,2]

#Convert to a dictionary.
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)

#Create a new KNN object.
knn_2 = KNeighborsClassifier()

#Use GridSearchCV with 10-fold cross-validation.
clf = GridSearchCV(knn_2, hyperparameters, cv=10)

#Fit the model.
best_model = clf.fit(x, y)

#Print the values of the best hyperparameters.
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])
Results:
Best leaf_size: 1
Best p: 1
Best n_neighbors: 7
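GridSearchCV also stores the mean cross-validated score achieved by that best combination, which is worth checking before retraining; a one-liner using the best_model object from above:

# Mean cross-validated accuracy of the best parameter combination.
print('Best CV score:', best_model.best_score_)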
Now let's use the knowledge we gathered above to make a small adjustment and rerun the process...
# train your model using all data and the best known parameters
# instantiate model with best parameters
knn = KNeighborsClassifier(n_neighbors=7, weights='uniform')

# fit with x and y, not x_train and y_train
# even if we use train/test split, we should train on x and y before making predictions on new data
# otherwise we throw away potentially valuable data we can learn from
knn.fit(x, y)

#Predict the test data set again.
predictions = knn.predict(x_test)
y_pred = predictions

#Check the performance of our model with a classification report.
print(classification_report(y_test, y_pred))

#Check the performance of our model with the ROC AUC score.
roc_auc_score(y_test, y_pred)
Result:

              precision    recall  f1-score   support

           0       0.91      0.84      0.87        25
           1       0.89      0.94      0.92        36

    accuracy                           0.90        61
   macro avg       0.90      0.89      0.90        61
weighted avg       0.90      0.90      0.90        61
The same! In this case, changing the hyperparameters made no difference. In other cases you may see a slight improvement in performance; 5%, 10%, or whatever. So, the conclusion: KNN works well on my specific dataset, but apparently it doesn't give good results on your dataset, and that's perfectly fine; just pick another model to test.
# data source:
# https://raw.githubusercontent.com/adiptamartulandi/KNN-and-Tuning-Hyperparameters/master/heart.csv
I'll leave you with one last thought. You can loop through several classifiers automatically and look at the results of each, then pick the top 1 or 2 and run with those.
import numpy as np
import pandas as pd
# Load data from UCI dataset repo
bank_note_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt'
data = np.loadtxt(bank_note_url, delimiter=',')
data = pd.DataFrame(data)
# Add column names
clean_columns = ['variance_of_wavelet', 'skewness_of_wavelet',
'curtosis_of_wavelet', 'entropy_of_wavelet',
'class']
data.columns = clean_columns
data.head()
# Separate the target and features as separate objects for sklearn APIs
X = data.drop('class', axis=1)
y = data['class'].astype('int')  # keep y 1d to avoid DataConversionWarnings downstream
# Specify the design matrix and the target vector for yellowbrick as arrays
design_matrix = X.values
target_vector = y.values.flatten()
X.head()
y.head()
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))])
from sklearn.model_selection import train_test_split
# Stratified sampling based on the distribution of the target vector, y
X_train, X_test, y_train, y_test = train_test_split(X, y,
stratify=y,
test_size=0.20,
random_state=30)
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)])
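Before wiring the preprocessor into a model, you can fit-transform it on its own as a sanity check. Note that this banknote dataset is entirely numeric, so categorical_features comes back empty and the categorical branch is a no-op kept only for generality; a quick check (my addition):

# Fit just the preprocessing step to confirm it runs and inspect the shape.
print(categorical_features)  # empty Index for this all-numeric dataset
print(preprocessor.fit_transform(X_train).shape)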
from sklearn.ensemble import RandomForestClassifier
rf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', RandomForestClassifier())])
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
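The random-forest pipeline above predicts but is never scored; accuracy_score (imported in the next block) closes that loop. For example:

# Accuracy of the random-forest pipeline on the held-out test set.
from sklearn.metrics import accuracy_score
print('RF accuracy: %.3f' % accuracy_score(y_test, y_pred))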
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="rbf", C=0.025, probability=True),
NuSVC(probability=True),
DecisionTreeClassifier(),
RandomForestClassifier(),
AdaBoostClassifier(),
GradientBoostingClassifier()
]
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    print(classifier)
    print("model score: %.3f" % pipe.score(X_test, y_test))
# Hyperparameter grid for the random-forest pipeline; the classifier__
# prefix routes each entry to the pipeline's 'classifier' step (see the sketch below).
param_grid = {
    'classifier__n_estimators': [200, 500],
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth': [4, 5, 6, 7, 8],
    'classifier__criterion': ['gini', 'entropy']}
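The param_grid above is never actually used in this run; since its keys carry the classifier__ prefix a Pipeline expects, it can be dropped straight into a grid search over the rf pipeline. A sketch under that assumption:

# Exhaustive grid search over the random-forest pipeline.
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)
print('Best CV score: %.3f' % grid.best_score_)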
Results of the classifier loop above:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=None, n_neighbors=3, p=2,
weights='uniform')
model score: 1.000
SVC(C=0.025, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
verbose=False)
model score: 0.967
NuSVC(break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
max_iter=-1, nu=0.5, probability=True, random_state=None, shrinking=True,
tol=0.001, verbose=False)
model score: 0.971
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
max_depth=None, max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort='deprecated',
random_state=None, splitter='best')
model score: 0.978
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=None,
verbose=0, warm_start=False)
model score: 0.993
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
n_estimators=50, random_state=None)
model score: 0.996
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='deviance', max_depth=3,
max_features=None, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_iter_no_change=None, presort='deprecated',
random_state=None, subsample=1.0, tol=0.0001,
validation_fraction=0.1, verbose=0,
warm_start=False)
model score: 0.993
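Finally, to make the "pick the top 1 or 2" step mechanical rather than eyeballed, the same loop can collect the scores and rank them; a small variation on the loop above:

# Collect each classifier's test score and print them ranked, best first.
scores = {}
for classifier in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', classifier)])
    pipe.fit(X_train, y_train)
    scores[type(classifier).__name__] = pipe.score(X_test, y_test)
for name, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
    print('%-30s %.3f' % (name, score))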