Я пытаюсь построить кривую обучения для CategoricalNB
. Я смог построить кривую обучения для GaussianNB
. Оба они доступны в scikit-learn
под sklearn.naive_bayes
. Но когда я пытаюсь построить learning Curve
с CategoricalNB
, я получаю следующую ошибку. Я использовал Jupyter Notebook
с Python 3
для этого. И последняя sklearn
версия.
IndexError Traceback (most recent call last)
<ipython-input-56-9c215c7c82f9> in <module>
7 cv = ShuffleSplit(n_splits=100, test_size=0.3, random_state=0)
8 estimator = CategoricalNB()
----> 9 train_sizes, train_scores, test_scores, fit_times, _= learning_curve(estimator, xTrain, yTrain, cv=cv,return_times=True)
10 ax[0].grid
11 train_scores_mean = np.mean(train_scores, axis=1)
D:\Languages\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in learning_curve(estimator, X, y, groups, train_sizes, cv, scoring, exploit_incremental_learning, n_jobs, pre_dispatch, verbose, shuffle, random_state, error_score, return_times)
1254 parameters=None, fit_params=None, return_train_score=True,
1255 error_score=error_score, return_times=return_times)
-> 1256 for train, test in train_test_proportions)
1257 out = np.array(out)
1258 n_cv_folds = out.shape[0] // n_unique_ticks
D:\Languages\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
D:\Languages\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
D:\Languages\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
D:\Languages\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
D:\Languages\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
D:\Languages\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
D:\Languages\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
D:\Languages\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
542 else:
543 fit_time = time.time() - start_time
--> 544 test_scores = _score(estimator, X_test, y_test, scorer)
545 score_time = time.time() - start_time - fit_time
546 if return_train_score:
D:\Languages\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
589 scores = scorer(estimator, X_test)
590 else:
--> 591 scores = scorer(estimator, X_test, y_test)
592
593 error_msg = ("scoring must return a number, got %s (%s) "
D:\Languages\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
369 def _passthrough_scorer(estimator, *args, **kwargs):
370 """Function that wraps estimator.score"""
--> 371 return estimator.score(*args, **kwargs)
372
373
D:\Languages\Anaconda3\lib\site-packages\sklearn\base.py in score(self, X, y, sample_weight)
367 """
368 from .metrics import accuracy_score
--> 369 return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
370
371
D:\Languages\Anaconda3\lib\site-packages\sklearn\naive_bayes.py in predict(self, X)
75 check_is_fitted(self)
76 X = self._check_X(X)
---> 77 jll = self._joint_log_likelihood(X)
78 return self.classes_[np.argmax(jll, axis=1)]
79
D:\Languages\Anaconda3\lib\site-packages\sklearn\naive_bayes.py in _joint_log_likelihood(self, X)
1217 for i in range(self.n_features_):
1218 indices = X[:, i]
-> 1219 jll += self.feature_log_prob_[i][:, indices].T
1220 total_ll = jll + self.class_log_prior_
1221 return total_ll
IndexError: index 1 is out of bounds for axis 1 with size 1
Ниже приведен код, который я использовал для построения кривой обучения.
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.naive_bayes import CategoricalNB
fig,ax = plt.subplots(3,1,figsize=(10, 15))
title = "Learning Curves (Naive Bayes)"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = CategoricalNB()
train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(estimator, xTrain, yTrain, cv=cv,return_times=True)
ax[0].grid()
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
fit_times_mean = np.mean(fit_times, axis=1)
fit_times_std = np.std(fit_times, axis=1)
ax[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="r")
ax[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1,
color="g")
ax[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
ax[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
label="Cross-validation score")
ax[0].legend(loc="best")
# Plot n_samples vs fit_times
ax[1].grid()
ax[1].plot(train_sizes, fit_times_mean, 'o-')
ax[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
fit_times_mean + fit_times_std, alpha=0.1)
ax[1].set_xlabel("Training examples")
ax[1].set_ylabel("fit_times")
ax[1].set_title("Scalability of the model")
# Plot fit_time vs score
ax[2].grid()
ax[2].plot(fit_times_mean, test_scores_mean, 'o-')
ax[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1)
ax[2].set_xlabel("fit_times")
ax[2].set_ylabel("Score")
ax[2].set_title("Performance of the model")
plt.show()
Может ли кто-нибудь помочь мне разобраться в проблеме?