индекс 1 выходит за границы для оси 1 с размером 1 при построении кривой обучения для CategoricalNB - PullRequest
0 голосов
/ 24 апреля 2020

Я пытаюсь построить кривую обучения для CategoricalNB. Я смог построить кривую обучения для GaussianNB. Оба класса доступны в scikit-learn в модуле sklearn.naive_bayes. Но когда я пытаюсь построить кривую обучения с CategoricalNB, я получаю следующую ошибку. Я использовал Jupyter Notebook с Python 3 и последнюю версию sklearn.

IndexError                                Traceback (most recent call last)
<ipython-input-56-9c215c7c82f9> in <module>
      7 cv = ShuffleSplit(n_splits=100, test_size=0.3, random_state=0)
      8 estimator = CategoricalNB()
----> 9 train_sizes, train_scores, test_scores, fit_times, _= learning_curve(estimator, xTrain, yTrain, cv=cv,return_times=True)
     10 ax[0].grid
     11 train_scores_mean = np.mean(train_scores, axis=1)

D:\Languages\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in learning_curve(estimator, X, y, groups, train_sizes, cv, scoring, exploit_incremental_learning, n_jobs, pre_dispatch, verbose, shuffle, random_state, error_score, return_times)
   1254             parameters=None, fit_params=None, return_train_score=True,
   1255             error_score=error_score, return_times=return_times)
-> 1256             for train, test in train_test_proportions)
   1257         out = np.array(out)
   1258         n_cv_folds = out.shape[0] // n_unique_ticks

D:\Languages\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1002             # remaining jobs.
   1003             self._iterating = False
-> 1004             if self.dispatch_one_batch(iterator):
   1005                 self._iterating = self._original_iterator is not None
   1006 

D:\Languages\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
    833                 return False
    834             else:
--> 835                 self._dispatch(tasks)
    836                 return True
    837 

D:\Languages\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
    752         with self._lock:
    753             job_idx = len(self._jobs)
--> 754             job = self._backend.apply_async(batch, callback=cb)
    755             # A job can complete so quickly than its callback is
    756             # called before we get here, causing self._jobs to

D:\Languages\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
    207     def apply_async(self, func, callback=None):
    208         """Schedule a func to be run"""
--> 209         result = ImmediateResult(func)
    210         if callback:
    211             callback(result)

D:\Languages\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
    588         # Don't delay the application, to avoid keeping the input
    589         # arguments in memory
--> 590         self.results = batch()
    591 
    592     def get(self):

D:\Languages\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

D:\Languages\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
    254         with parallel_backend(self._backend, n_jobs=self._n_jobs):
    255             return [func(*args, **kwargs)
--> 256                     for func, args, kwargs in self.items]
    257 
    258     def __len__(self):

D:\Languages\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    542     else:
    543         fit_time = time.time() - start_time
--> 544         test_scores = _score(estimator, X_test, y_test, scorer)
    545         score_time = time.time() - start_time - fit_time
    546         if return_train_score:

D:\Languages\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
    589         scores = scorer(estimator, X_test)
    590     else:
--> 591         scores = scorer(estimator, X_test, y_test)
    592 
    593     error_msg = ("scoring must return a number, got %s (%s) "

D:\Languages\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
    369 def _passthrough_scorer(estimator, *args, **kwargs):
    370     """Function that wraps estimator.score"""
--> 371     return estimator.score(*args, **kwargs)
    372 
    373 

D:\Languages\Anaconda3\lib\site-packages\sklearn\base.py in score(self, X, y, sample_weight)
    367         """
    368         from .metrics import accuracy_score
--> 369         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
    370 
    371 

D:\Languages\Anaconda3\lib\site-packages\sklearn\naive_bayes.py in predict(self, X)
     75         check_is_fitted(self)
     76         X = self._check_X(X)
---> 77         jll = self._joint_log_likelihood(X)
     78         return self.classes_[np.argmax(jll, axis=1)]
     79 

D:\Languages\Anaconda3\lib\site-packages\sklearn\naive_bayes.py in _joint_log_likelihood(self, X)
   1217         for i in range(self.n_features_):
   1218             indices = X[:, i]
-> 1219             jll += self.feature_log_prob_[i][:, indices].T
   1220         total_ll = jll + self.class_log_prior_
   1221         return total_ll

IndexError: index 1 is out of bounds for axis 1 with size 1

Ниже приведен код, который я использовал для построения кривой обучения.

# Plot learning curves for CategoricalNB: (1) train/CV score vs. training size,
# (2) fit time vs. training size (scalability), (3) score vs. fit time.
#
# NOTE(review): xTrain / yTrain are not created in this snippet — they must be
# defined beforehand (e.g. from a prior train/test split). Verify against the
# surrounding notebook.
#
# NOTE(review): the IndexError from the question ("index 1 is out of bounds for
# axis 1 with size 1") arises when a CV test fold contains category values that
# never appeared in the corresponding training fold, so CategoricalNB's learned
# feature_log_prob_ table is too small at predict time. sklearn >= 0.24 adds
# CategoricalNB(min_categories=...) to reserve room for all categories — confirm
# the installed version before relying on it.
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.naive_bayes import CategoricalNB

fig, ax = plt.subplots(3, 1, figsize=(10, 15))
title = "Learning Curves (Naive Bayes)"
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
estimator = CategoricalNB()
train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
    estimator, xTrain, yTrain, cv=cv, return_times=True)

# Aggregate over the CV folds (axis=1) at each training-set size.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
fit_times_mean = np.mean(fit_times, axis=1)
fit_times_std = np.std(fit_times, axis=1)

# Panel 1: learning curve with +/- 1 std bands.
ax[0].grid()
ax[0].set_title(title)  # was defined but unused in the original snippet
ax[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                   train_scores_mean + train_scores_std, alpha=0.1,
                   color="r")
ax[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                   test_scores_mean + test_scores_std, alpha=0.1,
                   color="g")
ax[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
           label="Training score")
ax[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
           label="Cross-validation score")
ax[0].legend(loc="best")

# Panel 2: n_samples vs. fit_times (scalability).
ax[1].grid()
ax[1].plot(train_sizes, fit_times_mean, 'o-')
ax[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                   fit_times_mean + fit_times_std, alpha=0.1)
ax[1].set_xlabel("Training examples")
ax[1].set_ylabel("fit_times")
ax[1].set_title("Scalability of the model")

# Panel 3: fit_time vs. score (performance).
ax[2].grid()
ax[2].plot(fit_times_mean, test_scores_mean, 'o-')
ax[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                   test_scores_mean + test_scores_std, alpha=0.1)
ax[2].set_xlabel("fit_times")
ax[2].set_ylabel("Score")
ax[2].set_title("Performance of the model")

plt.show()

Может ли кто-нибудь помочь мне разобраться в проблеме?

Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...