У меня есть набор данных по дисбалансу для задачи классификации. Моя целевая переменная является двоичной и имеет две категории. Я реализовал Random Forest и Logisti c Regression, назначив class_weights в качестве параметра. Когда я подгоняю данные к случайному лесу и регрессии логистики c по отдельности, это работает нормально. Но когда я использую классификатор голосования по случайному лесу и регрессию logisti c из sklearn.ensemble для подбора данных, это дает ошибку Class label no_payment not present.
, мне нужно взять ансамбль из 3 или более моделей. Я проверил, что эта ошибка не из-за naive_bayes, реализованного в коде.
Мой код:
rf_param = { 'class_weight': {'no_payment': 1, 'payment': 3},'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 30, 'min_samples_split': 15, 'n_estimators': 100}
lr_param = {'C': 0.1, 'class_weight': {'no_payment': 1, 'payment': 3}, 'fit_intercept': False, 'penalty': 'l2'}
rf = ensemble.RandomForestClassifier(**rf_param)
lr = linear_model.LogisticRegression(**lr_param)
nb = naive_bayes.MultinomialNB(alpha=0.0, class_prior=None, fit_prior=False)
rf.fit(train_x, train_y)
lr.fit(train_x, train_y)
nb.fit(train_x, train_y)
model = ensemble.VotingClassifier(estimators=[('rf', rf), ('lr', lr), ('nb',nb)], voting='hard'
,weights = [2,2,1])
model.fit(train_x, train_y)
predictions = model.predict(valid_x)
Этот код отлично работает, если я удаляю class_weight
из списка параметров.
Ниже приведено полное сообщение об ошибке.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-35-e05cd516f347> in <module>()
15 )
16
---> 17 model.fit(train_x, train_y)
18
19 predictions = model.predict(valid_x)
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_voting.py in fit(self, X, y, sample_weight)
220 transformed_y = self.le_.transform(y)
221
--> 222 return super().fit(X, transformed_y, sample_weight)
223
224 def predict(self, X):
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_voting.py in fit(self, X, y, sample_weight)
66 delayed(_parallel_fit_estimator)(clone(clf), X, y,
67 sample_weight=sample_weight)
---> 68 for clf in clfs if clf not in (None, 'drop')
69 )
70
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
/home/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
/home/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/home/.local/lib/python3.6/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_base.py in _parallel_fit_estimator(estimator, X, y, sample_weight)
34 raise
35 else:
---> 36 estimator.fit(X, y)
37 return estimator
38
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in fit(self, X, y, sample_weight)
319 self.n_outputs_ = y.shape[1]
320
--> 321 y, expanded_class_weight = self._validate_y_class_weight(y)
322
323 if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
/home/.local/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in _validate_y_class_weight(self, y)
585 class_weight = self.class_weight
586 expanded_class_weight = compute_sample_weight(class_weight,
--> 587 y_original)
588
589 return y, expanded_class_weight
/home/.local/lib/python3.6/site-packages/sklearn/utils/class_weight.py in compute_sample_weight(class_weight, y, indices)
161 weight_k = compute_class_weight(class_weight_k,
162 classes_full,
--> 163 y_full)
164
165 weight_k = weight_k[np.searchsorted(classes_full, y_full)]
/home/.local/lib/python3.6/site-packages/sklearn/utils/class_weight.py in compute_class_weight(class_weight, classes, y)
63 i = np.searchsorted(classes, c)
64 if i >= len(classes) or classes[i] != c:
---> 65 raise ValueError("Class label {} not present.".format(c))
66 else:
67 weight[i] = class_weight[c]
ValueError: Class label no_payment not present.