Для задачи мультиклассовой классификации у меня очень несбалансированный набор данных.Поэтому я попытался уменьшить количество классов большинства, и это сработало хорошо, и уменьшить число классов меньшинств, но этого не произошло.Вот код, который я попробовал:
count_class_A, count_class_B,count_class_C, count_class_D,count_class_E, count_class_F, count_class_G = df.grade.value_counts()
count_df = df.shape[0]
class_dict = {"A": count_class_A,"B" :count_class_B,"C": count_class_C,"D": count_class_D,"E": count_class_E, "F": count_class_F, "G": count_class_G}
counts = [count_class_A, count_class_B,count_class_C, count_class_D,count_class_E, count_class_F, count_class_G]
median = statistics.median(counts)
for key in class_dict:
if class_dict[key]>median:
print(key)
df[df.grade == key] = df[df.grade == key].sample(int(count_df/7), replace = False)
#replace=False, # sample without replacement
#n_samples=int(count_df/7), # to match minority class
#random_state=123)
if False:
#if class_dict[key]<median:
print("Oooh" + key)
df[df.grade == key] = resample(df[df.grade == key],
replace=True, # sample with replacement
n_samples=int(count_df/7), # match number in majority class
random_state=27) # reproducible results
Вот сообщение об ошибке:
A
B
C
OoohE
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-166-f7d56375cbb9> in <module>
27 replace=True, # sample with replacement
28 n_samples=int(count_df/7), # match number in majority class
---> 29 random_state=27) # reproducible results
30
31
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
3114 self._setitem_frame(key, value)
3115 elif isinstance(key, (Series, np.ndarray, list, Index)):
-> 3116 self._setitem_array(key, value)
3117 else:
3118 # set column
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _setitem_array(self, key, value)
3132 indexer = key.nonzero()[0]
3133 self._check_setitem_copy()
-> 3134 self.loc._setitem_with_indexer(indexer, value)
3135 else:
3136 if isinstance(value, DataFrame):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
575 v = self._align_series(
576 tuple(sub_indexer), value[item],
--> 577 multiindex_indexer)
578 else:
579 v = np.nan
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
732 return ser._values.copy()
733
--> 734 return ser.reindex(new_ix)._values
735
736 # 2 dims
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
3323 @Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
3324 def reindex(self, index=None, **kwargs):
-> 3325 return super(Series, self).reindex(index=index, **kwargs)
3326
3327 def drop(self, labels=None, axis=0, index=None, columns=None,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
3687 # perform the reindex on the axes
3688 return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3689 fill_value, copy).__finalize__(self)
3690
3691 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
3705 obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
3706 fill_value=fill_value,
-> 3707 copy=copy, allow_dups=False)
3708
3709 return obj
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
3808 fill_value=fill_value,
3809 allow_dups=allow_dups,
-> 3810 copy=copy)
3811
3812 if copy and new_data is self._data:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
4412 # some axes don't allow reindexing with dups
4413 if not allow_dups:
-> 4414 self.axes[axis]._can_reindex(indexer)
4415
4416 if axis >= self.ndim:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _can_reindex(self, indexer)
3574 # trying to reindex on an axis with duplicates
3575 if not self.is_unique and len(indexer):
-> 3576 raise ValueError("cannot reindex from a duplicate axis")
3577
3578 def reindex(self, target, method=None, level=None, limit=None,
ValueError: cannot reindex from a duplicate axis
Означает ли это, что я должен создать новую ось с уникальными идентификаторами, чтобы получить случайную повторную выборку?