Апсэмплинг говорит мне, что я не могу сделать повторную выборку с дублированной оси - PullRequest
0 голосов
/ 19 сентября 2019

Для задачи мультиклассовой классификации у меня очень несбалансированный набор данных.Поэтому я попытался уменьшить количество классов большинства, и это сработало хорошо, и уменьшить число классов меньшинств, но этого не произошло.Вот код, который я попробовал:

count_class_A, count_class_B,count_class_C, count_class_D,count_class_E, count_class_F, count_class_G = df.grade.value_counts()
count_df = df.shape[0] 
class_dict = {"A": count_class_A,"B" :count_class_B,"C": count_class_C,"D": count_class_D,"E": count_class_E, "F": count_class_F, "G": count_class_G}
counts = [count_class_A, count_class_B,count_class_C, count_class_D,count_class_E, count_class_F, count_class_G]
median = statistics.median(counts)


for key in class_dict:
    if class_dict[key]>median:
        print(key)
        df[df.grade == key] = df[df.grade == key].sample(int(count_df/7), replace = False) 
                                         #replace=False,    # sample without replacement
                                         #n_samples=int(count_df/7),     # to match minority class
                                         #random_state=123)
    if False:
    #if class_dict[key]<median:
        print("Oooh" + key)
        df[df.grade == key] = resample(df[df.grade == key],
                          replace=True, # sample with replacement
                          n_samples=int(count_df/7), # match number in majority class
                          random_state=27) # reproducible results

Вот сообщение об ошибке:

A
B
C
OoohE
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-166-f7d56375cbb9> in <module>
     27                               replace=True, # sample with replacement
     28                               n_samples=int(count_df/7), # match number in majority class
---> 29                               random_state=27) # reproducible results
     30 
     31 

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __setitem__(self, key, value)
   3114             self._setitem_frame(key, value)
   3115         elif isinstance(key, (Series, np.ndarray, list, Index)):
-> 3116             self._setitem_array(key, value)
   3117         else:
   3118             # set column

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in _setitem_array(self, key, value)
   3132             indexer = key.nonzero()[0]
   3133             self._check_setitem_copy()
-> 3134             self.loc._setitem_with_indexer(indexer, value)
   3135         else:
   3136             if isinstance(value, DataFrame):

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value)
    575                             v = self._align_series(
    576                                 tuple(sub_indexer), value[item],
--> 577                                 multiindex_indexer)
    578                         else:
    579                             v = np.nan

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py in _align_series(self, indexer, ser, multiindex_indexer)
    732                         return ser._values.copy()
    733 
--> 734                     return ser.reindex(new_ix)._values
    735 
    736                 # 2 dims

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\series.py in reindex(self, index, **kwargs)
   3323     @Appender(generic._shared_docs['reindex'] % _shared_doc_kwargs)
   3324     def reindex(self, index=None, **kwargs):
-> 3325         return super(Series, self).reindex(index=index, **kwargs)
   3326 
   3327     def drop(self, labels=None, axis=0, index=None, columns=None,

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in reindex(self, *args, **kwargs)
   3687         # perform the reindex on the axes
   3688         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 3689                                   fill_value, copy).__finalize__(self)
   3690 
   3691     def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   3705             obj = obj._reindex_with_indexers({axis: [new_index, indexer]},
   3706                                              fill_value=fill_value,
-> 3707                                              copy=copy, allow_dups=False)
   3708 
   3709         return obj

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
   3808                                                 fill_value=fill_value,
   3809                                                 allow_dups=allow_dups,
-> 3810                                                 copy=copy)
   3811 
   3812         if copy and new_data is self._data:

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy)
   4412         # some axes don't allow reindexing with dups
   4413         if not allow_dups:
-> 4414             self.axes[axis]._can_reindex(indexer)
   4415 
   4416         if axis >= self.ndim:

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in _can_reindex(self, indexer)
   3574         # trying to reindex on an axis with duplicates
   3575         if not self.is_unique and len(indexer):
-> 3576             raise ValueError("cannot reindex from a duplicate axis")
   3577 
   3578     def reindex(self, target, method=None, level=None, limit=None,

ValueError: cannot reindex from a duplicate axis

Означает ли это, что я должен создать новую ось с уникальными идентификаторами, чтобы получить случайную повторную выборку?

...