У меня есть таблица с данными об ошибках, которые необходимо исправить.
Test df
df = pd.DataFrame({
'store_id' : list('aaaabbbbcccc'),
'product_id' : list('111122223333'),
'time_create' : (1,1,1,3,1,1,2,2,10,11,12,13),
'store_product_quantity_old' : (0,0,0,3,0,0,5,5, 0,1,2,3),
'store_product_quantity_new' : (1,1,1,5,2,3,4,10,1,2,3,4)
})
Выбрать все дубликаты с помощью ['store_id', 'product_id', 'time_create']
dups = df[df.duplicated(subset=['store_id', 'product_id', 'time_create'], keep=False)].copy()
Рассчитать реальное значение diff
dups.loc[:, 'quantity_diff'] = dups.store_product_quantity_new - dups.store_product_quantity_old
a = dups.groupby(['store_id', 'product_id', 'time_create']).agg({'quantity_diff': 'sum'} )
Удаление дубликатов, кроме первого
x = df.drop(df[df.duplicated(subset=['store_id', 'product_id', 'time_create'])].index)
Добавление значения diff к предыдущей исключенной строке
x = x.set_index(['store_id', 'product_id', 'time_create'])
x.iloc[a.index].store_product_quantity_new = x.iloc[a.index].store_product_quantity_old + a.quantity_diff
Но на последнем шаге это не удалось:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-129-0183d1586485> in <module>()
----> 1 x.iloc[a.index].store_product_quantity_new = x.store_product_quantity_old + a.quantity_diff
C:\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1476
1477 maybe_callable = com._apply_if_callable(key, self.obj)
-> 1478 return self._getitem_axis(maybe_callable, axis=axis)
1479
1480 def _is_scalar_access(self, key):
C:\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
2089 # a list of integers
2090 elif is_list_like_indexer(key):
-> 2091 return self._get_list_axis(key, axis=axis)
2092
2093 # a single integer
C:\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_list_axis(self, key, axis)
2068 axis = self.axis or 0
2069 try:
-> 2070 return self.obj._take(key, axis=axis)
2071 except IndexError:
2072 # re-raise with different error message
C:\Anaconda3\lib\site-packages\pandas\core\generic.py in _take(self, indices, axis, is_copy)
2787 new_data = self._data.take(indices,
2788 axis=self._get_block_manager_axis(axis),
-> 2789 verify=True)
2790 result = self._constructor(new_data).__finalize__(self)
2791
C:\Anaconda3\lib\site-packages\pandas\core\internals.py in take(self, indexer, axis, verify, convert)
4524 dtype='int64')
4525 if isinstance(indexer, slice)
-> 4526 else np.asanyarray(indexer, dtype='int64'))
4527
4528 n = self.shape[axis]
C:\Anaconda3\lib\site-packages\numpy\core\numeric.py in asanyarray(a, dtype, order)
542
543 """
--> 544 return array(a, dtype, copy=False, order=order, subok=True)
545
546
ValueError: setting an array element with a sequence.