Python: датафрейм с начальной и конечной датой, развёртывание в одно поле даты - PullRequest
1 голос
/ 11 марта 2020

У меня есть такой датафрейм (отредактировано):

StartDate EndDate Company Location
2019-01-15  2019-01-31  1.0 121.0
2019-02-01  2020-03-10  1.0 136.0
2006-10-02  2020-03-10  2.0 136.0
2003-07-31  2020-03-10  2.0 321.0
2010-11-03  2020-03-10  3.0 322.0
2013-02-01  2017-02-07  4.0 375.0
2017-02-08  2019-01-14  4.0 375.0
2019-01-15  2019-04-29  4.0 375.0
2019-04-30  2020-03-10  4.0 375.0

Как описано по этой ссылке: «Pandas: развёртывание диапазона дат в отдельные даты», я хотел развернуть диапазон так, чтобы осталось только одно поле — дата. Я шаг за шагом следовал приведённому там решению. Однако при попытке выполнить groupby с resample я получаю ошибку: ValueError: cannot reindex a non-unique index with a method or limit. По какой причине это происходит?

Чтобы было понятнее, вот мой код (индекс исходного датафрейма — обычный числовой индекс: 1, 2, 3, ...):

# Load the company/location intervals from a parquet file.
# NOTE(review): `read_parquet` is used as a bare name; presumably it was
# imported from pandas elsewhere (e.g. `from pandas import read_parquet`).
df=read_parquet('company_location.parquet')
# Keep only the four columns needed for the decompression.
df=df[['COMPANY','STARTDATE','ENDDATE','LOCATION']]
# Parse both date columns into datetime values.
df['STARTDATE']=pd.to_datetime(df['STARTDATE'])
df['ENDDATE']=pd.to_datetime(df['ENDDATE'])
# Drop every row that has a missing value in any column.
df=df.dropna(axis=0,how='any')
# Sequential id per remaining row; used below as a second index level.
df['rows']=range(len(df))
# Two views of the same rows: one carrying the start date, one the end date,
# both exposing it under the common column name 'DATE'.
starts=df[['COMPANY','STARTDATE','LOCATION','rows']].rename(columns={'STARTDATE':'DATE'})
ends=df[['COMPANY','ENDDATE','LOCATION','rows']].rename(columns={'ENDDATE':'DATE'})
# Stack them: every original row now appears twice (start row + end row).
df_decomp=pd.concat([starts,ends])
# Append 'rows' to the existing index, giving a two-level index.
df_decomp=df_decomp.set_index('rows', append=True)
# NOTE: sort_index() returns a new object and the result is not assigned,
# so this line leaves df_decomp unchanged.
df_decomp.sort_index()

До этого места всё работает.

Но когда я выполняю следующую строку, возникает ошибка:

# For each (original index, rows) group: index by DATE, upsample to daily
# frequency and forward-fill ('pad') the other columns.
# NOTE(review): the upsample reindexes with a fill method, which pandas
# refuses on a non-unique index. Presumably at least one group holds the same
# DATE twice (e.g. a row whose STARTDATE equals ENDDATE) - verify in the data.
df_decomp=df_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))

Ошибка (код запускается в Jupyter Notebook):

ValueError                                Traceback (most recent call last)
/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
    688             try:
--> 689                 result = self._python_apply_general(f)
    690             except Exception:

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
    706         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707                                                    self.axis)
    708 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
    189             group_axes = _get_axes(group)
--> 190             res = f(group)
    191             if not _is_indexed_like(res, group_axes):

<ipython-input-29-e5d0ce53cd1c> in <lambda>(x)
----> 1 rep_movement_decomp=rep_movement_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in fillna(self, method, limit)
    759         """
--> 760         return self._upsample(method, limit=limit)
    761 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in _upsample(self, method, limit, fill_value)
   1072             result = obj.reindex(res_index, method=method,
-> 1073                                  limit=limit, fill_value=fill_value)
   1074 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    196         def wrapper(*args, **kwargs):
--> 197             return func(*args, **kwargs)
    198 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
   3808         kwargs.pop('labels', None)
-> 3809         return super(DataFrame, self).reindex(**kwargs)
   3810 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
   4355         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 4356                                   fill_value, copy).__finalize__(self)
   4357 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   3740             frame = frame._reindex_index(index, method, copy, level,
-> 3741                                          fill_value, limit, tolerance)
   3742 

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
   3748                                                 level=level, limit=limit,
-> 3749                                                 tolerance=tolerance)
   3750         return self._reindex_with_indexers({0: [new_index, indexer]},

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in reindex(self, target, method, level, limit, tolerance)
   3137                     if method is not None or limit is not None:
-> 3138                         raise ValueError("cannot reindex a non-unique index "
   3139                                          "with a method or limit")

ValueError: cannot reindex a non-unique index with a method or limit

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
<ipython-input-29-e5d0ce53cd1c> in <module>()
----> 1 rep_movement_decomp=rep_movement_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
    699 
    700                 with _group_selection_context(self):
--> 701                     return self._python_apply_general(f)
    702 
    703         return result

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
    705     def _python_apply_general(self, f):
    706         keys, values, mutated = self.grouper.apply(f, self._selected_obj,
--> 707                                                    self.axis)
    708 
    709         return self._wrap_applied_output(

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
    188             # group might be modified
    189             group_axes = _get_axes(group)
--> 190             res = f(group)
    191             if not _is_indexed_like(res, group_axes):
    192                 mutated = True

<ipython-input-29-e5d0ce53cd1c> in <lambda>(x)
----> 1 rep_movement_decomp=rep_movement_decomp.groupby(level=[0,1]).apply(lambda x: x.set_index('DATE').resample('D').fillna(method='pad'))

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in fillna(self, method, limit)
    758         2018-01-01 02:00:00  6.0  5
    759         """
--> 760         return self._upsample(method, limit=limit)
    761 
    762     @Appender(_shared_docs['interpolate'] % _shared_docs_kwargs)

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/resample.py in _upsample(self, method, limit, fill_value)
   1071         else:
   1072             result = obj.reindex(res_index, method=method,
-> 1073                                  limit=limit, fill_value=fill_value)
   1074 
   1075         result = self._apply_loffset(result)

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    195         @wraps(func)
    196         def wrapper(*args, **kwargs):
--> 197             return func(*args, **kwargs)
    198 
    199         if not PY2:

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
   3807         kwargs.pop('axis', None)
   3808         kwargs.pop('labels', None)
-> 3809         return super(DataFrame, self).reindex(**kwargs)
   3810 
   3811     @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs)

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
   4354         # perform the reindex on the axes
   4355         return self._reindex_axes(axes, level, limit, tolerance, method,
-> 4356                                   fill_value, copy).__finalize__(self)
   4357 
   4358     def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value,

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   3739         if index is not None:
   3740             frame = frame._reindex_index(index, method, copy, level,
-> 3741                                          fill_value, limit, tolerance)
   3742 
   3743         return frame

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
   3747         new_index, indexer = self.index.reindex(new_index, method=method,
   3748                                                 level=level, limit=limit,
-> 3749                                                 tolerance=tolerance)
   3750         return self._reindex_with_indexers({0: [new_index, indexer]},
   3751                                            copy=copy, fill_value=fill_value,

/usr/local/share/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in reindex(self, target, method, level, limit, tolerance)
   3136                 else:
   3137                     if method is not None or limit is not None:
-> 3138                         raise ValueError("cannot reindex a non-unique index "
   3139                                          "with a method or limit")
   3140                     indexer, missing = self.get_indexer_non_unique(target)

ValueError: cannot reindex a non-unique index with a method or limit

Ответы [ 2 ]

1 голос
/ 11 марта 2020

Я смог повторить решение из вопроса «Pandas: развёртывание диапазона дат в отдельные даты» и не получил никаких ошибок на вашем наборе данных. Смотрите код ниже.

import pandas as pd

# Sample intervals: one row per (company, location) validity period.
df = pd.DataFrame(
    [
        ['2019-01-15', '2019-01-31', 'A', 121.0],
        ['2019-02-01', '2020-03-10', 'A', 136.0],
        ['2006-10-02', '2020-03-10', 'B', 136.0],
        ['2003-07-31', '2020-03-10', 'B', 321.0],
        ['2010-11-03', '2020-03-10', 'C', 322.0],
        ['2013-02-01', '2017-02-07', 'D', 375.0],
        ['2017-02-08', '2019-01-14', 'D', 375.0],
        ['2019-01-15', '2019-04-29', 'D', 375.0],
        ['2019-04-30', '2020-03-10', 'D', 375.0],
    ],
    columns=['StartDate', 'EndDate', 'Company', 'Location'],
)
# resample() needs real datetimes, not strings.
df['StartDate'] = pd.to_datetime(df['StartDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])
df = df.set_index('Company')
# Sequential id so each original interval forms its own group below.
df['row'] = range(len(df))
print(df)

# Stack the start and end of every interval under one 'Date' column.
starts = df[['StartDate', 'Location', 'row']].rename(columns={'StartDate': 'Date'})
ends = df[['EndDate', 'Location', 'row']].rename(columns={'EndDate': 'Date'})
# A one-day interval (StartDate == EndDate) would produce two identical rows,
# i.e. a duplicated 'Date' inside its group, and the upsample below would then
# raise "cannot reindex a non-unique index with a method or limit".
df_decomp = pd.concat([starts, ends]).drop_duplicates()
# sort_index() returns a new frame, so the result has to be kept.
df_decomp = df_decomp.set_index('row', append=True).sort_index()
print(df_decomp)

# Per (Company, row) group: index by date, expand to one row per day and
# forward-fill the remaining columns across the interval.
df_decomp = df_decomp.groupby(level=[0, 1]).apply(
    lambda grp: grp.set_index('Date').resample('D').ffill()
)
# Drop the helper 'row' level; the result is indexed by (Company, Date).
df_decomp = df_decomp.reset_index(level=1, drop=True)
print(df_decomp.loc['D'])
1 голос
/ 11 марта 2020
    StartDate   EndDate Company Location
0   1/15/2019   1/31/2019   A   121
1   2/1/2019    3/10/2020   A   136
2   10/2/2006   3/10/2020   B   136
3   7/31/2003   3/10/2020   B   321
4   11/3/2010   3/10/2020   C   322
5   2/7/2017    2/7/2017    D   375
6   2/8/2017    1/14/2019   D   375
7   1/15/2019   4/29/2019   D   375
8   4/30/2019   3/10/2020   D   375

Не уверен, что именно идёт не так, но скопируйте таблицу выше в буфер обмена и выполните следующее:

import pandas as pd
# Build the frame from whatever text is currently on the system clipboard.
df = pd.read_clipboard()

Затем добавьте две строки кода к решению из поста, на который вы сослались:

# Convert both interval bounds to datetime values.
for date_col in ('StartDate', 'EndDate'):
    df[date_col] = pd.to_datetime(df[date_col])

Выполните код ниже — он должен отработать без ошибок:

# resample() needs real datetimes, not strings.
df['StartDate'] = pd.to_datetime(df['StartDate'])
df['EndDate'] = pd.to_datetime(df['EndDate'])
# Sequential id so each original interval forms its own group below.
df['row'] = range(len(df))
# Stack the start and end of every interval under one 'date' column.
starts = df[['StartDate', 'Location', 'Company', 'row']].rename(columns={'StartDate': 'date'})
ends = df[['EndDate', 'Location', 'Company', 'row']].rename(columns={'EndDate': 'date'})
# drop_duplicates() collapses intervals whose start equals their end; without
# it such a group has a duplicated 'date' and the upsample raises
# "cannot reindex a non-unique index with a method or limit".
df_decomp = pd.concat([starts, ends]).drop_duplicates()
# sort_index() returns a new frame, so the result has to be kept.
df_decomp = df_decomp.set_index('row', append=True).sort_index()
# Per (index, row) group: index by date, expand to one row per day and
# forward-fill the remaining columns across the interval.
df_decomp = df_decomp.groupby(level=[0, 1]).apply(
    lambda grp: grp.set_index('date').resample('D').ffill()
)
# Drop the helper 'row' level from the result index.
df_decomp = df_decomp.reset_index(level=1, drop=True)
df_decomp
...