При попытке объединить два датафрейма по индексу с датой и временем (DatetimeIndex) я получаю ошибку KeyError: DatetimeIndex.
Я пробовал выполнять слияние и с дубликатами в левом датафрейме, и без них, но по-прежнему получаю ту же ошибку. Что не так?
print(news_df_sep.info())
DatetimeIndex: 55332 entries, 2020-01-02 13:00:00.824000+00:00 to 2020-04-15 02:43:28.876000+00:00
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 stock_id 55332 non-null int64
1 headlines 55332 non-null object
print(y_df.info())
DatetimeIndex: 7591265 entries, 2011-02-24 09:30:00.070041845 to 2020-04-02 14:59:59.938843711
Data columns (total 2 columns):
# Column Dtype
--- ------ -----
0 stock_id int64
1 BestPlayY2 float64
print(type(news_df_sep.index), type(y_df.index))
(pandas.core.indexes.datetimes.DatetimeIndex,
pandas.core.indexes.datetimes.DatetimeIndex)
print(sum(news_df_sep.index.duplicated()), sum(y_df.index.duplicated())
(182, 0)
df_t = news_df_sep.drop_duplicates()
print(df_t.info())
DatetimeIndex: 54584 entries, 2020-01-02 13:00:00.824000+00:00 to 2020-04-15 02:43:28.876000+00:00
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 stock_id 54584 non-null int64
1 headlines 54584 non-null object
Слияние:
df = pd.merge_asof(df_t, y_df, on=df_t.index, by='stock_id')
Результат:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-323-cd5352c971ef> in <module>
----> 1 df = pd.merge_asof(df_t, y_df, on=df_t.index, by='stock_id')
~/anaconda3/envs/py37jp/lib/python3.7/site-packages/pandas/core/reshape/merge.py in merge_asof(left, right, on, left_on, right_on, left_index, right_index, by, left_by, right_by, suffixes, tolerance, allow_exact_matches, direction)
537 tolerance=tolerance,
538 allow_exact_matches=allow_exact_matches,
--> 539 direction=direction,
540 )
541 return op.get_result()
~/anaconda3/envs/py37jp/lib/python3.7/site-packages/pandas/core/reshape/merge.py in __init__(self, left, right, on, left_on, right_on, left_index, right_index, by, left_by, right_by, axis, suffixes, copy, fill_method, how, tolerance, allow_exact_matches, direction)
1565 how=how,
1566 suffixes=suffixes,
-> 1567 fill_method=fill_method,
1568 )
1569
~/anaconda3/envs/py37jp/lib/python3.7/site-packages/pandas/core/reshape/merge.py in __init__(self, left, right, on, left_on, right_on, left_index, right_index, axis, suffixes, copy, fill_method, how)
1455 how=how,
1456 suffixes=suffixes,
-> 1457 sort=True, # factorize sorts
1458 )
1459
~/anaconda3/envs/py37jp/lib/python3.7/site-packages/pandas/core/reshape/merge.py in __init__(self, left, right, how, on, left_on, right_on, axis, left_index, right_index, sort, suffixes, copy, indicator, validate)
625 self.right_join_keys,
626 self.join_names,
--> 627 ) = self._get_merge_keys()
628
629 # validate the merge keys dtypes. We may need to coerce
~/anaconda3/envs/py37jp/lib/python3.7/site-packages/pandas/core/reshape/merge.py in _get_merge_keys(self)
1622
1623 # note this function has side effects
-> 1624 (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys()
1625
1626 # validate index types are the same
~/anaconda3/envs/py37jp/lib/python3.7/site-packages/pandas/core/reshape/merge.py in _get_merge_keys(self)
972 else:
973 if rk is not None:
--> 974 right_keys.append(right._get_label_or_level_values(rk))
975 join_names.append(rk)
976 else:
~/anaconda3/envs/py37jp/lib/python3.7/site-packages/pandas/core/generic.py in _get_label_or_level_values(self, key, axis)
1690 values = self.axes[axis].get_level_values(key)._values
1691 else:
-> 1692 raise KeyError(key)
1693
1694 # Check for duplicates
KeyError: DatetimeIndex(['2020-01-02 13:00:00.824000+00:00',
'2020-01-08 11:02:52.833000+00:00',
'2020-01-10 03:41:18.858000+00:00',
'2020-01-16 13:00:01.404000+00:00',
'2020-01-22 13:00:01.560000+00:00',
'2020-01-23 13:00:01.493000+00:00',
'2020-01-27 14:38:26.199000+00:00',
'2020-01-27 19:00:00.580000+00:00',
'2020-01-28 21:30:02.279000+00:00',
'2020-01-29 14:33:16.401000+00:00',
...
'2020-03-24 20:13:27.745000+00:00',
'2020-03-24 20:13:39.192000+00:00',
'2020-03-24 20:14:05.710000+00:00',
'2020-03-24 22:42:35.011000+00:00',
'2020-03-25 21:20:00.820000+00:00',
'2020-03-25 21:20:05.833000+00:00',
'2020-03-25 21:20:47.132000+00:00',
'2020-04-06 18:58:34.410000+00:00',
'2020-04-13 15:40:07.672000+00:00',
'2020-04-15 02:43:28.876000+00:00'],
dtype='datetime64[ns, UTC]', name='date', length=54584, freq=None)