Опираясь на решение, предложенное @wwnde, я нашёл вариант, который лучше масштабируется на моём реальном наборе данных:
import numpy as np
import pandas as pd
# Sample data: df1 holds the reference ("opening") times, df2 the events that
# are matched against them. Built through the `pd` namespace because the file
# only does `import pandas as pd`.
df1 = pd.DataFrame({
    'id': ['a'] * 4,
    'date': ['02-02-2015'] * 4,
    'time_1': ['08:00:00', '09:00:00', '10:30:00', '12:45:00'],
})
df2 = pd.DataFrame({
    'id': ['a'] * 7,
    'date': [
        '02-02-2015',
        '02-02-2015',
        '03-02-2015',  # small change here relative to the df in my first post
        '02-02-2015',
        '02-02-2015',
        '02-02-2015',
        '02-02-2015',
    ],
    'time_2': [
        '08:00:00', '08:09:00', '08:04:01', '08:52:36',
        '09:34:25', '10:30:00', '11:23:38',
    ],
})
----------------------------------------------------
def preproDf(df1, df2, time_1, time_2, _id, date):
    """Prepare both dataframes, in place, for ``pd.merge_asof``.

    The time columns are parsed to datetimes and each frame is sorted by its
    time column. Both steps modify the frames that were passed in, so the
    caller sees the result without needing a return value.

    Args:
        df1: pd.DataFrame, left dataframe.
        df2: pd.DataFrame, right dataframe.
        time_1: str, name of the time column in ``df1``.
        time_2: str, name of the time column in ``df2``.
        _id: str, name of the id column (same in both dataframes).
        date: str, name of the date column (same in both dataframes).

    Returns:
        None. ``df1`` and ``df2`` are modified in place.
    """
    # Element-wise parsing is kept so that values with differing formats in
    # the same column are still parsed one by one.
    df2[time_2] = df2[time_2].apply(pd.to_datetime)
    df1[time_1] = df1[time_1].apply(pd.to_datetime)

    # merge_asof needs each frame ordered by its time key. Sorting in place
    # makes the ordering visible to the caller (rebinding the local names
    # would silently discard it). The time column is the primary key so the
    # whole frame is monotonic in time; id/date only break ties.
    df1.sort_values([time_1, _id, date], inplace=True)
    df2.sort_values([time_2, _id, date], inplace=True)
def processDF(df1, df2, time_1, time_2, _id, date):
    """Return, per ``(_id, date, time_1)``, the largest lag of a matched event.

    Every row of ``df2`` is matched to the most recent row of ``df1`` that has
    the same ``_id`` and ``date`` and whose ``time_1`` is at most two hours
    before (or equal to) the row's ``time_2``. The lag ``time_2 - time_1`` is
    stored as ``timeDifference`` and the maximum lag per ``df1`` row is
    returned. Rows of ``df2`` without a match are dropped by the grouping.

    Args:
        df1: pd.DataFrame holding the reference times; ``time_1`` must already
            be a datetime column.
        df2: pd.DataFrame holding the events; ``time_2`` must already be a
            datetime column.
        time_1: str, name of the time column in ``df1``.
        time_2: str, name of the time column in ``df2``.
        _id: str, name of the id column (same in both dataframes).
        date: str, name of the date column (same in both dataframes).

    Returns:
        pd.DataFrame with columns ``[_id, date, time_1, 'timeDifference']``,
        one row per ``df1`` row that received at least one match.
    """
    # merge_asof requires both inputs ordered by their time key; sort copies
    # here so the function does not depend on the caller's row order and does
    # not mutate the inputs.
    events = df2.sort_values(time_2)
    reference = df1.sort_values(time_1)

    # `by=` already restricts matches to rows with equal (_id, date), so a
    # single call covers every group. This replaces the per-group loop and
    # keeps real datetime dtypes instead of stacking object arrays.
    merged = pd.merge_asof(
        events,
        reference,
        left_on=time_2,
        right_on=time_1,
        by=[_id, date],
        tolerance=pd.Timedelta(hours=2),
        allow_exact_matches=True,
        direction='backward',
    )

    merged['timeDifference'] = merged[time_2] - merged[time_1]

    # Unmatched events have a missing time_1 and are excluded by groupby.
    longest = merged.groupby([_id, date, time_1])['timeDifference'].max()
    return longest.reset_index()
Результат:
# Parse the 'time_1' / 'time_2' string columns of df1 and df2 to datetimes
# (the columns are overwritten on the frames themselves).
preproDf(df1, df2, 'time_1', 'time_2', 'id', 'date')
# Match each df2 row to a preceding df1 row with the same id/date and report
# the largest time_2 - time_1 difference per (id, date, time_1).
processDF(df1, df2, 'time_1', 'time_2', 'id', 'date')
id date time_1 timeDifference
0 a 02-02-2015 2020-05-29 08:00:00 00:52:36
1 a 02-02-2015 2020-05-29 09:00:00 00:34:25
2 a 02-02-2015 2020-05-29 10:30:00 00:53:38