Можно сделать что-то вроде этого:
import pandas as pd
# Sample activity log: one row per (user, interval). Timestamps are written
# as 'dd/mm/yy HH:MM' strings.
data = pd.DataFrame(
    data=[
        (1, '15/05/16 13:51', '15/05/16 14:06'),
        (1, '15/05/16 14:06', '15/05/16 14:32'),
        (1, '15/05/16 14:32', '15/05/16 14:34'),
        (2, '15/05/16 11:14', '15/05/16 11:25'),
        (2, '15/05/16 11:25', '15/05/16 12:09'),
        (2, '15/05/16 12:14', '15/05/16 12:42'),
        (2, '15/05/16 17:33', '15/05/16 17:41'),
        (2, '15/05/16 17:41', '15/05/16 18:27'),
    ],
    columns=['userid', 'start', 'end'],
)
from datetime import datetime
# Parse the 'dd/mm/yy HH:MM' strings into datetime columns, one vectorized
# call per column instead of a Python-level strptime per cell.
data['start'] = pd.to_datetime(data['start'], format='%d/%m/%y %H:%M')
data['end'] = pd.to_datetime(data['end'], format='%d/%m/%y %H:%M')

# Gap in minutes between a row's start and the previous row's end *for the
# same user*. shift() inside the groupby keeps one user's last interval from
# being compared with the next user's first interval.
# total_seconds() is used rather than .seconds: .seconds drops the days
# component, so a gap of 1 day 5 min would read as 5 min, and a negative gap
# would wrap around to a large positive number.
diffData = (data['start'] - data.groupby('userid')['end'].shift()).dt.total_seconds() / 60

# The first row of each user has no predecessor (NaN after shift); treat its
# gap as 0.
data['diff'] = diffData.fillna(0)
def getStartEnd(tempData, THRESHOLD):
    """Merge consecutive intervals into sessions split at large gaps.

    Rows are read in their existing order. A new session starts at every row
    (other than the first) whose 'diff' value is strictly greater than
    THRESHOLD; the 'diff' of the first row is never inspected.

    Args:
        tempData: DataFrame with columns 'userid', 'start', 'end' and 'diff'.
            Its index is ignored; rows are addressed by position.
        THRESHOLD: largest 'diff' value that still keeps a row in the
            current session.

    Returns:
        DataFrame with columns ['userid', 'start', 'end'], one row per
        session. Empty (with the same columns) when tempData has no rows.
    """
    columns = ['userid', 'start', 'end']

    # drop=True: only positional labels 0..n-1 are needed, and it avoids a
    # clash when the frame already has a column named 'index'.
    rows = tempData.reset_index(drop=True)
    if rows.empty:
        return pd.DataFrame(columns=columns)

    sessions = []
    sessionStart = rows.loc[0, 'start']
    for i in range(1, len(rows)):
        if rows.loc[i, 'diff'] > THRESHOLD:
            # The gap before row i is too large: close the session at row i-1.
            sessions.append([rows.loc[i - 1, 'userid'], sessionStart, rows.loc[i - 1, 'end']])
            sessionStart = rows.loc[i, 'start']

    # Close the trailing session. The last row is addressed explicitly rather
    # than through the loop variable, which is unbound for one-row input.
    last = len(rows) - 1
    sessions.append([rows.loc[last, 'userid'], sessionStart, rows.loc[last, 'end']])
    return pd.DataFrame(sessions, columns=columns)
# Largest gap, in minutes, between two intervals of the same session.
SESSION_GAP_MINUTES = 60

# Merge each user's intervals separately and concatenate once at the end.
# Growing a frame with pd.concat inside the loop re-copies everything on each
# iteration, and seeding it with an empty frame can degrade column dtypes.
# sort=False keeps users in order of first appearance, as
# data['userid'].unique() did.
userSessions = [
    getStartEnd(userRows, SESSION_GAP_MINUTES)
    for _, userRows in data.groupby('userid', sort=False)
]

# pd.concat raises on an empty list, so fall back to an empty frame.
# Each per-user frame keeps its own 0-based index, so labels repeat.
if userSessions:
    finalData = pd.concat(userSessions)
else:
    finalData = pd.DataFrame(columns=['userid', 'start', 'end'])
print(finalData)
userid start end
0 1 2016-05-15 13:51:00 2016-05-15 14:34:00
0 2 2016-05-15 11:14:00 2016-05-15 12:42:00
1 2 2016-05-15 17:33:00 2016-05-15 18:27:00