Вы можете использовать resample, а затем reindex в индексе временных рядов, чтобы сделать то, что вы хотите:
Resample позволяет изменять частоту индекса даты и времени. В этом случае вы хотите «увеличить выборку» — увеличить количество шагов в ваших данных. Reindex затем позволяет заполнять пробелы значениями NA.
import pandas as pd
from datetime import datetime, timedelta
import math
def main(input_file="untitled.txt", minutes_per_segment=5):
    """Read a task list and count running tasks per fixed-width time slot.

    Reads a CSV with columns (Index, Task, Start, Finish), splits each
    task into ``minutes_per_segment``-minute slots, and returns a
    DataFrame with columns Timeslot, Start_Time, End_Time, Tasks_Running
    and Task_Names (the list of task names active in each slot).

    Parameters
    ----------
    input_file : str
        Path to the CSV file to read.
    minutes_per_segment : int
        Width of each time slot, in minutes.

    Returns
    -------
    pandas.DataFrame
    """
    df = pd.read_csv(input_file,
                     parse_dates=['Start', 'Finish'],
                     names=['Index', 'Task', 'Start', 'Finish'],
                     index_col='Index',
                     header=0)

    # Re-parse explicitly: day-first timestamps, unparseable values -> NaT.
    df['Start'] = pd.to_datetime(df['Start'], dayfirst=True, errors='coerce')
    df['Finish'] = pd.to_datetime(df['Finish'], dayfirst=True, errors='coerce')

    # Number of <minutes_per_segment>-minute segments each task runs for,
    # rounded up to the next integer value.
    df['Segments'] = df.apply(
        lambda row: math.ceil(
            (row.Finish - row.Start).total_seconds() / 60 / minutes_per_segment),
        axis='columns')

    # Make task names unique by suffixing the row index.  You can skip this
    # step if the values in your Task column are already unique.
    df['Task_ID'] = df.index.astype(str)
    df['Task_Name'] = df.apply(lambda row: '_'.join([row.Task, row.Task_ID]),
                               axis=1)

    # Stack start and finish times into one 'Time' column so every task
    # contributes two rows (its first and last slot boundary).
    df2 = pd.concat([df[['Task_Name', 'Start', 'Segments']]
                     .rename(columns={'Start': 'Time'}),
                     df[['Task_Name', 'Finish', 'Segments']]
                     .rename(columns={'Finish': 'Time'})])
    df2 = df2.sort_values(by='Task_Name').set_index('Time')
    df2.index = pd.DatetimeIndex(df2.index)

    # 'min' is the supported alias; 'T' is deprecated since pandas 2.2.
    freq = '{}min'.format(minutes_per_segment)

    # Per task: resample into fixed-width slots and forward-fill so every
    # slot between a task's start and finish gets a row.  Selecting
    # [['Segments']] keeps the grouping column out of the applied frame
    # (avoids the pandas `include_groups` deprecation); the task name is
    # restored from the group key by reset_index().
    df3 = (df2.groupby('Task_Name', group_keys=True)[['Segments']]
           .apply(lambda g: g.resample(rule=freq,
                                       label='right',
                                       closed='right')
                  .asfreq()
                  .ffill())
           .reset_index())  # -> columns: Task_Name, Time, Segments

    # Datetime index is needed for the reindex step below.
    df3.set_index('Time', inplace=True)
    df3.index = pd.DatetimeIndex(df3.index)

    # Aggregate per slot: count the running tasks and list their names.
    # Named aggregation replaces the nested-dict .agg spec, which was
    # removed in pandas 1.0, and produces flat column names directly.
    # Drop the Task_Names line (and in the 'reorder' step below) if you
    # do not need the name list.
    df4 = (df3.reset_index()
           .groupby('Time')
           .agg(Tasks_Running=('Task_Name', 'count'),
                Task_Names=('Task_Name', list))
           # reindex so slots with no running task still appear
           .reindex(pd.date_range(start=df3.index.min(),
                                  end=df3.index.max(),
                                  freq=freq)))
    df4.index.name = 'Start_Time'
    df4.reset_index(inplace=True)

    # Slots newly added by the reindex have 0 running tasks.
    df4['Tasks_Running'] = df4['Tasks_Running'].fillna(0)

    # End of each slot = start of the next one (last row keeps its start).
    df4['End_Time'] = df4.Start_Time.shift(-1).ffill()

    # Reorder the columns for ease of reading; drop 'Task_Names' here if
    # you dropped it in the agg step above.
    df4 = df4[['Start_Time', 'End_Time', 'Tasks_Running', 'Task_Names']]
    df4.index.name = 'Timeslot'
    df4.reset_index(inplace=True)
    return df4
# Run the example pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Это дает вам:
Timeslot Start_Time End_Time Tasks_Running Task_Names
0 0 2018-10-15 13:30:00 2018-10-15 13:35:00 1.0 [RandomName0]
1 1 2018-10-15 13:35:00 2018-10-15 13:40:00 1.0 [RandomName0]
2 2 2018-10-15 13:40:00 2018-10-15 13:45:00 2.0 [RandomName0, RandomName1]
3 3 2018-10-15 13:45:00 2018-10-15 13:50:00 2.0 [RandomName0, RandomName1]
4 4 2018-10-15 13:50:00 2018-10-15 13:55:00 2.0 [RandomName1, RandomName2]
5 5 2018-10-15 13:55:00 2018-10-15 14:00:00 2.0 [RandomName1, RandomName2]
6 6 2018-10-15 14:00:00 2018-10-15 14:05:00 0.0 NaN
7 7 2018-10-15 14:05:00 2018-10-15 14:10:00 0.0 NaN
8 8 2018-10-15 14:10:00 2018-10-15 14:15:00 1.0 [RandomName3]
9 9 2018-10-15 14:15:00 2018-10-15 14:20:00 1.0 [RandomName3]
10 10 2018-10-15 14:20:00 2018-10-15 14:25:00 1.0 [RandomName4]
11 11 2018-10-15 14:25:00 2018-10-15 14:30:00 1.0 [RandomName4]
12 12 2018-10-15 14:30:00 2018-10-15 14:35:00 1.0 [RandomName5]
13 13 2018-10-15 14:35:00 2018-10-15 14:35:00 1.0 [RandomName5]