Одна из идей — закодировать значения столбца числами с помощью np.select, а затем пройти по ним собственным циклом, скомпилированным с помощью numba:
import numpy as np
import pandas as pd
from numba import njit
# Sample data: a single vehicle whose six-step position pattern repeats
# three times (18 rows in total).
df = pd.DataFrame(
    {
        'Vehicle_ID': ['A'] * 18,
        'Position': 3 * ['START', 'MID', 'MID', 'END', 'MID', 'START'],
    }
)
@njit
def grouper(pos):
    """Label each element with the number of the START..END run it belongs to.

    ``pos`` is an array of integer codes: 0 opens a run (START), 1 closes
    it (END), and -1 is any other position.  Elements inside a run,
    including the opening and closing ones, receive the run's 1-based
    number; elements outside every run receive 0.  A repeated 0 inside an
    already open run does not start a new one.

    Returns an array of the same shape as ``pos`` (default float dtype of
    ``np.empty``); callers cast it to int themselves.
    """
    labels = np.empty(pos.shape)
    group_id = 1
    in_group = False
    for idx in range(len(labels)):
        code = pos[idx]
        if in_group:
            if code == 1:
                # Closing element still belongs to the current run.
                labels[idx] = group_id
                group_id += 1
                in_group = False
            elif code == 0 or code == -1:
                labels[idx] = group_id
            else:
                # Unknown code inside a run: left unlabelled.
                labels[idx] = 0
        elif code == 0:
            in_group = True
            labels[idx] = group_id
        else:
            # Anything seen before a run has been opened is outside.
            labels[idx] = 0
    return labels
# Encode the positions as integers: START -> 0, END -> 1, anything else -> -1.
arr = np.select(
    [df['Position'] == 'START', df['Position'] == 'END'],
    [0, 1],
    default=-1,
)
# grouper returns a float array; store the labels as integers.
df['Group'] = grouper(arr).astype(int)
Результат:
# Show the frame, including the Group column assigned above.
print(df)
Position Vehicle_ID Group
0 START A 1
1 MID A 1
2 MID A 1
3 END A 1
4 MID A 0
5 START A 2
6 START A 2
7 MID A 2
8 MID A 2
9 END A 2
10 MID A 0
11 START A 3
12 START A 3
13 MID A 3
14 MID A 3
15 END A 3
16 MID A 0
17 START A 4
По моему мнению, вам не следует включать «пустые» значения, так как из-за этого серия получит dtype object, неэффективный для любой последующей обработки. Как показано выше, вместо них можно использовать 0.
Сравнительный анализ производительности
numba примерно в 10 раз быстрее, чем один из подходов на чистом pandas:
import pandas as pd, numpy as np
from numba import njit
# Sample frame: 18 rows for a single vehicle.
df = pd.DataFrame({'Vehicle_ID': ['A']*18,
'Position': ['START', 'MID', 'MID', 'END', 'MID', 'START']*3})
# Repeat the sample ten times (180 rows) and renumber the index.
df = pd.concat([df]*10, ignore_index=True)
# Both implementations must return identical frames before they are timed.
assert joz(df.copy()).equals(jpp(df.copy()))
# Every call receives its own copy of the frame; the copy is part of the
# measured time for both functions.
%timeit joz(df.copy()) # 18.6 ms per loop
%timeit jpp(df.copy()) # 1.95 ms per loop
Функции бенчмаркинга:
def joz(df):
    """Number START..END runs of ``df['Position']`` using pandas operations only.

    Rows from a 'START' up to and including the following 'END' receive a
    1-based run number in a new integer ``Group`` column; rows outside any
    run receive 0.  A 'START' that directly follows another 'START' does
    not open a new run.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Position`` column.  It is not modified: all helper
        columns live in a scratch frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df`` plus ``Group``.

    NOTE(review): the running-sum trick assumes STARTs and ENDs alternate.
    An 'END' with no open run pushes the sum below 1, after which later
    'START' rows are no longer recognised.
    """
    # Scratch frame so the helper columns never leak into the caller's frame.
    work = df[['Position']].copy()

    # identification of sequences: +1 on the first START of a run, -1 on END
    work['Position_Prev'] = work['Position'].shift(1)
    work['Sequence'] = 0
    opens_run = (work['Position'] == 'START') & (work['Position_Prev'] != 'START')
    work.loc[opens_run, 'Sequence'] = 1
    work.loc[work['Position'] == 'END', 'Sequence'] = -1

    # The running sum is 1 inside a run; force END rows to 1 as well so the
    # closing row is kept with its run.
    work['Sequence_Sum'] = work['Sequence'].cumsum()
    work.loc[work['Sequence'] == -1, 'Sequence_Sum'] = 1

    # take only items between START and END and generate Group number
    inside = work[work['Sequence_Sum'] == 1].copy()
    inside.loc[inside['Sequence'] == -1, 'Sequence'] = 0
    inside['Group'] = inside['Sequence'].cumsum()

    # merge results to one dataframe; rows outside every run get Group 0
    result = df.merge(inside[['Group']], left_index=True, right_index=True, how='left')
    result['Group'] = result['Group'].fillna(0).astype(int)
    return result
@njit
def grouper(pos):
    """Label each element with the number of the START..END run it belongs to.

    ``pos`` is an array of integer codes: 0 opens a run (START), 1 closes
    it (END), and -1 is any other position.  Elements inside a run,
    including the opening and closing ones, receive the run's 1-based
    number; elements outside every run receive 0.  A repeated 0 inside an
    already open run does not start a new one.

    Returns an array of the same shape as ``pos`` (default float dtype of
    ``np.empty``); callers cast it to int themselves.
    """
    labels = np.empty(pos.shape)
    group_id = 1
    in_group = False
    for idx in range(len(labels)):
        code = pos[idx]
        if in_group:
            if code == 1:
                # Closing element still belongs to the current run.
                labels[idx] = group_id
                group_id += 1
                in_group = False
            elif code == 0 or code == -1:
                labels[idx] = group_id
            else:
                # Unknown code inside a run: left unlabelled.
                labels[idx] = 0
        elif code == 0:
            in_group = True
            labels[idx] = group_id
        else:
            # Anything seen before a run has been opened is outside.
            labels[idx] = 0
    return labels
def jpp(df):
    """Add a ``Group`` column to ``df`` using the compiled ``grouper`` loop.

    The ``Position`` column is first encoded as integers (START -> 0,
    END -> 1, anything else -> -1), then ``grouper`` assigns the run
    numbers.  The column is written into ``df`` itself, and that same
    frame is returned.
    """
    codes = np.select(
        [df['Position'] == 'START', df['Position'] == 'END'],
        [0, 1],
        default=-1,
    )
    # grouper returns floats; the stored labels are integers.
    df['Group'] = grouper(codes).astype(int)
    return df