Возьмите срез с шагом, а затем заполните пропуски с помощью ffill().
# Copy the value of every third row into 'group'; all other rows get NaN.
df['group'] = df[::3]
# Forward-fill so each row inherits the marker of the row that opened its group.
# Assign the result back instead of calling ffill(inplace=True) on df['group']:
# the in-place call acts on a temporary Series and, with pandas Copy-on-Write,
# never reaches df.
df['group'] = df['group'].ffill()
Теперь вы можете группировать и отбрасывать слишком малые группы.
import pandas as pd

# Split a sorted frame into consecutive groups of `group_size` rows and drop
# the trailing group when it is incomplete.
group_size = 3
# The frame has been sorted before this point and the rows are in the correct
# order. It has a RangeIndex, so a stepped slice picks the first row of each
# group.
df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})

# Copy the slice before adding a column to it; writing to the slice directly
# triggers a chained-assignment warning.
slices = df[::group_size].copy()
# Number the groups 0, 1, 2, ... instead of reusing the index label of the
# first row of each group.
slices['group'] = range(len(slices))

# Index alignment puts the group number on the first row of each group and
# NaN on every other row.
df['group'] = slices['group']
# Forward-fill the group numbers. Assign the result back: an in-place ffill on
# df['group'] works on a temporary Series and may never update df.
df['group'] = df['group'].ffill()

# Trim the last group if it has fewer than group_size rows.
last_group = df['group'].max()
if (df['group'] == last_group).sum() < group_size:
    df = df[df['group'] != last_group]
print(df)
Замеры времени выполнения:
import pandas as pd
from datetime import datetime as dt
# Print the version of pandas that is in use.
print(pd.__version__)
def test1():
    """Benchmark case: attach a prebuilt label column, then filter by group size.

    Builds the labels 0, 0, 0, 1, 1, 1, ... for as many complete groups as fit
    in the frame, joins them to the data column-wise, and keeps only the groups
    that contain exactly ``group_size`` rows. The result is discarded; the
    function exists only to be timed.
    """
    # The frame has been sorted before this point and the rows are in the
    # correct order.
    df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
    group_size = 3

    # One label per row of every complete group; leftover rows get no label.
    complete_groups = len(df.index) // group_size
    numbers = pd.Series(sorted(list(range(complete_groups)) * group_size))

    # Column-wise concat aligns on the index, so unlabeled trailing rows end
    # up with NaN in the label column.
    df = pd.concat([df, numbers], ignore_index=True, axis=1)
    df.columns = ['value', 'group number']
    df.groupby('group number').filter(lambda x: len(x) == group_size)
def test2():
    """Benchmark case: bin the values with ``pd.cut``.

    Won't work well for this task because there is no easy way to calculate
    the remainder that should not be grouped, but ``cut()`` is good for
    discretizing continuous values. The result is discarded; the function
    exists only to be timed.
    """
    # The frame has been sorted before this point and the rows are in the
    # correct order.
    df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
    group_size = 3
    # pd.cut needs an integer number of bins; true division would yield a
    # float (10 / 3) and fail, so use floor division.
    num_groups = len(df.index) // group_size
    df['group'] = pd.cut(df['a'], num_groups, right=False)
def test3():
    """Benchmark case: mark group starts with a stepped slice, then forward-fill.

    The result is discarded; the function exists only to be timed.
    """
    # The frame has been sorted before this point and the rows are in the
    # correct order. It has a RangeIndex, so we get to slice.
    df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
    group_size = 3
    # The sliced frame has a single column, so it can be assigned to one
    # column; rows missing from the slice become NaN.
    df['group'] = df[::group_size]
    # Assign the filled column back: ffill(inplace=True) on df['group'] works
    # on a temporary Series and may never update df.
    df['group'] = df['group'].ffill()
def test4():
    """Benchmark case: mark group starts with a boolean mask, then forward-fill.

    The result is discarded; the function exists only to be timed.
    """
    # The frame has been sorted before this point and the rows are in the
    # correct order.
    df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
    group_size = 3
    # A mask can also be used: keep the rows whose index is a multiple of the
    # group size; every other row becomes NaN on assignment.
    df['group'] = df[df.index % group_size == 0]
    # Assign the filled column back: ffill(inplace=True) on df['group'] works
    # on a temporary Series and may never update df.
    df['group'] = df['group'].ffill()
def test5():
    """Benchmark case: assign group numbers chunk by chunk with ``.loc``.

    The result is discarded; the function exists only to be timed.
    """
    # The frame has been sorted before this point and the rows are in the
    # correct order.
    df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
    group_size = 3
    for group, start in enumerate(range(0, len(df), group_size)):
        # .loc slices include the end label, so stop at start + group_size - 1
        # to write exactly one group's rows per step.
        df.loc[start:start + group_size - 1, 'group'] = group
# Run each approach 1000 times and print its name followed by the elapsed
# wall-clock time.
funcs = [test1, test2, test3, test4, test5]
for func in funcs:
    print(func.__name__)
    started = dt.now()
    for _ in range(1000):
        func()
    elapsed = dt.now() - started
    print(elapsed)