Как упомянул @Parth в комментариях, сначала вам нужен набор данных, подходящий для такого стратифицированного разделения. Затем можно создать новый столбец, объединяющий значения «Customer» и «item_name», и передать его в параметр «stratify» функции «train_test_split» из модуля sklearn.model_selection.
Ниже вы можете найти пример.
import pandas as pd
from sklearn.model_selection import train_test_split
# Sample orders: customers "A" and "B" with nine rows each; the items simply
# alternate between "orange" and "apple" over all eighteen rows.
data = {
    "Customer": ["A"] * 9 + ["B"] * 9,
    "Orderid": [1, 1, 1, 2, 2, 2, 2, 3, 2, 1, 2, 1, 1, 1, 1, 2, 2, 2],
    "item_name": ["orange", "apple"] * 9,
}

# Tabular form of the sample orders.
df = pd.DataFrame(data)

# Stratification key: one label per (Customer, item_name) pair, e.g. "A_orange".
# It is later handed to the "stratify" parameter of train_test_split so that
# every split keeps the same mix of customer/item combinations.
df["CustAndItem"] = df["Customer"].str.cat(df["item_name"], sep="_")
# Stage 1: hold out roughly 40% of the rows as a provisional "test" set and keep
# the remaining ~60% for training. The split is performed on the row labels
# (df.index), stratified by the combined customer/item key, so the full rows can
# be looked up afterwards.
strata = df["CustAndItem"]
X_train, X_test, y_train, y_test = train_test_split(
    df.index,
    strata,
    stratify=strata,
    test_size=0.4,
)

# Materialise both parts as independent copies of the selected rows.
df_train = df.loc[X_train].copy()
df_test = df.loc[X_test].copy()

# Stage 2: cut the provisional test set in half (test_size=0.5), again stratified
# by the same key. Each half is therefore about 20% of the original data: one
# becomes the validation set, the other the final test set.
holdout_strata = df_test["CustAndItem"]
X_validate, X_test, y_validate, y_test = train_test_split(
    df_test.index,
    holdout_strata,
    stratify=holdout_strata,
    test_size=0.5,
)

# Look up the rows for each half; df_test is rebound last because the validation
# rows are still selected from the provisional (40%) frame.
df_validate = df_test.loc[X_validate]
df_test = df_test.loc[X_test]
# Show the three resulting frames in order: train, validation, test.
for split_frame in (df_train, df_validate, df_test):
    print(split_frame)