I was hoping to reproduce the following Python k-fold target encoding strategy in pure Spark:
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np
dataset = pd.read_csv("dataset.csv")
cols_to_encode = ["cat_1", "cat_2"]
X_train, X_test, y_train, y_test = train_test_split(dataset[cols_to_encode], dataset["target"], test_size=0.25, random_state=0)
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
encoder = ce.WOEEncoder(cols=cols_to_encode)
X_train_fitted = pd.DataFrame([], columns = X_train.columns)
kf = KFold(n_splits=5, shuffle=False)  # random_state has no effect (and is rejected) when shuffle=False
for tr_ind, val_ind in kf.split(X_train, y_train):
    # fit on the other folds, transform the held-out fold (out-of-fold encoding)
    encoder.fit(X_train.loc[tr_ind], y_train.loc[tr_ind])
    X_train_fitted = pd.concat([X_train_fitted, encoder.transform(X_train.loc[val_ind])])
encoder.fit(X_train, y_train)
X_test_fitted = encoder.transform(X_test)
C = np.logspace(0, 4, num = 10)
penalty = ['l1', 'l2', 'elasticnet', 'none']
solver = ['liblinear', 'saga', "sag", "lbfgs"]
params = dict(C=C, penalty=penalty, solver=solver)
param_comb = 10
lr = LogisticRegression(random_state=0)
# reuse the same CV splits as the encoding loop
random_search = RandomizedSearchCV(lr, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=2, verbose=3, random_state=0, cv = kf.split(X_train, y_train))
random_search.fit(X_train_fitted, y_train)
print(random_search.best_score_)
Since I could not find an existing implementation, I am trying to write my own. So far, for a mean-encoding strategy, all I could come up with is a groupBy over the training folds that produces a lookup dataframe, which I then join onto the validation fold. This approach feels very manual and is hard to extend to more complex encodings. I would like to reproduce it in a more Spark-idiomatic way, just like the encoder works in the Python code above (see the sketches after the Spark code below).
Edit: my code attempt in Spark:
import spark.implicits._
import org.apache.spark.ml.Pipeline
val simpleData = Seq(("James","Sales","NY",90000,34,10000),
("Michael","Sales","NY",86000,56,20000),
("Robert","Sales","CA",81000,30,23000),
("Maria","Finance","CA",90000,24,23000),
("Raman","Finance","CA",99000,40,24000),
("Scott","Finance","NY",83000,36,19000),
("Jen","Finance","NY",79000,53,15000),
("Jeff","Marketing","CA",80000,25,18000),
("Kumar","Marketing","NY",91000,50,21000)
)
val df = simpleData.toDF("employee_name","department","state","salary","age","bonus") // column order must match the tuples: state comes before salary
df.show()
val splitDF = df.randomSplit(Array(1.0, 1.0, 1.0, 1.0, 1.0)) // randomSplit expects Array[Double]; weights are normalized
val (df1,df2,df3,df4,df5) = (splitDF(0),splitDF(1),splitDF(2),splitDF(3),splitDF(4))
val df1_encoded_train = df2.union(df3).union(df4).union(df5).groupBy("department").mean("bonus")
val df1_encoded_val = df1.join(df1_encoded_train, Seq("department"), "left")
val df2_encoded_train = df1.union(df3).union(df4).union(df5).groupBy("department").mean("bonus")
val df2_encoded_val = df2.join(df2_encoded_train, Seq("department"), "left")
val df3_encoded_train = df1.union(df2).union(df4).union(df5).groupBy("department").mean("bonus")
val df3_encoded_val = df3.join(df3_encoded_train, Seq("department"), "left")
val df4_encoded_train = df1.union(df2).union(df3).union(df5).groupBy("department").mean("bonus")
val df4_encoded_val = df4.join(df4_encoded_train, Seq("department"), "left")
val df5_encoded_train = df1.union(df2).union(df3).union(df4).groupBy("department").mean("bonus")
val df5_encoded_val = df5.join(df5_encoded_train, Seq("department"), "left")
val df_encoded = df1_encoded_val.union(df2_encoded_val).union(df3_encoded_val).union(df4_encoded_val).union(df5_encoded_val)
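The five copies of the same groupBy/join can be collapsed into one loop by materializing a fold id column and filtering on it. Below is a minimal sketch of that idea; kFoldMeanEncode and the "fold" column are my own hypothetical names, not a Spark API, and the random fold assignment via rand is an assumption (a hash of a stable key would also work):
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

// Hypothetical helper: out-of-fold mean encoding driven by a fold column.
def kFoldMeanEncode(df: DataFrame, catCol: String, targetCol: String, k: Int): DataFrame = {
  // Assign each row a fold id once and cache, so the assignment stays stable.
  val withFold = df.withColumn("fold", (rand(0L) * k).cast("int")).cache()
  (0 until k).map { fold =>
    val train = withFold.filter(col("fold") =!= fold)   // the other k-1 folds
    val valid = withFold.filter(col("fold") === fold)   // the held-out fold
    // Learn the encoding on the training folds only, then join it onto the held-out fold.
    val encoding = train.groupBy(catCol).agg(avg(targetCol).as(s"${catCol}_mean_${targetCol}"))
    valid.join(encoding, Seq(catCol), "left")
  }.reduce(_ union _).drop("fold")
}

val dfEncoded = kFoldMeanEncode(df, "department", "bonus", k = 5)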
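Since the Python code uses WOEEncoder rather than a plain mean, the avg aggregation inside the loop could be swapped for a weight-of-evidence table. Here is a sketch for a binary 0/1 target; the eps smoothing term is my own simplification and does not match category_encoders' regularization exactly:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

// Hypothetical WOE table for a binary 0/1 target: log of the event rate over
// the non-event rate per category, with eps smoothing against empty cells.
def woeEncoding(train: DataFrame, catCol: String, targetCol: String, eps: Double = 0.5): DataFrame = {
  val totals = train.agg(
    sum(col(targetCol)).cast("double").as("events"),
    sum(lit(1) - col(targetCol)).cast("double").as("nonEvents")
  ).first()
  val (totalEvents, totalNonEvents) = (totals.getDouble(0), totals.getDouble(1))
  train.groupBy(catCol)
    .agg(sum(col(targetCol)).cast("double").as("ev"),
         sum(lit(1) - col(targetCol)).cast("double").as("nonEv"))
    .withColumn(s"${catCol}_woe",
      log(((col("ev") + eps) / totalEvents) / ((col("nonEv") + eps) / totalNonEvents)))
    .select(catCol, s"${catCol}_woe")
}
Replacing the avg aggregation in kFoldMeanEncode with a join against this table would give the k-fold WOE scheme from the Python snippet.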