K-fold target encoding in Spark

I was hoping to reproduce the following Python k-fold target encoding strategy in pure Spark:

import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np

dataset = pd.read_csv("dataset.csv")
cols_to_encode = ["cat_1", "cat_2"]

X_train, X_test, y_train, y_test = train_test_split(dataset[cols_to_encode], dataset["target"], test_size=0.25, random_state=0)

X_train = X_train.reset_index(drop=True)     
y_train = y_train.reset_index(drop=True)

encoder = ce.WOEEncoder(cols=cols_to_encode)

X_train_fitted = pd.DataFrame([], columns = X_train.columns)

kf = KFold(n_splits=5, shuffle=False)  # random_state is not allowed when shuffle=False
for tr_ind, val_ind in kf.split(X_train, y_train):
    # fit on the training folds, encode the held-out fold (out-of-fold encoding)
    encoder.fit(X_train.loc[tr_ind], y_train.loc[tr_ind])
    X_train_fitted = pd.concat([X_train_fitted, encoder.transform(X_train.loc[val_ind])])

encoder.fit(X_train, y_train)
X_test_fitted = encoder.transform(X_test)

C = np.logspace(0, 4, num = 10)
penalty = ['l1', 'l2', 'elasticnet', 'none']
solver = ['liblinear', 'saga', "sag", "lbfgs"]

params = dict(C=C, penalty=penalty, solver=solver)

param_comb = 10

lr = LogisticRegression(random_state=0)
# reuse the same CV splits for the hyperparameter search
random_search = RandomizedSearchCV(lr, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=2, verbose=3, random_state=0, cv = kf.split(X_train, y_train))
random_search.fit(X_train_fitted, y_train)
print(random_search.best_score_)

Since I couldn't find any existing implementation, I'm trying to write my own. So far, for a mean-encoding strategy, the best I could come up with is a groupBy over the training folds that produces a DataFrame, which I then join onto the validation fold. This approach feels quite manual and would be hard to reproduce for more complex encodings. I'd like to do it in a more Spark-native way, just as the encoder works in the Python code above.
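For example, even a weight-of-evidence encoder (to match the WOEEncoder above) seems to need a hand-rolled aggregation along these lines. This is a rough, untested sketch using one common WOE definition, assuming a binary 0/1 label column called target (my toy data below doesn't have one); trainDF and valDF are placeholder names:

import org.apache.spark.sql.functions._

// WOE(c) = ln( (events_c / events_total) / (non_events_c / non_events_total) )
// (category_encoders also adds regularization terms; omitted here for brevity)
val totals = trainDF.agg(
  sum("target").cast("double").as("eventsTotal"),
  sum(lit(1) - col("target")).cast("double").as("nonEventsTotal")
).first()
val (eventsTotal, nonEventsTotal) = (totals.getDouble(0), totals.getDouble(1))

val woeByCategory = trainDF
  .groupBy("department")
  .agg(
    sum("target").cast("double").as("events"),
    sum(lit(1) - col("target")).cast("double").as("nonEvents")
  )
  .withColumn("woe_department",
    log((col("events") / eventsTotal) / (col("nonEvents") / nonEventsTotal)))
  .select("department", "woe_department")

// encode the validation fold with the statistics computed on the training folds
val valEncoded = valDF.join(woeByCategory, Seq("department"), "left")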

Edit: my attempt in Spark:

import spark.implicits._
import org.apache.spark.ml.Pipeline

  val simpleData = Seq(("James","Sales","NY",90000,34,10000),
    ("Michael","Sales","NY",86000,56,20000),
    ("Robert","Sales","CA",81000,30,23000),
    ("Maria","Finance","CA",90000,24,23000),
    ("Raman","Finance","CA",99000,40,24000),
    ("Scott","Finance","NY",83000,36,19000),
    ("Jen","Finance","NY",79000,53,15000),
    ("Jeff","Marketing","CA",80000,25,18000),
    ("Kumar","Marketing","NY",91000,50,21000)
  )
  val df = simpleData.toDF("employee_name","department","state","salary","age","bonus")
  df.show()

val splitDF = df.randomSplit(Array(1.0, 1.0, 1.0, 1.0, 1.0))
val (df1,df2,df3,df4,df5) = (splitDF(0),splitDF(1),splitDF(2),splitDF(3),splitDF(4))

// for each fold: compute the per-department mean bonus on the other four folds, then join it back
val df1_encoded_train = df2.union(df3).union(df4).union(df5).groupBy("department").mean("bonus")
val df1_encoded_val = df1.join(df1_encoded_train, Seq("department"), "left")

val df2_encoded_train = df1.union(df3).union(df4).union(df5).groupBy("department").mean("bonus")
val df2_encoded_val = df2.join(df2_encoded_train, Seq("department"), "left")    

val df3_encoded_train = df1.union(df2).union(df4).union(df5).groupBy("department").mean("bonus")
val df3_encoded_val = df3.join(df3_encoded_train, Seq("department"), "left")    

val df4_encoded_train = df1.union(df2).union(df3).union(df5).groupBy("department").mean("bonus")
val df4_encoded_val = df4.join(df4_encoded_train, Seq("department"), "left")    

val df5_encoded_train = df1.union(df2).union(df3).union(df4).groupBy("department").mean("bonus")
val df5_encoded_val = df5.join(df5_encoded_train, Seq("department"), "left")

val df_encoded = df1_encoded_val.union(df2_encoded_val).union(df3_encoded_val).union(df4_encoded_val).union(df5_encoded_val)
...
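This works, but I'd like to generalize it. The closest I've got to a less manual version is the following untested sketch, which assigns a pseudo-random fold id and loops over the folds; the fold column and the department_mean_bonus name are just placeholders I picked:

import org.apache.spark.sql.functions._

val k = 5
// assign each row a pseudo-random fold id in [0, k)
val withFold = df.withColumn("fold", (rand(0) * k).cast("int"))

val dfEncoded = (0 until k).map { f =>
  // per-department mean bonus computed on every fold except f ...
  val foldMeans = withFold
    .filter(col("fold") =!= f)
    .groupBy("department")
    .agg(avg("bonus").as("department_mean_bonus"))
  // ... joined back onto fold f only
  withFold
    .filter(col("fold") === f)
    .join(foldMeans, Seq("department"), "left")
}.reduce(_ union _)

Is there a more idiomatic way to express this kind of k-fold encoder in Spark, ideally one that extends to encodings like WOE?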