Интеграция Hyperopt со Spark MlLib - PullRequest
1 голос
/ 13 февраля 2020

У кого-нибудь есть хороший пример интеграции Hyperopt в MlLib Spark? Я пытался сделать это на Databricks и продолжаю получать ту же ошибку. Я не уверен, является ли это проблемой с моей целевой функцией или вместо этого, если это что-то с Spark ML на pyspark и как он подключается к Databricks.

import itertools
from pyspark.sql import functions as f
from pyspark.sql import DataFrame
from pyspark.sql.types import *

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import OneHotEncoder, Imputer, VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, GBTClassifier
from pyspark.ml.classification import GBTClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import numpy as np
from itertools import product
from hyperopt import fmin, hp, tpe, STATUS_OK, SparkTrials
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

search_space ={'maxDepth'   : hp.choice("maxDepth", np.arange(3, 8, dtype=int)),
        'maxIter'       : hp.uniform("maxIter", 200,800),
        'featureSubsetStrategy' : str(hp.quniform("featureSubsetStrategy", .5,1,.1)),
        'minInstancesPerNode' : hp.uniform("min_child_weight", 1,10),
        'stepSize'    : hp.loguniform('stepSize', np.log(0.01), np.log(0.1)),
        'subsamplingRate'    : hp.quniform("featureSubsetStrategy", .5,1,.1)   
    }
evaluator = BinaryClassificationEvaluator(labelCol="positive")

def train(params):
  gbtModel = GBTClassifier(labelCol="positive", featuresCol="features").fit(train)
  predictions_val = gbtModel.predict(val.map(lambda x: x.features))
  labelsAndPredictions = val.map(lambda lp: lp.label).zip(predictions_val)
  ROC = evaluator.evaluate(predictions_val, {evaluator.metricName: "areaUnderROC"})

  return {'ROC': ROC, 'status': STATUS_OK}



N_HYPEROPT_PROBES = 1000 #can increase, keep small for testing
EARLY_STOPPING = 50
HYPEROPT_ALGO = tpe.suggest
NB_CV_FOLDS = 5 # for testing, can increase

obj_call_count = 0
cur_best_score = 1000000
spark_trials = SparkTrials(parallelism=4)
best = fmin(fn=train,
             space=search_space,
              algo=HYPEROPT_ALGO,
                     max_evals=N_HYPEROPT_PROBES,
                     trials=spark_trials,
                     verbose=1) 

После этого я получаю следующую ошибку:

Total Trials: 0: 0 succeeded, 0 failed, 0 cancelled. py4j.Py4JException: Method __getstate__([]) does not exist

...