Spark deep learning pipeline: TypeError: Invalid argument, not a string or column

I am applying deep learning using a PySpark pipeline. At the very last step, it fails with:

TypeError: Invalid argument, not a string or column: 
0.5555167887095438 of type <class 'float'>. 
For column literals, use 'lit', 'array', 'struct' or 'create_map' function.
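
This TypeError comes from pyspark.sql.functions: every function in that module expects a Column or a column name, and it raises exactly this message when handed a plain Python value. A minimal sketch that reproduces it, assuming an active local session (the session setup here is illustrative):

from pyspark.sql import SparkSession
from pyspark.sql.functions import round as spark_round

spark = SparkSession.builder.master("local[1]").getOrCreate()

# passing a Python float where a Column or column name is expected
spark_round(0.5555167887095438, 4)
# TypeError: Invalid argument, not a string or column: 0.5555167887095438 ...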

That is the error I am getting. How can I get past it? The code:

df_transform_fin = pipeline_model.select('features','label_index')
df_transform_fin.show()

+--------------------+-----------+
|            features|label_index|
+--------------------+-----------+
|(54,[0,1,2,3,4,5,...|        1.0|
|(54,[0,1,2,3,4,5,...|        1.0|
|(54,[0,1,2,3,10,1...|        1.0|
|(54,[0,1,2,3,4,5,...|        1.0|
|(54,[0,1,2,3,10,1...|        1.0|
|(54,[0,1,2,3,10,1...|        1.0|
|(54,[0,1,2,3,4,5,...|        1.0|
|(54,[0,1,2,3,10,1...|        1.0|
|(54,[0,1,2,3,10,1...|        1.0|
|(54,[0,1,2,3,10,1...|        1.0|
|(54,[0,1,2,3,10,1...|        1.0|
|(54,[0,1,2,3,10,1...|        1.0|
|(54,[0,1,2,4,6,7,...|        1.0|
|(54,[0,1,2,3,4,5,...|        1.0|
|(54,[0,1,2,3,4,5,...|        1.0|
|(54,[0,1,2,4,6,7,...|        1.0|
|(54,[0,1,2,3,4,5,...|        1.0|
|(54,[0,1,2,3,4,5,...|        1.0|
|(54,[0,1,2,3,4,5,...|        1.0|
|(54,[0,1,2,3,4,5,...|        1.0|
+--------------------+-----------+
only showing top 20 rows

df_transform_fin.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label_index: double (nullable = false)


from pyspark.mllib.evaluation import MulticlassMetrics

def dl_pipeline_fit_score_results(dl_pipeline=dl_pipeline,
                                  train_data=train_data,
                                  test_data=test_data,
                                  label='label_index'):

    fit_dl_pipeline = dl_pipeline.fit(train_data)
    pred_train = fit_dl_pipeline.transform(train_data)
    pred_test = fit_dl_pipeline.transform(test_data)

    pnl_train = pred_train.select(label, "prediction")
    pnl_test = pred_test.select(label, "prediction")

    pred_and_label_train = pnl_train.rdd.map(lambda row: (row[label], row['prediction']))
    pred_and_label_test = pnl_test.rdd.map(lambda row: (row[label], row['prediction']))

    metrics_train = MulticlassMetrics(pred_and_label_train)
    metrics_test = MulticlassMetrics(pred_and_label_test)

    print("Training Data Accuracy: {}".format(round(metrics_train.precision(),4)))
    print("Training Data Confusion Matrix")
    display(pnl_train.crosstab('label_index', 'prediction').toPandas())

    print("\nTest Data Accuracy: {}".format(round(metrics_test.precision(),4)))
    print("Test Data Confusion Matrix")
    display(pnl_test.crosstab('label_index', 'prediction').toPandas())
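
One aside on the function above, separate from the error: MulticlassMetrics is documented to take an RDD of (prediction, label) pairs, while the lambdas above build (label, prediction). The no-argument precision() comes out the same either way (it reduces to overall accuracy), but per-class precision and recall would effectively swap. A sketch of the two mapping lines in the documented order:

pred_and_label_train = pnl_train.rdd.map(lambda row: (row['prediction'], row[label]))
pred_and_label_test = pnl_test.rdd.map(lambda row: (row['prediction'], row[label]))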


dl_pipeline_fit_score_results(dl_pipeline=dl_pipeline,
                              train_data=train_data,
                              test_data=test_data,
                              label='label_index');

>>> Fit model
>>> Synchronous training complete.
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-30-cb5b6595cf8f> in <module>()
      2                               train_data=train_data,
      3                               test_data=test_data,
----> 4                               label='label_index');
<ipython-input-29-4451e26ef03d> in dl_pipeline_fit_score_results(dl_pipeline, train_data, test_data, label)
     18     metrics_test = MulticlassMetrics(pred_and_label_test)
     19 
---> 20     print("Training Data Accuracy: {}".format(round(metrics_train.precision(),4)))
     21     print("Training Data Confusion Matrix")
     22     display(pnl_train.crosstab('label_index', 'prediction').toPandas())
/content/drive/spark+keras/spark-2.4.4-bin-hadoop2.7/python/pyspark/sql/functions.py in round(col, scale)
    600     """
    601     sc = SparkContext._active_spark_context
--> 602     return Column(sc._jvm.functions.round(_to_java_column(col), scale))
    603 
    604 
/content/drive/spark+keras/spark-2.4.4-bin-hadoop2.7/python/pyspark/sql/column.py in _to_java_column(col)
     51             "{0} of type {1}. "
     52             "For column literals, use 'lit', 'array', 'struct' or 'create_map' "
---> 53             "function.".format(col, type(col)))
     54     return jcol
     55 
TypeError: Invalid argument, not a string or column: 0.5555167887095438 of type <class 'float'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.
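
The paths in the traceback give the cause away: the round being called is pyspark.sql.functions.round (note the .../pyspark/sql/functions.py frame), not Python's built-in round. That happens when the functions module has been imported with a wildcard, e.g. from pyspark.sql.functions import *; pyspark's round expects a column, so handing it the float returned by metrics_train.precision() raises the TypeError above. A minimal sketch of the usual fix, assuming such a wildcard import earlier in the notebook:

import builtins

# builtins.round always refers to the Python built-in, even after
# pyspark.sql.functions.round has shadowed the name round
print("Training Data Accuracy: {}".format(builtins.round(metrics_train.precision(), 4)))
print("Test Data Accuracy: {}".format(builtins.round(metrics_test.precision(), 4)))

Alternatively, drop the wildcard and import the module under an alias (import pyspark.sql.functions as F) so the built-in round stays visible.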


For reference, the code is based on this notebook:

https://github.com/aviolante/pyspark_dl_pipeline/blob/master/pyspark_dl_pipeline.ipynb

...