Я применяю глубокое обучение, используя конвейер PySpark. При запуске я получаю следующую ошибку:
TypeError: Invalid argument, not a string or column:
0.5555167887095438 of type <class 'float'>.
For column literals, use 'lit', 'array', 'struct' or 'create_map' function.
Я получил вышеуказанную ошибку. Как я могу преодолеть эту проблему? Код:
df_transform_fin = pipeline_model.select('features','label_index')
df_transform_fin.show()
+--------------------+-----------+
| features|label_index|
+--------------------+-----------+
|(54,[0,1,2,3,4,5,...| 1.0|
|(54,[0,1,2,3,4,5,...| 1.0|
|(54,[0,1,2,3,10,1...| 1.0|
|(54,[0,1,2,3,4,5,...| 1.0|
|(54,[0,1,2,3,10,1...| 1.0|
|(54,[0,1,2,3,10,1...| 1.0|
|(54,[0,1,2,3,4,5,...| 1.0|
|(54,[0,1,2,3,10,1...| 1.0|
|(54,[0,1,2,3,10,1...| 1.0|
|(54,[0,1,2,3,10,1...| 1.0|
|(54,[0,1,2,3,10,1...| 1.0|
|(54,[0,1,2,3,10,1...| 1.0|
|(54,[0,1,2,4,6,7,...| 1.0|
|(54,[0,1,2,3,4,5,...| 1.0|
|(54,[0,1,2,3,4,5,...| 1.0|
|(54,[0,1,2,4,6,7,...| 1.0|
|(54,[0,1,2,3,4,5,...| 1.0|
|(54,[0,1,2,3,4,5,...| 1.0|
|(54,[0,1,2,3,4,5,...| 1.0|
|(54,[0,1,2,3,4,5,...| 1.0|
+--------------------+-----------+
only showing top 20 rows
df_transform_fin.printSchema()
root
|-- features: vector (nullable = true)
|-- label_index: double (nullable = false)
def dl_pipeline_fit_score_results(dl_pipeline=dl_pipeline,
                                  train_data=train_data,
                                  test_data=test_data,
                                  label='label_index'):
    """Fit the DL pipeline on ``train_data``, score both splits, and print
    accuracy plus a confusion matrix for each.

    Parameters
    ----------
    dl_pipeline : pyspark.ml.Pipeline
        Pipeline to fit (defaults to the module-level ``dl_pipeline``).
    train_data, test_data : pyspark.sql.DataFrame
        DataFrames containing a ``features`` column and the label column.
    label : str
        Name of the label column (default ``'label_index'``).

    Returns
    -------
    None — results are printed / ``display``-ed as a side effect.
    """
    fit_dl_pipeline = dl_pipeline.fit(train_data)
    pred_train = fit_dl_pipeline.transform(train_data)
    pred_test = fit_dl_pipeline.transform(test_data)

    pnl_train = pred_train.select(label, "prediction")
    pnl_test = pred_test.select(label, "prediction")

    # MulticlassMetrics expects an RDD of (label, prediction) float pairs.
    pred_and_label_train = pnl_train.rdd.map(lambda row: (row[label], row['prediction']))
    pred_and_label_test = pnl_test.rdd.map(lambda row: (row[label], row['prediction']))

    metrics_train = MulticlassMetrics(pred_and_label_train)
    metrics_test = MulticlassMetrics(pred_and_label_test)

    # BUG FIX: the original called round(metrics_train.precision(), 4).
    # After `from pyspark.sql.functions import *`, the builtin `round` is
    # shadowed by the Spark SQL column function round(col, scale), which
    # rejects a plain Python float with exactly the reported
    # "TypeError: Invalid argument, not a string or column: ... <class 'float'>".
    # Formatting the float with str.format sidesteps the shadowed name.
    # `accuracy` is used instead of the no-argument precision(): precision()
    # without a label is deprecated in Spark 2.x, its micro-averaged value
    # equals accuracy, and the printed text already says "Accuracy".
    print("Training Data Accuracy: {:.4f}".format(metrics_train.accuracy))
    print("Training Data Confusion Matrix")
    # Use the `label` parameter instead of the hardcoded 'label_index' so the
    # function works for any label column name (default keeps old behavior).
    display(pnl_train.crosstab(label, 'prediction').toPandas())

    print("\nTest Data Accuracy: {:.4f}".format(metrics_test.accuracy))
    print("Test Data Confusion Matrix")
    display(pnl_test.crosstab(label, 'prediction').toPandas())
**
dl_pipeline_fit_score_results(dl_pipeline=dl_pipeline,
train_data=train_data,
test_data=test_data,
label='label_index');
>>> Fit model
>>> Synchronous training complete.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-30-cb5b6595cf8f> in <module>()
2 train_data=train_data,
3 test_data=test_data,
----> 4 label='label_index');
2 frames
<ipython-input-29-4451e26ef03d> in dl_pipeline_fit_score_results(dl_pipeline, train_data, test_data, label)
18 metrics_test = MulticlassMetrics(pred_and_label_test)
19
---> 20 print("Training Data Accuracy: {}".format(round(metrics_train.precision(),4)))
21 print("Training Data Confusion Matrix")
22 display(pnl_train.crosstab('label_index', 'prediction').toPandas())
/content/drive/spark+keras/spark-2.4.4-bin-hadoop2.7/python/pyspark/sql/functions.py in round(col, scale)
600 """
601 sc = SparkContext._active_spark_context
--> 602 return Column(sc._jvm.functions.round(_to_java_column(col), scale))
603
604
/content/drive/spark+keras/spark-2.4.4-bin-hadoop2.7/python/pyspark/sql/column.py in _to_java_column(col)
51 "{0} of type {1}. "
52 "For column literals, use 'lit', 'array', 'struct' or 'create_map' "
---> 53 "function.".format(col, type(col)))
54 return jcol
55
TypeError: Invalid argument, not a string or column: 0.5555167887095438 of type <class 'float'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.
**
https://github.com/aviolante/pyspark_dl_pipeline/blob/master/pyspark_dl_pipeline.ipynb