Я экспериментирую с набором данных Boston (sklearn.datasets.load_boston) и получаю странный вывод из pyspark.ml.feature.Binarizer. Иногда он возвращает ожидаемый результат, а иногда — что-то похожее на пустые списки или кортежи.
from sklearn.datasets import load_boston
import pandas as pd
from pyspark.ml.feature import VectorAssembler, Binarizer
# Load the Boston housing data through scikit-learn's load_boston helper and
# pull out the raw feature matrix together with the feature names.
boston = load_boston()
data, columns = boston.data, boston.feature_names

# Go through pandas to obtain a Spark DataFrame, restore the feature names,
# and keep only the three columns used in this experiment.
# NOTE(review): assumes `spark` is an already-created SparkSession (e.g. the
# one provided by the pyspark shell) -- it is not defined in this snippet.
pandas_frame = pd.DataFrame(data)
df = spark.createDataFrame(pandas_frame).toDF(*columns)
df = df.select('INDUS', 'RAD', 'NOX')

# Pack the selected columns into a single vector column named VECTORIZER.
vec = VectorAssembler(outputCol='VECTORIZER', inputCols=df.columns)
vector = vec.transform(df).select('VECTORIZER')
vector.show(5)
+----------------+
|      VECTORIZER|
+----------------+
|[2.31,1.0,0.538]|
|[7.07,2.0,0.469]|
|[7.07,2.0,0.469]|
|[2.18,3.0,0.458]|
|[2.18,3.0,0.458]|
+----------------+
# Binarize every component of the VECTORIZER column against a threshold of 5
# and write the result to a new BINARIZED column.
# NOTE(review): rows printed as `(3,[],[])` look like Spark's sparse-vector
# rendering (size, indices, values) of an all-zero vector rather than an
# empty result -- confirm against the pyspark.ml.linalg documentation.
binarizer = Binarizer(threshold=5, inputCol='VECTORIZER', outputCol='BINARIZED')
binarized = binarizer.transform(vector)
binarized.show(5)
+----------------+-------------+
|      VECTORIZER|    BINARIZED|
+----------------+-------------+
|[2.31,1.0,0.538]|    (3,[],[])|
|[7.07,2.0,0.469]|[1.0,0.0,0.0]|
|[7.07,2.0,0.469]|[1.0,0.0,0.0]|
|[2.18,3.0,0.458]|    (3,[],[])|
|[2.18,3.0,0.458]|    (3,[],[])|
+----------------+-------------+
Почему корректно выглядят только вторая и третья строки?