Использование OneHotEncoderEstimator()
# Echo the Spark version of the active session (the value printed is shown on the next line).
# NOTE(review): OneHotEncoderEstimator was renamed to OneHotEncoder in Spark 3.0 -- confirm before running this example on a newer release.
spark.version
'2.4.3'
# Toy input: three rows of (id, age).
df = spark.createDataFrame(
    [
        (1, 30),
        (2, 25),
        (3, 21),
    ],
    ["id", "age"],
)

# Bin the numeric "age" column into bucket indices written to "age_bucket".
from pyspark.ml.feature import Bucketizer

bucketizer = Bucketizer(
    splits=[20, 24, 27, 30],
    inputCol="age",
    outputCol="age_bucket",
    handleInvalid="keep",
)
buckets = bucketizer.transform(df)
buckets.show()
+---+---+----------+
| id|age|age_bucket|
+---+---+----------+
|  1| 30|       2.0|
|  2| 25|       1.0|
|  3| 21|       0.0|
+---+---+----------+
# One-hot encode the bucket index from "age_bucket" into the vector column "age_ohe".
from pyspark.ml.feature import OneHotEncoderEstimator

encoder = OneHotEncoderEstimator(
    inputCols=["age_bucket"],
    outputCols=["age_ohe"],
)
# The encoder is an estimator: fit it on the bucketed frame first, then apply
# the fitted model to that same frame.
model = encoder.fit(buckets)
transform_model = model.transform(buckets)
transform_model.show()
+---+---+----------+-------------+
| id|age|age_bucket|      age_ohe|
+---+---+----------+-------------+
|  1| 30|       2.0|    (2,[],[])|
|  2| 25|       1.0|(2,[1],[1.0])|
|  3| 21|       0.0|(2,[0],[1.0])|
+---+---+----------+-------------+
# Optional: chain the same two stages in a Pipeline, so one fit/transform on
# the raw frame runs bucketizing followed by one-hot encoding.
from pyspark.ml import Pipeline

bucketizer = Bucketizer(
    splits=[20, 24, 27, 30],
    inputCol="age",
    outputCol="age_bucket",
    # Same setting as the standalone Bucketizer earlier in this example, so
    # the pipeline variant handles invalid (NaN) ages the same way instead of
    # falling back to the default "error" behaviour.
    handleInvalid="keep",
)
encoder = OneHotEncoderEstimator(inputCols=["age_bucket"], outputCols=["age_ohe"])
pipeline = Pipeline(stages=[bucketizer, encoder])

# Fitting the pipeline fits the encoder stage on the bucketizer's output.
model = pipeline.fit(df)
fe = model.transform(df)
fe.show()
+---+---+----------+-------------+
| id|age|age_bucket|      age_ohe|
+---+---+----------+-------------+
|  1| 30|       2.0|    (2,[],[])|
|  2| 25|       1.0|(2,[1],[1.0])|
|  3| 21|       0.0|(2,[0],[1.0])|
+---+---+----------+-------------+