Вот один из подходов на Scala:
// Sample input: one row per id, with a comma-separated list of tokens in "value".
// NOTE(review): the row literals were lost from this snippet; they are rebuilt here
// to match the result table printed below — confirm against the original data.
val df = Seq((0, "A,B"), (1, "A,C"), (2, "B")).toDF("id", "value")
// Split the comma-separated "value" column into an array column.
val withArrayDF = df.withColumn("array", split($"value", ",")).drop("value")

// Sorted, de-duplicated tokens found anywhere in the dataset; each one becomes an
// output column. `collect` brings them to the driver.
val distinctValues = withArrayDF
  .select(explode($"array"))
  .distinct
  .collect
  .map(_.getString(0))
  .sorted
  .toList

// One boolean column per token (A, B, C, ...): true when the row's array contains it.
// Everything is built in a single projection instead of creating one DataFrame per
// token and joining them on "id": that avoids k-1 joins, does not fail on `reduce`
// when there are no tokens, and cannot multiply rows when an id is repeated.
val flagColumns = distinctValues.map(token => array_contains($"array", token).as(token))
withArrayDF.select($"id" +: flagColumns: _*)
И вывод:
| id| A| B| C|
| 0| true| true|false|
| 1| true|false| true|
| 2|false| true|false|
И версия на Python:
from pyspark.sql.functions import array_contains, split, when, col, explode
from functools import reduce
# Sample input: one row per id, with a comma-separated list of tokens in "value".
# NOTE(review): only the last row and the column names survived in this snippet; the
# first rows are rebuilt to match the result table printed below — confirm.
df = spark.createDataFrame(
    [(0, "A,B"), (1, "A,C"), (2, "B")], ["id", "value"])
# Split the comma-separated "value" column into an array column.
withArrayDF = df.withColumn("array", split(df["value"], ",")).drop("value")
# Sorted, de-duplicated tokens found anywhere in the dataset; each one becomes an
# output column. collect() brings them to the driver.
distinctValues = sorted(
    row[0] for row in withArrayDF.select(explode("array")).distinct().collect())
# One boolean column per token (A, B, C, ...): true when the row's array contains it.
# Everything is built in a single projection instead of creating one DataFrame per
# token and joining them on "id": that avoids the chain of joins, does not fail on
# reduce() when there are no tokens, and cannot multiply rows when an id is repeated.
finalDF = withArrayDF.select(
    "id",
    *[array_contains(col("array"), ncol).alias(ncol) for ncol in distinctValues])
| id| A| B| C|
| 0| true| true|false|
| 1| true|false| true|
| 2|false| true|false|