na.fill
не заполняет значения null внутри полей структур, хранящихся в столбце-массиве. Один из подходов — использовать UDF, как показано ниже:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row
/**
 * One element of the `cities` array column.
 *
 * @param name     city name
 * @param postcode postal code; the sample rows in this file pass `null` when it is missing
 */
case class City(
  name: String,
  postcode: String
)
// Replacement text used wherever a string value is null.
val defaultStr = "unknown"

// Sample data: one row per person. `cities` is an array of City structs, several of them
// with a null postcode; the last row additionally has a null top-level `lastname`.
// NOTE(review): `toDF` relies on the session implicits (e.g. `spark.implicits._`), which are
// presumably already in scope as in spark-shell — confirm.
val df = {
  val people: Seq[(String, String, Seq[City])] = Seq(
    ("John", "Doe", Seq(City("New York", "A000000"), City("Warsaw", null))),
    ("John", "Smith", Seq(City("Berlin", null))),
    ("John", null, Seq(City("Paris", null)))
  )
  people.toDF("firstname", "lastname", "cities")
}
/**
 * Builds a UDF that replaces null `name` / `postcode` fields in every struct of an
 * array-of-struct column with `default`.
 *
 * The UDF returns `City` instances rather than tuples so that the rewritten column keeps
 * the struct field names `name` and `postcode` (tuples would surface as `_1` / `_2`).
 *
 * @param default text substituted for each null field
 * @return a UDF taking the array column and yielding the patched array; a null array
 *         is passed through as null instead of failing
 */
def patchNull(default: String) = udf { (cities: Seq[Row]) =>
  // The column value itself may be null; calling `map` on it would throw an NPE.
  Option(cities).map { rows =>
    rows.map { r =>
      City(
        Option(r.getAs[String]("name")).getOrElse(default),
        Option(r.getAs[String]("postcode")).getOrElse(default)
      )
    }
  }.orNull
}
// Step 1: patch the nulls nested inside the `cities` structs via the UDF.
val dfWithPatchedCities = df.withColumn("cities", patchNull(defaultStr)(col("cities")))
// Step 2: let na.fill take care of the remaining top-level null strings (here: lastname),
// then print the result; `false` disables truncation of long cell values.
dfWithPatchedCities.na.fill(defaultStr).show(false)
// Expected output:
// +---------+--------+--------------------------------------+
// |firstname|lastname|cities |
// +---------+--------+--------------------------------------+
// |John |Doe |[[New York,A000000], [Warsaw,unknown]]|
// |John |Smith |[[Berlin,unknown]] |
// |John |unknown |[[Paris,unknown]] |
// +---------+--------+--------------------------------------+