You don't need to list each and every column of the StructType in Spark.
You can extract all of the struct's keys by specifying `struct_field.*`
in `.select`.
Example:
// Sample record: a top-level "fields" struct holding three string attributes.
// Printing the inferred schema shows the nesting Spark derives from the JSON.
spark.read.json(Seq("""{"fields":{"custid":"1","password":"foo","role":"rr"}}""").toDS).printSchema
//schema
//root
// |-- fields: struct (nullable = true)
// | |-- custid: string (nullable = true)
// | |-- password: string (nullable = true)
// | |-- role: string (nullable = true)

// Load the same JSON into a DataFrame.
val df = spark.read.json(Seq("""{"fields":{"custid":"1","password":"foo","role":"rr"}}""").toDS)

// "fields.*" expands to every child column of the struct, so there is no
// need to enumerate custid/password/role by hand.
df.select("fields.*").show()
//+------+--------+----+
//|custid|password|role|
//+------+--------+----+
//| 1| foo| rr|
//+------+--------+----+
A more dynamic way to flatten the JSON is shown here:
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.functions.col
/**
 * Recursively walks a StructType and returns one Column per leaf field,
 * building dotted paths ("parent.child") for members of nested structs.
 *
 * @param schema the schema to flatten
 * @param prefix dotted path of the enclosing struct, or null at the top level
 * @return every leaf column, ready for `df.select(... : _*)`
 */
def flattenSchema(schema: StructType, prefix: String = null) : Array[Column] = {
  schema.fields.flatMap { field =>
    // Prepend the parent path when we are inside a nested struct.
    val path = Option(prefix).fold(field.name)(p => s"$p.${field.name}")
    field.dataType match {
      case nested: StructType => flattenSchema(nested, path) // recurse into sub-structs
      case _                  => Array(col(path))            // leaf: emit the column
    }
  }
}
// Sample with a second level of nesting ("nested-2") to exercise the recursion.
val df = spark.read.json(Seq("""{"fields":{"custid":"1","password":"foo","role":"rr","nested-2":{"id":"1"}}}""").toDS)

// Expand the flattened column list varargs-style into select.
df.select(flattenSchema(df.schema): _*).show()
//+------+---+--------+----+
//|custid| id|password|role|
//+------+---+--------+----+
//| 1| 1| foo| rr|
//+------+---+--------+----+