Я обрабатываю с помощью Spark SQL данные, содержащие вложенные массивы.
{
"isActive": true,
"sample": {
"someitem": {
"thesearecool": [{
"neat": "wow"
},
{
"neat": "tubular"
}
]
},
"coolcolors": [{
"color": "red",
"hex": "ff0000"
},
{
"color": "blue",
"hex": "0000ff"
}
]
}
}
схема:
root
|-- isActive: boolean (nullable = true)
|-- sample: struct (nullable = true)
| |-- coolcolors: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- color: string (nullable = true)
| | | |-- hex: string (nullable = true)
| |-- someitem: struct (nullable = true)
| | |-- thesearecool: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- neat: string (nullable = true)
код:
// First result: one row per element of sample.coolcolors. The `neat` column is selected
// through the thesearecool array without exploding it, so it stays an array in every row.
val nested1 = {
  val withColorRows = nested.withColumn("color_data", explode($"sample.coolcolors"))
  withColorRows.select(
    "isActive",
    "color_data.color",
    "color_data.hex",
    "sample.someitem.thesearecool.neat"
  )
}

// Second result: one row per element of sample.someitem.thesearecool, keeping only `neat`.
val nested2 = {
  val withCoolRows = nested.withColumn("thesearecool_data", explode($"sample.someitem.thesearecool"))
  withCoolRows.select("thesearecool_data.neat")
}
пример вывода:
+--------+-----+------+--------------+
|isActive|color|hex |neat |
+--------+-----+------+--------------+
|true |red |ff0000|[wow, tubular]|
|true |blue |0000ff|[wow, tubular]|
+--------+-----+------+--------------+
+-------+
|neat |
+-------+
|wow |
|tubular|
+-------+
Нам нужно получить все эти данные в одном результате, а не в двух отдельных.