Как говорит Анкин, вы можете использовать MapType для этого:
import pyspark
from pyspark.sql import Row

# Prefer the builder over calling the SparkSession constructor directly:
# getOrCreate() reuses an already-running session and creates the underlying
# SparkContext for us, instead of failing when one already exists.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext  # keep the SparkContext bound to `sc`, as before

# Sample data: one DMA code per zip code.
data = spark.createDataFrame([
    Row(zip_code='58542', dma='MIN'),
    Row(zip_code='58701', dma='MIN'),
    Row(zip_code='57632', dma='MIN'),
    Row(zip_code='58734', dma='MIN'),
])
data.show()
Выход:
+---+--------+
|dma|zip_code|
+---+--------+
|MIN| 58542|
|MIN| 58701|
|MIN| 57632|
|MIN| 58734|
+---+--------+
from pyspark.sql.functions import udf
from pyspark.sql import types as T


# UDF that builds a map<string,string> pairing each zip code (key) with its
# DMA code (value). Declaring the return type as MapType lets Spark serialize
# the column as a JSON object.
@udf(T.MapType(T.StringType(), T.StringType()))
def create_struct(zip_code, dma):
    return {zip_code: dma}


# Add the map column, then serialize every row to a JSON string.
data.withColumn('struct', create_struct(data.zip_code, data.dma)).toJSON().collect()
Выход:
['{"dma":"MIN","zip_code":"58542","struct":{"58542":"MIN"}}',
'{"dma":"MIN","zip_code":"58701","struct":{"58701":"MIN"}}',
'{"dma":"MIN","zip_code":"57632","struct":{"57632":"MIN"}}',
'{"dma":"MIN","zip_code":"58734","struct":{"58734":"MIN"}}']