Below is a Spark solution that uses an array instead of JSON:
from pyspark.sql.functions import col, collect_list, struct

df1 = spark.createDataFrame(
    [['Micheal', 'NY', 'head', 'XYZ', 'YN'], ['Micheal', 'NJ', 'head', 'XYZ', 'YM']],
    ["Employee", "Company Address", "designation", "company", "Home Address"])
df2 = df1.groupBy("Employee", "designation", "company").agg(
    collect_list(struct(col("Company Address"), col("Home Address"))).alias("Address"))
df2.show(1, truncate=False)
Output:
+--------+-----------+-------+--------------------+
|Employee|designation|company|Address |
+--------+-----------+-------+--------------------+
|Micheal |head |XYZ |[[NY, YN], [NJ, YM]]|
+--------+-----------+-------+--------------------+
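If you later need one row per address pair again, here is a minimal sketch (assuming the df2 built above) that explodes the array and reads the struct fields back out by name:

from pyspark.sql.functions import col, explode

# Explode the array back into one row per struct, then pull out each field.
df3 = df2.withColumn("Address", explode("Address"))
df3.select("Employee", "designation", "company",
           col("Address")["Company Address"].alias("Company Address"),
           col("Address")["Home Address"].alias("Home Address")).show(truncate=False)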