Начиная со Spark 2.4, используйте для этого случая функции array_join, sort_array и transform.
# Sample DataFrame: each row is a (usr, sec, scrpt) triple.
# NOTE(review): `spark` is presumably an existing SparkSession — it is not
# created in the visible part of this file.
sample_rows = [
    (1, 5, "This"),
    (2, 10, "is"),
    (3, 12, "a"),
    (1, 7, "string"),
    (2, 4, "oreo"),
]
df = spark.createDataFrame(sample_rows, ["usr", "sec", "scrpt"])
df.show()
# Output:
# +---+---+------+
# |usr|sec| scrpt|
# +---+---+------+
# |  1|  5|  This|
# |  2| 10|    is|
# |  3| 12|     a|
# |  1|  7|string|
# |  2|  4|  oreo|
# +---+---+------+
# Concatenate each user's `scrpt` words into one space-separated string,
# ordered by `sec`.
# NOTE(review): `expr`, `array_join` and `concat_ws` are presumably imported
# from pyspark.sql.functions — the import is not visible here.

# Collect (sec, scrpt) structs per group, sort them ascending, then keep only
# the `scrpt` field of each struct. The result is an array of words per user.
words_in_sec_order = expr("""transform(sort_array(collect_list(struct(sec,scrpt)),True), x -> x.scrpt)""")
grouped_by_usr = df.groupBy("usr")

# Variant 1: join the array elements with array_join.
joined_with_array_join = array_join(words_in_sec_order, " ").alias("concated")
grouped_by_usr.agg(joined_with_array_join).orderBy("usr").show(10, truncate=False)

# Variant 2: the same aggregation, joined with concat_ws instead.
joined_with_concat_ws = concat_ws(" ", words_in_sec_order).alias("concated")
grouped_by_usr.agg(joined_with_concat_ws).orderBy("usr").show(10, truncate=False)
# Output:
# +---+-----------+
# |usr|concated   |
# +---+-----------+
# |1  |This string|
# |2  |oreo is    |
# |3  |a          |
# +---+-----------+
# Lower-case variant: same per-user concatenation, with the joined string
# passed through lower().
# NOTE(review): `expr`, `array_join` and `lower` are presumably imported from
# pyspark.sql.functions — the import is not visible here.

# Array of `scrpt` values per user, taken from (sec, scrpt) structs sorted
# ascending.
sorted_words = expr("""transform(sort_array(collect_list(struct(sec,scrpt)),True), x -> x.scrpt)""")
lowered_sentence = lower(array_join(sorted_words, " ")).alias("concated")
df.groupBy("usr").agg(lowered_sentence).orderBy("usr").show(10, truncate=False)
# Output:
# +---+-----------+
# |usr|concated   |
# +---+-----------+
# |1  |this string|
# |2  |oreo is    |
# |3  |a          |
# +---+-----------+