Вы можете сгруппировать данные по столбцу status с помощью groupBy,
а затем применить к каждой группе collect_list.
Example:
// Sample data: print the input DataFrame `df`, which has a single `status` column.
df.show()
// Output printed by the call above:
//+------+
//|status|
//+------+
//|     1|
//|     1|
//|     2|
//|     3|
//|     3|
//|     2|
//|     1|
//+------+
// Group rows by `status`, gather each group's values into a list, and join
// that list into one comma-separated string exposed as column `group`.
// NOTE(review): concat_ws is documented for string input; if `status` is a
// numeric column an explicit cast to string may be needed on some Spark
// versions — confirm against the actual schema.
val groupedByStatus = df.groupBy("status")
val joinedStatuses = concat_ws(",", collect_list("status")).alias("group")
groupedByStatus.agg(joinedStatuses).show()
// Output (row order is whatever Spark returns):
//+------+-----+
//|status|group|
//+------+-----+
//|     3|  3,3|
//|     1|1,1,1|
//|     2|  2,2|
//+------+-----+
// Variant: keep each group's values as an array column instead of a joined string.
val statusBuckets = df.groupBy("status")
val statusArray = collect_list("status").alias("group")
statusBuckets.agg(statusArray).show()
// Output (row order is whatever Spark returns):
//+------+---------+
//|status|    group|
//+------+---------+
//|     3|   [3, 3]|
//|     1|[1, 1, 1]|
//|     2|   [2, 2]|
//+------+---------+