В этом решении используется collect_list(), но я не уверен, что оно удовлетворяет вашим требованиям.
# Sample input: (group key, value) pairs; the values sharing a key are the
# ones to be gathered into an array further down.
myValues = [
    (1, 'C'), (1, 'F'), (1, 'K'), (1, 'L'),
    (2, 'A'), (2, 'B'), (2, 'C'),
]
column_names = ['Aggregate_Over_This', 'Things_to_Agg_in_Array']
# `sqlContext` is not defined in this snippet; it is expected to already be
# in scope (presumably provided by the PySpark shell - confirm for your setup).
df = sqlContext.createDataFrame(myValues, schema=column_names)
df.show()
+-------------------+----------------------+
|Aggregate_Over_This|Things_to_Agg_in_Array|
+-------------------+----------------------+
| 1| C|
| 1| F|
| 1| K|
| 1| L|
| 2| A|
| 2| B|
| 2| C|
+-------------------+----------------------+
# Expose the DataFrame to SQL under the name 'table_view'.
# NOTE(review): registerTempTable is deprecated since Spark 2.0 in favour of
# createOrReplaceTempView - confirm the target Spark version before switching.
df.registerTempTable('table_view')

# collect_list is applied as a window function (not with GROUP BY), so every
# input row is kept and gains the list of values from its partition.
# NOTE(review): the window has no ORDER BY, so the element order inside the
# collected list is presumably not guaranteed - verify if order matters.
window_query = (
    'select Aggregate_Over_This, Things_to_Agg_in_Array, '
    'collect_list(Things_to_Agg_in_Array) over (partition by Aggregate_Over_This) as aray_output '
    'from table_view'
)
df1 = sqlContext.sql(window_query)
df1.show()
+-------------------+----------------------+------------+
|Aggregate_Over_This|Things_to_Agg_in_Array| aray_output|
+-------------------+----------------------+------------+
| 1| C|[C, F, K, L]|
| 1| F|[C, F, K, L]|
| 1| K|[C, F, K, L]|
| 1| L|[C, F, K, L]|
| 2| A| [A, B, C]|
| 2| B| [A, B, C]|
| 2| C| [A, B, C]|
+-------------------+----------------------+------------+