%python
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Build two small demo DataFrames (duplicated rows on purpose) and union
# them into a single distinct set of (value1, array1, array2) rows.
rows_a = [(1, list(['A', 'B', 'X']), list(['1', '2', '8'])) for _ in range(2)]
rows_b = [(2, list(['C', 'D', 'Y']), list(['3', '4', '9'])) for _ in range(2)]
columns = ['value1', 'array1', 'array2']
df = (
    spark.createDataFrame(rows_a, columns)
    .union(spark.createDataFrame(rows_b, columns))
    .distinct()
)
# Pair array1[i] with array2[i] using the SQL higher-order `transform`
# function; each pair becomes a single "x,y" string.
col_temp_expr = "transform(array1, (x, i) -> concat(x, ',', array2[i]))"
paired = df.withColumn("col_temp", expr(col_temp_expr))
# Explode to one row per pair; the exploded column is a plain string, not an array.
exploded = paired.select("value1", "array2", explode((col("col_temp"))))
# Split the "x,y" string back into a two-element array column.
split_df = exploded.withColumn('tempArray', split(exploded['col'], ','))
split_df.select("value1", split_df.tempArray[0], split_df.tempArray[1]).show()
возвращает:
+------+------------+------------+
|value1|tempArray[0]|tempArray[1]|
+------+------------+------------+
| 1| A| 1|
| 1| B| 2|
| 1| X| 8|
| 2| C| 3|
| 2| D| 4|
| 2| Y| 9|
+------+------------+------------+
Вы можете переименовать столбцы. Этот подход работает и при большем количестве элементов в каждом массиве.