You can also do this using pivot: tag each line with the column it belongs in, use a cumulative sum over the 'x' terminator lines to assign row numbers, then pivot.
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# Read data as an rdd
rdd = sc.textFile('xxxxxxxxxx')
# Map each line to its destination column based on its first character
mapping = {'a': 'col3', 'c': 'col2', 'd': 'col4'}
df = rdd.map(lambda x: (mapping.get(x[0]), x)).toDF(['colname', 'value'])
# preserve the original line order and flag the 'x' terminator lines
df = df.withColumn('order', F.monotonically_increasing_id())
df = df.withColumn('row', F.expr("case when value='x' then 1 else 0 end"))
# cumulative sum to get row numbers
df = df.withColumn('rownum', F.sum("row").over(Window.orderBy('order')))
# get rid of row terminator
df = df.filter(df['value'] != 'x')
# pivot to get answer
df = df.groupby('rownum').pivot('colname').agg(F.first("value"))
df = df.select(F.lit("x").alias("col1"), "col2", "col3", "col4")
df.show()
+----+-----+------+--------+
|col1| col2|  col3|    col4|
+----+-----+------+--------+
|   x| null|a: krb|    null|
|   x|c: HK| a: HP|    d: T|
|   x|c: CN|a: MSS|d: H-MSS|
+----+-----+------+--------+
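
The question doesn't show the input file, but from the output above it would look something like the sample below, with a bare 'x' line acting as the record terminator. For a quick local test you can substitute sc.parallelize for sc.textFile; note that this sample data is inferred from the output, not taken from the original input:

# Sample input inferred from the output above (not the original file):
# each record's fields appear on separate lines, terminated by a bare 'x'
sample_lines = [
    'a: krb', 'x',
    'c: HK', 'a: HP', 'd: T', 'x',
    'c: CN', 'a: MSS', 'd: H-MSS', 'x',
]
rdd = sc.parallelize(sample_lines)
# running the same steps as above on this rdd reproduces the table shown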