Используйте groupBy по столбцу id вместе с .pivot, чтобы объединить несколько записей в одну строку.
# NOTE(review): assumes an active SparkSession bound to `spark` (as in
# pyspark shell / notebook) — confirm in the target environment.
# `first` is not a Python builtin; without this import the snippet
# raises NameError at df.groupBy(...).agg(first(...)).
from pyspark.sql.functions import first

# Sample data in long (key/value) format. Note the column naming is
# counter-intuitive: "value" holds the attribute *name* (e.g. address_city)
# and "name" holds the attribute *value* (e.g. NewYork) — kept as-is to
# match the shown output.
df = spark.createDataFrame(
    [
        ("1", "address_city", "NewYork"),
        ("1", "address_address1", "hotel road"),
        ("1", "address_postal", "1345"),
        ("2", "address_city", "NewJersey"),
        ("2", "address_postal", "3421"),
    ],
    ["id", "value", "name"],
)
#+---+----------------+----------+
#| id|           value|      name|
#+---+----------------+----------+
#|  1|    address_city|   NewYork|
#|  1|address_address1|hotel road|
#|  1|  address_postal|      1345|
#|  2|    address_city| NewJersey|
#|  2|  address_postal|      3421|
#+---+----------------+----------+

# Pivot the distinct values of "value" into columns; first("name") picks the
# (single) value per id/attribute pair. Missing attributes become null.
df.groupBy("id").pivot("value").agg(first("name")).show()
#+---+----------------+------------+--------------+
#| id|address_address1|address_city|address_postal|
#+---+----------------+------------+--------------+
#|  1|      hotel road|     NewYork|          1345|
#|  2|            null|   NewJersey|          3421|
#+---+----------------+------------+--------------+

# Same aggregation collected to the driver as a list of Rows.
df.groupBy("id").pivot("value").agg(first("name")).collect()
#[Row(id=u'1', address_address1=u'hotel road', address_city=u'NewYork', address_postal=u'1345'), Row(id=u'2', address_address1=None, address_city=u'NewJersey', address_postal=u'3421')]