У меня есть два набора данных spark, один с столбцами accountid и key, ключевой столбец в формате массива [key1, key2, key3 ..] и другой набор данных с двумя столбцами accountid и значениями ключей, который находится в json.accountid, {ключ: значение, ключ, значение ...}.Мне нужно обновить значение во втором наборе данных, если ключ появляется для accountid в первом наборе данных.
import org.apache.spark.sql.functions._
val df= sc.parallelize(Seq(("20180610114049", "id1","key1"),
("20180610114049", "id2","key2"),
("20180610114049", "id1","key1"),
("20180612114049", "id2","key1"),
("20180613114049", "id3","key2"),
("20180613114049", "id3","key3")
)).toDF("date","accountid", "key")
val gp=df.groupBy("accountid","date").agg(collect_list("key"))
+---------+--------------+-----------------+
|accountid| date|collect_list(key)|
+---------+--------------+-----------------+
| id2|20180610114049| [key2]|
| id1|20180610114049| [key1, key1]|
| id3|20180613114049| [key2, key3]|
| id2|20180612114049| [key1]|
+---------+--------------+-----------------+
val df2= sc.parallelize(Seq(("20180610114049", "id1","{'key1':'0.0','key2':'0.0','key3':'0.0'}"),
("20180610114049", "id2","{'key1':'0.0','key2':'0.0','key3':'0.0'}"),
("20180611114049", "id1","{'key1':'0.0','key2':'0.0','key3':'0.0'}"),
("20180612114049", "id2","{'key1':'0.0','key2':'0.0','key3':'0.0'}"),
("20180613114049", "id3","{'key1':'0.0','key2':'0.0','key3':'0.0'}")
)).toDF("date","accountid", "result")
+--------------+---------+----------------------------------------+
|date |accountid|result |
+--------------+---------+----------------------------------------+
|20180610114049|id1 |{'key1':'0.0','key2':'0.0','key3':'0.0'}|
|20180610114049|id2 |{'key1':'0.0','key2':'0.0','key3':'0.0'}|
|20180611114049|id1 |{'key1':'0.0','key2':'0.0','key3':'0.0'}|
|20180612114049|id2 |{'key1':'0.0','key2':'0.0','key3':'0.0'}|
|20180613114049|id3 |{'key1':'0.0','key2':'0.0','key3':'0.0'}|
+--------------+---------+----------------------------------------+
ожидаемый результат
+--------------+---------+----------------------------------------+
|date |accountid|result |
+--------------+---------+----------------------------------------+
|20180610114049|id1 |{'key1':'1.0','key2':'0.0','key3':'0.0'}|
|20180610114049|id2 |{'key1':'0.0','key2':'1.0','key3':'0.0'}|
|20180611114049|id1 |{'key1':'0.0','key2':'0.0','key3':'0.0'}|
|20180612114049|id2 |{'key1':'1.0','key2':'0.0','key3':'0.0'}|
|20180613114049|id3 |{'key1':'0.0','key2':'1.0','key3':'1.0'}|
+--------------+---------+----------------------------------------+